"tests/vscode:/vscode.git/clone" did not exist on "9a13347cbc406530bc2c3973f8f0e7bf5f3c914c"
Commit 8f5f819f authored by Kaushik Shivakumar

Merge branch 'master' of https://github.com/tensorflow/models into latest

parents 7c062a56 709a6617
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ELECTRA pretraining task (Joint Masked LM and Replaced Token Detection)."""
import dataclasses
import tensorflow as tf
from official.core import base_task
from official.modeling.hyperparams import config_definitions as cfg
from official.nlp.configs import bert
from official.nlp.configs import electra
from official.nlp.data import pretrain_dataloader
@dataclasses.dataclass
class ELECTRAPretrainConfig(cfg.TaskConfig):
"""The model config."""
model: electra.ELECTRAPretrainerConfig = electra.ELECTRAPretrainerConfig(
cls_heads=[
bert.ClsHeadConfig(
inner_dim=768,
num_classes=2,
dropout_rate=0.1,
name='next_sentence')
])
train_data: cfg.DataConfig = cfg.DataConfig()
validation_data: cfg.DataConfig = cfg.DataConfig()
@base_task.register_task_cls(ELECTRAPretrainConfig)
class ELECTRAPretrainTask(base_task.Task):
"""ELECTRA Pretrain Task (Masked LM + Replaced Token Detection)."""
def build_model(self):
return electra.instantiate_pretrainer_from_cfg(
self.task_config.model)
def build_losses(self,
labels,
model_outputs,
metrics,
aux_losses=None) -> tf.Tensor:
metrics = dict([(metric.name, metric) for metric in metrics])
# generator lm and (optional) nsp loss.
lm_prediction_losses = tf.keras.losses.sparse_categorical_crossentropy(
labels['masked_lm_ids'],
tf.cast(model_outputs['lm_outputs'], tf.float32),
from_logits=True)
lm_label_weights = labels['masked_lm_weights']
lm_numerator_loss = tf.reduce_sum(lm_prediction_losses * lm_label_weights)
lm_denominator_loss = tf.reduce_sum(lm_label_weights)
mlm_loss = tf.math.divide_no_nan(lm_numerator_loss, lm_denominator_loss)
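    # For intuition: with masked_lm_weights [1., 1., 0.] and per-token losses
    # [2., 4., 8.], the weighted mean is (2. + 4.) / 2. = 3.; divide_no_nan
    # returns 0. instead of NaN if no tokens are masked in the batch.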
metrics['lm_example_loss'].update_state(mlm_loss)
if 'next_sentence_labels' in labels:
sentence_labels = labels['next_sentence_labels']
sentence_outputs = tf.cast(
model_outputs['sentence_outputs'], dtype=tf.float32)
sentence_loss = tf.keras.losses.sparse_categorical_crossentropy(
sentence_labels,
sentence_outputs,
from_logits=True)
metrics['next_sentence_loss'].update_state(sentence_loss)
total_loss = mlm_loss + sentence_loss
else:
total_loss = mlm_loss
# discriminator replaced token detection (rtd) loss.
rtd_logits = model_outputs['disc_logits']
rtd_labels = tf.cast(model_outputs['disc_label'], tf.float32)
input_mask = tf.cast(labels['input_mask'], tf.float32)
rtd_ind_loss = tf.nn.sigmoid_cross_entropy_with_logits(
logits=rtd_logits, labels=rtd_labels)
rtd_numerator = tf.reduce_sum(input_mask * rtd_ind_loss)
rtd_denominator = tf.reduce_sum(input_mask)
rtd_loss = tf.math.divide_no_nan(rtd_numerator, rtd_denominator)
metrics['discriminator_loss'].update_state(rtd_loss)
total_loss = total_loss + \
self.task_config.model.discriminator_loss_weight * rtd_loss
if aux_losses:
total_loss += tf.add_n(aux_losses)
metrics['total_loss'].update_state(total_loss)
return total_loss
def build_inputs(self, params, input_context=None):
"""Returns tf.data.Dataset for pretraining."""
if params.input_path == 'dummy':
def dummy_data(_):
dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
dummy_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32)
return dict(
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids,
masked_lm_positions=dummy_lm,
masked_lm_ids=dummy_lm,
masked_lm_weights=tf.cast(dummy_lm, dtype=tf.float32),
next_sentence_labels=tf.zeros((1, 1), dtype=tf.int32))
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(
dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
return dataset
return pretrain_dataloader.BertPretrainDataLoader(params).load(
input_context)
def build_metrics(self, training=None):
del training
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='masked_lm_accuracy'),
tf.keras.metrics.Mean(name='lm_example_loss'),
tf.keras.metrics.SparseCategoricalAccuracy(
name='discriminator_accuracy'),
]
if self.task_config.train_data.use_next_sentence_label:
metrics.append(
tf.keras.metrics.SparseCategoricalAccuracy(
name='next_sentence_accuracy'))
metrics.append(tf.keras.metrics.Mean(name='next_sentence_loss'))
metrics.append(tf.keras.metrics.Mean(name='discriminator_loss'))
metrics.append(tf.keras.metrics.Mean(name='total_loss'))
return metrics
def process_metrics(self, metrics, labels, model_outputs):
metrics = dict([(metric.name, metric) for metric in metrics])
if 'masked_lm_accuracy' in metrics:
metrics['masked_lm_accuracy'].update_state(labels['masked_lm_ids'],
model_outputs['lm_outputs'],
labels['masked_lm_weights'])
if 'next_sentence_accuracy' in metrics:
metrics['next_sentence_accuracy'].update_state(
labels['next_sentence_labels'], model_outputs['sentence_outputs'])
if 'discriminator_accuracy' in metrics:
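      # The discriminator emits one logit per token; stacking [-logit, logit]
      # turns it into two-class logits so SparseCategoricalAccuracy can consume
      # the binary replaced-token labels, weighted by the input mask below.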
disc_logits_expanded = tf.expand_dims(model_outputs['disc_logits'], -1)
discrim_full_logits = tf.concat(
[-1.0 * disc_logits_expanded, disc_logits_expanded], -1)
metrics['discriminator_accuracy'].update_state(
model_outputs['disc_label'], discrim_full_logits,
labels['input_mask'])
def train_step(self, inputs, model: tf.keras.Model,
optimizer: tf.keras.optimizers.Optimizer, metrics):
"""Does forward and backward.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
optimizer: the optimizer for this training step.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
with tf.GradientTape() as tape:
outputs = model(inputs, training=True)
# Computes per-replica loss.
loss = self.build_losses(
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
# Scales loss as the default gradients allreduce performs sum inside the
# optimizer.
# TODO(b/154564893): enable loss scaling.
scaled_loss = loss / tf.distribute.get_strategy().num_replicas_in_sync
tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
optimizer.apply_gradients(list(zip(grads, tvars)))
self.process_metrics(metrics, inputs, outputs)
return {self.loss: loss}
def validation_step(self, inputs, model: tf.keras.Model, metrics):
"""Validatation step.
Args:
inputs: a dictionary of input tensors.
model: the keras.Model.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
outputs = model(inputs, training=False)
loss = self.build_losses(
labels=inputs,
model_outputs=outputs,
metrics=metrics,
aux_losses=model.losses)
self.process_metrics(metrics, inputs, outputs)
return {self.loss: loss}
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.nlp.tasks.electra_task."""
import tensorflow as tf
from official.nlp.configs import bert
from official.nlp.configs import electra
from official.nlp.configs import encoders
from official.nlp.data import pretrain_dataloader
from official.nlp.tasks import electra_task
class ELECTRAPretrainTaskTest(tf.test.TestCase):
def test_task(self):
config = electra_task.ELECTRAPretrainConfig(
model=electra.ELECTRAPretrainerConfig(
generator_encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
discriminator_encoder=encoders.TransformerEncoderConfig(
vocab_size=30522, num_layers=1),
num_masked_tokens=20,
sequence_length=128,
cls_heads=[
bert.ClsHeadConfig(
inner_dim=10, num_classes=2, name="next_sentence")
]),
train_data=pretrain_dataloader.BertPretrainDataConfig(
input_path="dummy",
max_predictions_per_seq=20,
seq_length=128,
global_batch_size=1))
task = electra_task.ELECTRAPretrainTask(config)
model = task.build_model()
metrics = task.build_metrics()
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
optimizer = tf.keras.optimizers.SGD(lr=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
task.validation_step(next(iterator), model, metrics=metrics)
if __name__ == "__main__":
tf.test.main()
@@ -14,9 +14,12 @@
# limitations under the License.
# ==============================================================================
"""Sentence prediction (classification) task."""
from typing import List, Union
from absl import logging
import dataclasses
import numpy as np
import orbit
from scipy import stats
from sklearn import metrics as sklearn_metrics
import tensorflow as tf
@@ -31,6 +34,10 @@ from official.nlp.modeling import models
from official.nlp.tasks import utils
METRIC_TYPES = frozenset(
['accuracy', 'matthews_corrcoef', 'pearson_spearman_corr'])
@dataclasses.dataclass
class ModelConfig(base_config.Config):
"""A classifier/regressor configuration."""
@@ -68,6 +75,9 @@ class SentencePredictionTask(base_task.Task):
self._hub_module = hub.load(params.hub_module_url)
else:
self._hub_module = None
if params.metric_type not in METRIC_TYPES:
raise ValueError('Invalid metric_type: {}'.format(params.metric_type))
self.metric_type = params.metric_type
def build_model(self):
@@ -77,7 +87,7 @@ class SentencePredictionTask(base_task.Task):
encoder_network = encoders.instantiate_encoder_from_cfg(
self.task_config.model.encoder)
# Currently, we only support bert-style sentence prediction finetuning.
return models.BertClassifier(
network=encoder_network,
num_classes=self.task_config.model.num_classes,
@@ -86,8 +96,11 @@ class SentencePredictionTask(base_task.Task):
use_encoder_pooler=self.task_config.model.use_encoder_pooler)
def build_losses(self, labels, model_outputs, aux_losses=None) -> tf.Tensor:
if self.task_config.model.num_classes == 1:
loss = tf.keras.losses.mean_squared_error(labels, model_outputs)
else:
loss = tf.keras.losses.sparse_categorical_crossentropy(
labels, tf.cast(model_outputs, tf.float32), from_logits=True)
if aux_losses:
loss += tf.add_n(aux_losses)
@@ -103,8 +116,12 @@ class SentencePredictionTask(base_task.Task):
input_word_ids=dummy_ids,
input_mask=dummy_ids,
input_type_ids=dummy_ids)
if self.task_config.model.num_classes == 1:
y = tf.zeros((1,), dtype=tf.float32)
else:
y = tf.zeros((1, 1), dtype=tf.int32)
return x, y
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
@@ -116,7 +133,11 @@ class SentencePredictionTask(base_task.Task):
def build_metrics(self, training=None):
del training
if self.task_config.model.num_classes == 1:
metrics = [tf.keras.metrics.MeanSquaredError()]
else:
metrics = [
tf.keras.metrics.SparseCategoricalAccuracy(name='cls_accuracy')]
return metrics
def process_metrics(self, metrics, labels, model_outputs):
@@ -154,6 +175,7 @@ class SentencePredictionTask(base_task.Task):
return None
if state is None:
state = {'sentence_prediction': [], 'labels': []}
# TODO(b/160712818): Add support for concatenating partial batches.
state['sentence_prediction'].append(
np.concatenate([v.numpy() for v in step_outputs['sentence_prediction']],
axis=0))
@@ -162,15 +184,21 @@ class SentencePredictionTask(base_task.Task):
return state
def reduce_aggregated_logs(self, aggregated_logs):
if self.metric_type == 'accuracy':
return None
elif self.metric_type == 'matthews_corrcoef':
preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
preds = np.reshape(preds, -1)
labels = np.concatenate(aggregated_logs['labels'], axis=0)
labels = np.reshape(labels, -1)
return {
self.metric_type: sklearn_metrics.matthews_corrcoef(preds, labels)
}
elif self.metric_type == 'pearson_spearman_corr':
preds = np.concatenate(aggregated_logs['sentence_prediction'], axis=0)
preds = np.reshape(preds, -1)
labels = np.concatenate(aggregated_logs['labels'], axis=0)
labels = np.reshape(labels, -1)
pearson_corr = stats.pearsonr(preds, labels)[0]
spearman_corr = stats.spearmanr(preds, labels)[0]
corr_metric = (pearson_corr + spearman_corr) / 2
@@ -198,3 +226,52 @@ class SentencePredictionTask(base_task.Task):
status.expect_partial().assert_existing_objects_matched()
logging.info('Finished loading pretrained checkpoint from %s',
ckpt_dir_or_file)
def predict(task: SentencePredictionTask, params: cfg.DataConfig,
model: tf.keras.Model) -> List[Union[int, float]]:
"""Predicts on the input data.
Args:
task: A `SentencePredictionTask` object.
params: A `cfg.DataConfig` object.
model: A keras.Model.
Returns:
A list of predictions with length of `num_examples`. For a regression task,
each element in the list is the predicted score; for a classification task,
each element is the predicted class id.
"""
is_regression = task.task_config.model.num_classes == 1
@tf.function
def predict_step(iterator):
"""Predicts on distributed devices."""
def _replicated_step(inputs):
"""Replicated prediction calculation."""
x, _ = inputs
outputs = task.inference_step(x, model)
if is_regression:
return outputs
else:
return tf.argmax(outputs, axis=-1)
outputs = tf.distribute.get_strategy().run(
_replicated_step, args=(next(iterator),))
return tf.nest.map_structure(
tf.distribute.get_strategy().experimental_local_results, outputs)
def reduce_fn(state, outputs):
"""Concatenates model's outputs."""
for per_replica_batch_predictions in outputs:
state.extend(per_replica_batch_predictions)
return state
loop_fn = orbit.utils.create_loop_fn(predict_step)
dataset = orbit.utils.make_distributed_dataset(tf.distribute.get_strategy(),
task.build_inputs, params)
# Set `num_steps` to -1 to exhaust the dataset.
predictions = loop_fn(
iter(dataset), num_steps=-1, state=[], reduce_fn=reduce_fn)
return predictions
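# A hedged usage sketch (illustrative, not part of the original module): build
# a task and model from a config, then run `predict` over an eval dataset.
# `task_config` and `eval_data_config` are assumed to be constructed as in the
# accompanying tests.
def _example_predict_usage(task_config, eval_data_config):
  task = SentencePredictionTask(task_config)
  model = task.build_model()
  # Returns class ids for classification, or float scores for regression.
  return predict(task, eval_data_config, model)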
@@ -18,6 +18,7 @@ import functools
import os
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from official.nlp.bert import configs
@@ -28,6 +29,35 @@ from official.nlp.data import sentence_prediction_dataloader
from official.nlp.tasks import sentence_prediction
def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
"""Creates a fake dataset."""
writer = tf.io.TFRecordWriter(output_path)
def create_int_feature(values):
return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
def create_float_feature(values):
return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
for _ in range(num_examples):
features = {}
input_ids = np.random.randint(100, size=(seq_length))
features["input_ids"] = create_int_feature(input_ids)
features["input_mask"] = create_int_feature(np.ones_like(input_ids))
features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
features["segment_ids"] = create_int_feature(np.ones_like(input_ids))
if num_classes == 1:
features["label_ids"] = create_float_feature([np.random.random()])
else:
features["label_ids"] = create_int_feature(
[np.random.random_integers(0, num_classes - 1, size=())])
tf_example = tf.train.Example(features=tf.train.Features(feature=features))
writer.write(tf_example.SerializeToString())
writer.close()
class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
@@ -85,6 +115,42 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
ckpt.save(config.init_checkpoint)
task.initialize(model)
@parameterized.named_parameters(
{
"testcase_name": "regression",
"num_classes": 1,
},
{
"testcase_name": "classification",
"num_classes": 2,
},
)
def test_metrics_and_losses(self, num_classes):
config = sentence_prediction.SentencePredictionConfig(
init_checkpoint=self.get_temp_dir(),
model=self.get_model_config(num_classes),
train_data=self._train_data_config)
task = sentence_prediction.SentencePredictionTask(config)
model = task.build_model()
metrics = task.build_metrics()
if num_classes == 1:
self.assertIsInstance(metrics[0], tf.keras.metrics.MeanSquaredError)
else:
self.assertIsInstance(
metrics[0], tf.keras.metrics.SparseCategoricalAccuracy)
dataset = task.build_inputs(config.train_data)
iterator = iter(dataset)
optimizer = tf.keras.optimizers.SGD(lr=0.1)
task.train_step(next(iterator), model, optimizer, metrics=metrics)
logs = task.validation_step(next(iterator), model, metrics=metrics)
loss = logs["loss"].numpy()
if num_classes == 1:
self.assertAlmostEqual(loss, 42.77483, places=3)
else:
self.assertAlmostEqual(loss, 3.57627e-6, places=3)
@parameterized.parameters(("matthews_corrcoef", 2),
("pearson_spearman_corr", 1))
def test_np_metrics(self, metric_type, num_classes):
@@ -153,6 +219,35 @@ class SentencePredictionTaskTest(tf.test.TestCase, parameterized.TestCase):
train_data=self._train_data_config)
self._run_task(config)
@parameterized.named_parameters(("classification", 5), ("regression", 1))
def test_prediction(self, num_classes):
task_config = sentence_prediction.SentencePredictionConfig(
model=self.get_model_config(num_classes=num_classes),
train_data=self._train_data_config)
task = sentence_prediction.SentencePredictionTask(task_config)
model = task.build_model()
test_data_path = os.path.join(self.get_temp_dir(), "test.tf_record")
seq_length = 16
num_examples = 100
_create_fake_dataset(
test_data_path,
seq_length=seq_length,
num_classes=num_classes,
num_examples=num_examples)
test_data_config = (
sentence_prediction_dataloader.SentencePredictionDataConfig(
input_path=test_data_path,
seq_length=seq_length,
is_training=False,
label_type="int" if num_classes > 1 else "float",
global_batch_size=16,
drop_remainder=False))
predictions = sentence_prediction.predict(task, test_data_config, model)
self.assertLen(predictions, num_examples)
if __name__ == "__main__":
tf.test.main()
@@ -262,7 +262,7 @@ def predict(task: TaggingTask, params: cfg.DataConfig,
label_mask=label_mask,
sentence_ids=sentence_ids)
outputs = tf.distribute.get_strategy().run(
_replicated_step, args=(next(iterator),))
return tf.nest.map_structure(
tf.distribute.get_strategy().experimental_local_results, outputs)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""A light weight utilities to train TF2 models."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import time
from absl import logging
import tensorflow.compat.v2 as tf
from typing import Callable, Dict, Optional, Text
from official.staging.training import utils
class Controller(object):
"""Class that facilitates training and evaluation of models."""
def __init__(
self,
strategy: Optional[tf.distribute.Strategy] = None,
train_fn: Optional[Callable[[tf.Tensor],
Optional[Dict[Text, tf.Tensor]]]] = None,
eval_fn: Optional[Callable[[tf.Tensor],
Optional[Dict[Text, tf.Tensor]]]] = None,
global_step: Optional[tf.Variable] = None,
# Train related
train_steps: Optional[int] = None,
steps_per_loop: Optional[int] = None,
summary_dir: Optional[Text] = None,
checkpoint_manager: Optional[tf.train.CheckpointManager] = None,
# summary related
summary_interval: Optional[int] = None,
# Evaluation related
eval_summary_dir: Optional[Text] = None,
eval_steps: Optional[int] = None,
eval_interval: Optional[int] = None):
"""Constructs a `Controller` instance.
Args:
strategy: An instance of `tf.distribute.Strategy`.
train_fn: A callable defined as `def train_fn(num_steps)`, where
`num_steps` indicates the number of steps to run for each loop.
eval_fn: A callable defined as `def eval_fn(num_steps)`, where `num_steps`
indicates the number of steps for one evaluation.
global_step: An integer `tf.Variable` indicating the global training step
number. Usually this can be obtained from `iterations` property of the
model's optimizer (e.g. `self.optimizer.iterations`), or users can
create their own global step variable as well. If the users create their
own global step variable, it is recommended to create the `tf.Variable`
inside strategy scope, and with
`aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA`.
train_steps: The total (maximum) number of training steps to perform.
steps_per_loop: The number of steps to run in each "inner loop" of
training (passed to the `num_steps` parameter of `train_fn`).
summary_dir: The directory to restore and write checkpoints and summaries.
If None, it will be set to `checkpoint_manager.directory`.
checkpoint_manager: An instance of `tf.train.CheckpointManager`.
summary_interval: Step interval for training summaries. Note that this
argument only applies to the summaries outside the training loop. If the
value is None, then training summaries are not enabled.
eval_summary_dir: The directory to write eval summaries. If None, it will
be set to `summary_dir`.
eval_steps: Number of steps to run evaluation.
eval_interval: Step interval for evaluation. If None, will skip evaluation
in the middle of training. Note that evaluation only happens outside the
training loop, whose iteration count is specified by the `steps_per_loop`
parameter.
Raises:
ValueError: If both `train_fn` and `eval_fn` are None.
ValueError: If `train_fn` is not None and `train_steps` is None.
ValueError: If `steps_per_loop` is None when `train_fn` is provided.
ValueError: If `steps_per_loop` is not a positive integer.
"""
if train_fn is None and eval_fn is None:
raise ValueError("`train_fn` and `eval_fn` should not both be None")
# TODO(rxsang): Support training until exhaustion by passing
# `train_steps=-1`. Currently it cannot be supported with a host training
# loop because break statements are not supported with distributed dataset.
if train_fn is not None:
if train_steps is None:
raise ValueError("`train_steps` is required when `train_fn` is "
"provided.")
if steps_per_loop is None:
raise ValueError("`steps_per_loop` is required when `train_fn is "
"provided.")
if not isinstance(steps_per_loop, int) or steps_per_loop < 1:
raise ValueError("`steps_per_loop` should be a positive integer")
if summary_interval is not None and summary_interval <= 0:
raise ValueError("`summary_interval` should be larger than 0")
self.strategy = strategy or tf.distribute.get_strategy()
self.train_fn = train_fn
self.eval_fn = eval_fn
self.global_step = global_step
self.checkpoint_manager = checkpoint_manager
if self.train_fn is not None:
self.train_steps = train_steps
self.steps_per_loop = steps_per_loop
if summary_dir:
self.summary_dir = summary_dir
elif checkpoint_manager:
self.summary_dir = checkpoint_manager.directory
else:
self.summary_dir = None
self.summary_interval = summary_interval
if self.summary_dir and self.summary_interval:
summary_writer = tf.summary.create_file_writer(self.summary_dir)
else:
summary_writer = None
# TODO(rxsang): Consider pass SummaryManager directly into Controller for
# maximum customizability.
self.summary_manager = utils.SummaryManager(
summary_writer,
tf.summary.scalar,
global_step=self.global_step,
summary_interval=self.summary_interval)
if self.eval_fn is not None:
eval_summary_dir = eval_summary_dir or self.summary_dir
eval_summary_writer = tf.summary.create_file_writer(
eval_summary_dir) if eval_summary_dir else None
self.eval_summary_manager = utils.SummaryManager(
eval_summary_writer, tf.summary.scalar, global_step=self.global_step)
self.eval_steps = eval_steps
self.eval_interval = eval_interval
# Creates and initializes the interval triggers.
self.eval_trigger = utils.IntervalTrigger(self.eval_interval,
self.global_step.numpy()) # pytype: disable=attribute-error
if self.global_step:
tf.summary.experimental.set_step(self.global_step)
# Restores the model if needed.
if self.checkpoint_manager is not None:
model_restored = self._restore_model()
if not model_restored and self.checkpoint_manager.checkpoint_interval:
# If the model is not restored from a checkpoint, save an initial
# checkpoint.
ckpt_path = self.checkpoint_manager.save(
checkpoint_number=self.global_step)
logging.info("Saved checkpoins in %s", ckpt_path)
def _restore_model(self, checkpoint_path=None):
"""Restore or initialize the model.
Args:
checkpoint_path: An optional string indicating the checkpoint path to
restore. If None, will restore from `self.checkpoint_manager`.
Returns:
True if the latest checkpoint is found or restored. Otherwise False.
"""
with self.strategy.scope():
# Checkpoint restoring should be inside scope. b/139450638
if checkpoint_path is not None:
self.checkpoint_manager.checkpoint.restore(checkpoint_path)
return True
return self.checkpoint_manager.restore_or_initialize()
def _evaluate_once(self, current_step):
"""Runs the evaluation once."""
logging.info("Start evaluation at step: %s", current_step)
with self.eval_summary_manager.summary_writer.as_default():
eval_outputs = self.eval_fn(self.eval_steps)
if eval_outputs:
eval_outputs = tf.nest.map_structure(lambda x: x.numpy(), eval_outputs)
info = "step: {} evaluation metric: {}".format(
current_step, eval_outputs)
self._log_info(info)
self.eval_summary_manager.write_summaries(eval_outputs)
self.eval_summary_manager.flush()
def _maybe_save_checkpoints(self, current_step, force_trigger=False):
if self.checkpoint_manager and self.checkpoint_manager.checkpoint_interval:
ckpt_path = self.checkpoint_manager.save(
checkpoint_number=current_step, check_interval=not force_trigger)
if ckpt_path is not None:
logging.info("Saved checkpoins in %s", ckpt_path)
def _maybe_evaluate(self, current_step, force_trigger=False):
if self.eval_trigger(current_step, force_trigger):
self._evaluate_once(current_step)
def _log_info(self, message):
"""Logs `message` to the `info` log, and also prints to stdout."""
logging.info(message)
print(message)
def train(self, evaluate=True):
"""Runs the training, with optional evaluation.
This handles evaluation, gathering summaries, and saving checkpoints.
Args:
evaluate: A boolean indicating whether to perform evaluation during
training.
Raises:
RuntimeError: If `global_step` is not updated correctly in `train_fn`.
"""
if self.train_fn is None:
raise ValueError("`self.train_fn` is required when calling `train` "
"method.")
if self.global_step is None:
raise ValueError("`self.global_step` is required when calling `train` "
"method.")
if evaluate and self.eval_fn is None:
raise ValueError("`self.eval_fn` is required when calling `train` method "
"with `evaluate=True`")
step_timer = _StepTimer(self.global_step)
current_step = self.global_step.numpy()
logging.info("Train at step %s of %s", current_step, self.train_steps)
while current_step < self.train_steps:
# Calculates steps to run for the next train loop.
steps_per_loop = min(self.train_steps - current_step, self.steps_per_loop)
logging.info("Entering training loop with %s steps, at step %s of %s",
steps_per_loop, current_step, self.train_steps)
current_step += steps_per_loop
steps_per_loop = tf.convert_to_tensor(steps_per_loop, dtype=tf.int32)
with self.summary_manager.summary_writer.as_default():
train_outputs = self.train_fn(steps_per_loop)
# Updates and verifies the current step after a training loop finishes.
if current_step != self.global_step.numpy():
raise RuntimeError("`self.train_fn` is not updating `global_step` "
"correctly, expected: %s, actual: %s" %
(current_step, self.global_step.numpy()))
# Print information like metrics and steps_per_second after a training
# loop.
if train_outputs:
train_outputs = tf.nest.map_structure(
lambda x: x.numpy(), train_outputs)
steps_per_second = step_timer.steps_per_second()
info = "step: {} steps_per_second: {:.2f} {}".format(
current_step, steps_per_second, train_outputs)
self._log_info(info)
train_outputs = train_outputs or {}
train_outputs["steps_per_second"] = steps_per_second
self.summary_manager.write_summaries(train_outputs)
self._maybe_save_checkpoints(current_step)
if evaluate:
self._maybe_evaluate(current_step)
self.summary_manager.write_summaries(train_outputs, always_write=True)
self.summary_manager.flush()
self._maybe_save_checkpoints(current_step, force_trigger=True)
if evaluate:
self._maybe_evaluate(current_step, force_trigger=True)
def evaluate(self, continuous=False, timeout_fn=None):
"""Runs the evaluation.
Args:
continuous: If `True`, will continuously monitor the checkpoint directory
to evaluate on the latest checkpoint. If `False`, will do the evaluation
once.
timeout_fn: Optional callable to call after a timeout. If the function
returns True, then it means that no new checkpoints will be generated
and the iterator will exit.
Raises:
ValueError: If no checkpoint found in `self.checkpoint_manager.directory`.
"""
if self.eval_fn is None:
raise ValueError("`self.eval_fn` should not be None to call "
"`evaluate()` method.")
if not continuous and timeout_fn is not None:
raise ValueError("`timeout_fn` can be only passed when `continuous` is "
"True")
if continuous:
for checkpoint_path in tf.train.checkpoints_iterator(
self.checkpoint_manager.directory, timeout_fn=timeout_fn):
self._restore_model(checkpoint_path)
self._evaluate_once(self.global_step.numpy())
return
latest_checkpoint = self.checkpoint_manager.latest_checkpoint
if not latest_checkpoint:
raise ValueError("no checkpoint found in dir %s" %
self.checkpoint_manager.directory)
self._restore_model()
self._evaluate_once(self.global_step.numpy())
class _StepTimer(object):
"""Utility class for measuring steps/second."""
def __init__(self, step):
self.step = step
self.start()
def start(self):
self.last_iteration = self.step.numpy()
self.last_time = time.time()
def steps_per_second(self, restart=True):
value = ((self.step.numpy() - self.last_iteration) /
(time.time() - self.last_time))
if restart:
self.start()
return value
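# A hedged usage sketch (illustrative, not part of the original module):
# `runnable` is assumed to expose `train(num_steps)`, `evaluate(num_steps)`
# and an integer `tf.Variable` global step, like the TestRunnable in the
# companion test below. The step counts are illustrative.
def _example_controller_usage(runnable, summary_root):
  test_controller = Controller(
      train_fn=runnable.train,
      eval_fn=runnable.evaluate,
      global_step=runnable.global_step,
      train_steps=1000,
      steps_per_loop=100,
      summary_dir=summary_root + "/train",
      summary_interval=100,
      eval_summary_dir=summary_root + "/eval",
      eval_steps=50,
      eval_interval=500)
  test_controller.train(evaluate=True)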
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for official.staging.training.controller."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl.testing import parameterized
import numpy as np
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.staging.training import controller
from official.staging.training import standard_runnable
def all_strategy_combinations():
"""Gets combinations of distribution strategies."""
return combinations.combine(
strategy=[
strategy_combinations.one_device_strategy,
strategy_combinations.tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
strategy_combinations.mirrored_strategy_with_gpu_and_cpu,
],
mode="eager",
)
def create_model():
x = tf.keras.layers.Input(shape=(3,), name="input")
y = tf.keras.layers.Dense(4, name="dense")(x)
model = tf.keras.Model(x, y)
return model
def summaries_with_matching_keyword(keyword, summary_dir):
"""Yields summary protos matching given keyword from event file."""
event_paths = tf.io.gfile.glob(os.path.join(summary_dir, "events*"))
for event in tf.compat.v1.train.summary_iterator(event_paths[-1]):
if event.summary is not None:
for value in event.summary.value:
if keyword in value.tag:
tf.compat.v1.logging.error(event)
yield event.summary
def check_eventfile_for_keyword(keyword, summary_dir):
"""Checks event files for the keyword."""
return any(summaries_with_matching_keyword(keyword, summary_dir))
def dataset_fn(ctx):
del ctx
inputs = np.zeros((10, 3), dtype=np.float32)
targets = np.zeros((10, 4), dtype=np.float32)
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset = dataset.repeat(100)
dataset = dataset.batch(10, drop_remainder=True)
return dataset
class TestRunnable(standard_runnable.StandardTrainable,
standard_runnable.StandardEvaluable):
"""Implements the training and evaluation APIs for the test model."""
def __init__(self):
standard_runnable.StandardTrainable.__init__(self)
standard_runnable.StandardEvaluable.__init__(self)
self.strategy = tf.distribute.get_strategy()
self.model = create_model()
self.optimizer = tf.keras.optimizers.RMSprop()
self.global_step = self.optimizer.iterations
self.train_loss = tf.keras.metrics.Mean("train_loss", dtype=tf.float32)
self.eval_loss = tf.keras.metrics.Mean("eval_loss", dtype=tf.float32)
def build_train_dataset(self):
return self.strategy.experimental_distribute_datasets_from_function(
dataset_fn)
def train_step(self, iterator):
def _replicated_step(inputs):
"""Replicated training step."""
inputs, targets = inputs
with tf.GradientTape() as tape:
outputs = self.model(inputs)
loss = tf.math.reduce_sum(outputs - targets)
grads = tape.gradient(loss, self.model.variables)
self.optimizer.apply_gradients(zip(grads, self.model.variables))
self.train_loss.update_state(loss)
self.strategy.run(_replicated_step, args=(next(iterator),))
def train_loop_end(self):
return {
"loss": self.train_loss.result(),
}
def build_eval_dataset(self):
return self.strategy.experimental_distribute_datasets_from_function(
dataset_fn)
def eval_begin(self):
self.eval_loss.reset_states()
def eval_step(self, iterator):
def _replicated_step(inputs):
"""Replicated evaluation step."""
inputs, targets = inputs
outputs = self.model(inputs)
loss = tf.math.reduce_sum(outputs - targets)
self.eval_loss.update_state(loss)
self.strategy.run(_replicated_step, args=(next(iterator),))
def eval_end(self):
return {
"eval_loss": self.eval_loss.result(),
}
class ControllerTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(ControllerTest, self).setUp()
self.model_dir = self.get_temp_dir()
def test_no_checkpoint(self):
test_runnable = TestRunnable()
# No checkpoint manager and no strategy.
test_controller = controller.Controller(
train_fn=test_runnable.train,
eval_fn=test_runnable.evaluate,
global_step=test_runnable.global_step,
train_steps=10,
steps_per_loop=2,
summary_dir=os.path.join(self.model_dir, "summaries/train"),
summary_interval=2,
eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
eval_steps=2,
eval_interval=5)
test_controller.train(evaluate=True)
self.assertEqual(test_runnable.global_step.numpy(), 10)
# Loss and accuracy values should be written into summaries.
self.assertNotEmpty(
tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
self.assertTrue(
check_eventfile_for_keyword(
"loss", os.path.join(self.model_dir, "summaries/train")))
self.assertNotEmpty(
tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
self.assertTrue(
check_eventfile_for_keyword(
"eval_loss", os.path.join(self.model_dir, "summaries/eval")))
# No checkpoint, so global step starts from 0.
test_runnable.global_step.assign(0)
test_controller.train(evaluate=True)
self.assertEqual(test_runnable.global_step.numpy(), 10)
def test_no_checkpoint_and_summaries(self):
test_runnable = TestRunnable()
# No checkpoint + summary directories.
test_controller = controller.Controller(
train_fn=test_runnable.train,
eval_fn=test_runnable.evaluate,
global_step=test_runnable.global_step,
train_steps=10,
steps_per_loop=2,
eval_steps=2,
eval_interval=5)
test_controller.train(evaluate=True)
self.assertEqual(test_runnable.global_step.numpy(), 10)
@combinations.generate(all_strategy_combinations())
def test_train_and_evaluate(self, strategy):
with strategy.scope():
test_runnable = TestRunnable()
checkpoint = tf.train.Checkpoint(
model=test_runnable.model, optimizer=test_runnable.optimizer)
checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
self.model_dir,
max_to_keep=None,
step_counter=test_runnable.global_step,
checkpoint_interval=10)
test_controller = controller.Controller(
strategy=strategy,
train_fn=test_runnable.train,
eval_fn=test_runnable.evaluate,
global_step=test_runnable.global_step,
train_steps=10,
steps_per_loop=2,
summary_dir=os.path.join(self.model_dir, "summaries/train"),
summary_interval=2,
checkpoint_manager=checkpoint_manager,
eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
eval_steps=2,
eval_interval=5)
test_controller.train(evaluate=True)
# Checkpoints are saved.
self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
# Loss and accuracy values should be written into summaries.
self.assertNotEmpty(
tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
self.assertTrue(
check_eventfile_for_keyword(
"loss", os.path.join(self.model_dir, "summaries/train")))
self.assertNotEmpty(
tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
self.assertTrue(
check_eventfile_for_keyword(
"eval_loss", os.path.join(self.model_dir, "summaries/eval")))
@combinations.generate(all_strategy_combinations())
def test_train_only(self, strategy):
with strategy.scope():
test_runnable = TestRunnable()
checkpoint = tf.train.Checkpoint(
model=test_runnable.model, optimizer=test_runnable.optimizer)
checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
self.model_dir,
max_to_keep=None,
step_counter=test_runnable.global_step,
checkpoint_interval=10)
test_controller = controller.Controller(
strategy=strategy,
train_fn=test_runnable.train,
global_step=test_runnable.global_step,
train_steps=10,
steps_per_loop=2,
summary_dir=os.path.join(self.model_dir, "summaries/train"),
summary_interval=2,
checkpoint_manager=checkpoint_manager,
eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
)
test_controller.train(evaluate=False)
# Checkpoints are saved.
self.assertNotEmpty(tf.io.gfile.glob(os.path.join(self.model_dir, "ckpt*")))
# Only train summaries are written.
self.assertNotEmpty(
tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/train")))
self.assertTrue(
check_eventfile_for_keyword(
"loss", os.path.join(self.model_dir, "summaries/train")))
self.assertFalse(
tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/eval")))
@combinations.generate(all_strategy_combinations())
def test_evaluate_only(self, strategy):
with strategy.scope():
test_runnable = TestRunnable()
checkpoint = tf.train.Checkpoint(model=test_runnable.model)
checkpoint.save(os.path.join(self.model_dir, "ckpt"))
checkpoint_manager = tf.train.CheckpointManager(
checkpoint,
self.model_dir,
max_to_keep=None,
step_counter=test_runnable.global_step)
test_controller = controller.Controller(
strategy=strategy,
eval_fn=test_runnable.evaluate,
global_step=test_runnable.global_step,
checkpoint_manager=checkpoint_manager,
summary_dir=os.path.join(self.model_dir, "summaries/train"),
eval_summary_dir=os.path.join(self.model_dir, "summaries/eval"),
eval_steps=2,
eval_interval=5)
test_controller.evaluate()
# Only eval summaries are written
self.assertFalse(
tf.io.gfile.exists(os.path.join(self.model_dir, "summaries/train")))
self.assertNotEmpty(
tf.io.gfile.listdir(os.path.join(self.model_dir, "summaries/eval")))
self.assertTrue(
check_eventfile_for_keyword(
"eval_loss", os.path.join(self.model_dir, "summaries/eval")))
if __name__ == "__main__":
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""An abstraction that users can easily handle their custom training loops."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import abc
import six
import tensorflow.compat.v2 as tf
from typing import Dict, Optional, Text
@six.add_metaclass(abc.ABCMeta)
class AbstractTrainable(tf.Module):
"""An abstract class defining the APIs required for training."""
@abc.abstractmethod
def train(self,
num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""Implements model training with multiple steps.
In training, it is common to break the total training steps into several
training loops, so users can do checkpointing, write summaries and run some
python callbacks. This is necessary for getting good performance in TPU
training, as the overhead for launching a multi worker tf.function may be
large in Eager mode. It is usually encouraged to create a host training loop
(e.g. using a `tf.range` wrapping `strategy.run` inside a
`tf.function`) in the TPU case. For cases that don't require a host
training loop to achieve peak performance, users can just implement a simple
python loop to drive each step.
Args:
num_steps: A guideline for how many training steps to run. Note that it is
up to the model what constitutes a "step" (this may involve more than
one update to model parameters, e.g. if training a GAN).
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
"""
pass
@six.add_metaclass(abc.ABCMeta)
class AbstractEvaluable(tf.Module):
"""An abstract class defining the APIs required for evaluation."""
@abc.abstractmethod
def evaluate(
self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""Implements model evaluation.
Args:
num_steps: A guideline for how many evaluation steps to run. Note that it
is up to the model what constitutes a "step". Generally, it may be
desirable to support both a limited number of eval steps and iterating
over a full dataset (however many steps are required) when `num_steps`
is `None`.
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
"""
pass
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""An abstraction that users can easily handle their custom training loops."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import abc
import six
import tensorflow.compat.v2 as tf
from typing import Dict, Optional, Text
from official.staging.training import runnable
from official.staging.training import utils
@six.add_metaclass(abc.ABCMeta)
class StandardTrainable(runnable.AbstractTrainable):
"""Implements the standard functionality of AbstractTrainable APIs."""
def __init__(self, use_tf_while_loop=True, use_tf_function=True):
if use_tf_while_loop and not use_tf_function:
raise ValueError("`use_tf_while_loop=True` and `use_tf_function=False` "
"is not supported")
self.use_tf_while_loop = use_tf_while_loop
self.use_tf_function = use_tf_function
self.train_dataset = None
self.train_iter = None
self.train_loop_fn = None
@abc.abstractmethod
def build_train_dataset(self):
"""Builds the training datasets.
Returns:
A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
"""
pass
def train(self,
num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""See base class."""
if self.train_dataset is None:
# Build train input dataset
self.train_dataset = self.build_train_dataset()
self.train_iter = tf.nest.map_structure(iter, self.train_dataset)
if self.train_loop_fn is None:
train_fn = self.train_step
if self.use_tf_while_loop:
self.train_loop_fn = utils.create_tf_while_loop_fn(train_fn)
else:
if self.use_tf_function:
train_fn = tf.function(train_fn)
self.train_loop_fn = utils.create_loop_fn(train_fn)
self.train_loop_begin()
self.train_loop_fn(self.train_iter, num_steps)
return self.train_loop_end()
def train_loop_begin(self):
"""Called once at the beginning of the training loop.
This is a good place to reset metrics that accumulate values over multiple
steps of training.
"""
pass
@abc.abstractmethod
def train_step(self, iterator):
"""Implements one step of training.
What a "step" consists of is up to the implementer. If using distribution
strategies, the call to this method should take place in the "cross-replica
context" for generality, to allow e.g. multiple iterator dequeues and calls
to `strategy.run`.
Args:
iterator: A tf.nest-compatible structure of tf.data Iterator or
DistributedIterator.
"""
pass
def train_loop_end(self) -> Optional[Dict[Text, tf.Tensor]]:
"""Called at the end of the training loop.
This is a good place to get metric results. The value returned from this
function will be returned as-is from the train() method.
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
"""
pass
@six.add_metaclass(abc.ABCMeta)
class StandardEvaluable(runnable.AbstractEvaluable):
"""Implements the standard functionality of AbstractEvaluable APIs."""
def __init__(self, use_tf_function=True):
self.eval_use_tf_function = use_tf_function
self.eval_dataset = None
self.eval_loop_fn = None
@abc.abstractmethod
def build_eval_dataset(self):
"""Builds the evaluation datasets.
Returns:
A tf.nest-compatible structure of tf.data.Dataset or DistributedDataset.
"""
pass
def evaluate(
self, num_steps: Optional[tf.Tensor]) -> Optional[Dict[Text, tf.Tensor]]:
"""See base class."""
if self.eval_dataset is None:
# Build eval input dataset
self.eval_dataset = self.build_eval_dataset()
if self.eval_loop_fn is None:
eval_fn = self.eval_step
if self.eval_use_tf_function:
eval_fn = tf.function(eval_fn)
self.eval_loop_fn = utils.create_loop_fn(eval_fn)
eval_iter = tf.nest.map_structure(iter, self.eval_dataset)
self.eval_begin()
self.eval_loop_fn(eval_iter, num_steps)
return self.eval_end()
def eval_begin(self):
"""Called once at the beginning of the evaluation.
This is a good place to reset metrics that accumulate values over the entire
evaluation.
"""
pass
@abc.abstractmethod
def eval_step(self, iterator):
"""Implements one step of evaluation.
What a "step" consists of is up to the implementer. If using distribution
strategies, the call to this method should take place in the "cross-replica
context" for generality, to allow e.g. multiple iterator dequeues and calls
to `strategy.run`.
Args:
iterator: A tf.nest-compatible structure of tf.data Iterator or
DistributedIterator.
"""
pass
def eval_end(self) -> Optional[Dict[Text, tf.Tensor]]:
"""Called at the end of the evaluation.
This is a good place to get metric results. The value returned from this
function will be returned as-is from the evaluate() method.
Returns:
The function may return a dictionary of `Tensors`, which will be
written to logs and as TensorBoard summaries.
"""
pass
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Some layered modules/functions to help users writing custom training loop."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import abc
import inspect
import six
import tensorflow.compat.v2 as tf
def create_loop_fn(step_fn):
"""Creates a multiple steps function driven by the python while loop.
Args:
step_fn: A function which takes `iterator` as input.
Returns:
A callable defined as the `loop_fn` definition below.
"""
def loop_fn(iterator, num_steps, state=None, reduce_fn=None):
"""A loop function with multiple steps.
Args:
iterator: A nested structure of tf.data `Iterator` or
`DistributedIterator`.
num_steps: The number of steps in the loop. If `num_steps==-1`, will
iterate until exhausting the iterator.
state: An optional initial state before running the loop.
reduce_fn: a callable defined as `def reduce_fn(state, value)`, where
`value` is the outputs from `step_fn`.
Returns:
The updated state.
"""
try:
step = 0
# To make sure the OutOfRangeError exception can be handled well with
# async remote eager, we need to wrap the loop body in an `async_scope`.
with tf.experimental.async_scope():
while (num_steps == -1 or step < num_steps):
outputs = step_fn(iterator)
if reduce_fn is not None:
state = reduce_fn(state, outputs)
step += 1
return state
except (StopIteration, tf.errors.OutOfRangeError):
tf.experimental.async_clear_error()
return state
return loop_fn
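# A hedged usage sketch (illustrative): collect every step output into a list
# until the iterator is exhausted. `step_fn` here simply dequeues one batch.
def _example_create_loop_fn_usage(dataset):
  def step_fn(iterator):
    return next(iterator)
  loop_fn = create_loop_fn(step_fn)
  # num_steps=-1 iterates until StopIteration/OutOfRangeError is raised.
  return loop_fn(iter(dataset), num_steps=-1, state=[],
                 reduce_fn=lambda state, outputs: state + [outputs])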
def create_tf_while_loop_fn(step_fn):
"""Create a multiple steps function driven by tf.while_loop on the host.
Args:
step_fn: A function which takes `iterator` as input.
Returns:
A callable defined as the `loop_fn` definition below.
"""
@tf.function
def loop_fn(iterator, num_steps):
"""A loop function with multiple steps.
Args:
iterator: A nested structure of tf.data `Iterator` or
`DistributedIterator`.
num_steps: The number of steps in the loop. Must be a tf.Tensor.
"""
if not isinstance(num_steps, tf.Tensor):
raise ValueError("`num_steps` should be an `tf.Tensor`. Python object "
"may cause retracing.")
for _ in tf.range(num_steps):
step_fn(iterator)
return loop_fn
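# A hedged usage sketch (illustrative): drive a training step with the host
# tf.while_loop. `train_step_fn` is assumed to dequeue from `iterator` itself;
# note that `num_steps` must already be a `tf.Tensor` to avoid retracing.
def _example_tf_while_loop_usage(train_step_fn, iterator):
  loop_fn = create_tf_while_loop_fn(train_step_fn)
  loop_fn(iterator, tf.convert_to_tensor(100, dtype=tf.int32))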
def make_distributed_dataset(strategy, dataset_or_fn, *args, **kwargs):
"""A helper function to create distributed dataset.
Args:
strategy: An instance of `tf.distribute.Strategy`.
dataset_or_fn: An instance of `tf.data.Dataset`, or a function which takes
a `tf.distribute.InputContext` as input and returns a `tf.data.Dataset`.
If it is a function, it may optionally have an argument named
`input_context`, of type `tf.distribute.InputContext`.
*args: The list of arguments to be passed to dataset_or_fn.
**kwargs: Any keyword arguments to be passed.
Returns:
A distributed Dataset.
"""
if strategy is None:
strategy = tf.distribute.get_strategy()
if isinstance(dataset_or_fn, tf.data.Dataset):
return strategy.experimental_distribute_dataset(dataset_or_fn)
if not callable(dataset_or_fn):
raise ValueError("`dataset_or_fn` should be either callable or an instance "
"of `tf.data.Dataset`")
def dataset_fn(ctx):
"""Wrapped dataset function for creating distributed dataset.."""
# If `dataset_or_fn` is a function and has `input_context` as argument
# names, pass `ctx` as the value of `input_context` when calling
# `dataset_or_fn`. Otherwise `ctx` will not be used when calling
# `dataset_or_fn`.
if six.PY3:
argspec = inspect.getfullargspec(dataset_or_fn)
else:
argspec = inspect.getargspec(dataset_or_fn)
args_names = argspec.args
if "input_context" in args_names:
kwargs["input_context"] = ctx
ds = dataset_or_fn(*args, **kwargs)
return ds
return strategy.experimental_distribute_datasets_from_function(dataset_fn)
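# A hedged usage sketch (illustrative): a dataset function that declares
# `input_context` receives the per-replica context automatically; the batch
# size and range below are assumptions for demonstration.
def _example_make_distributed_dataset(strategy):
  def per_replica_dataset_fn(batch_size, input_context=None):
    del input_context  # Sharding decisions could be made from this.
    return tf.data.Dataset.range(64).batch(batch_size)
  return make_distributed_dataset(
      strategy, per_replica_dataset_fn, batch_size=8)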
class SummaryManager(object):
"""A class manages writing summaries."""
def __init__(self,
summary_writer,
summary_fn,
global_step=None,
summary_interval=None):
"""Construct a summary manager object.
Args:
summary_writer: A `tf.summary.SummaryWriter` instance for writing
summaries.
summary_fn: A callable defined as `def summary_fn(name, tensor,
step=None)`, which describes the summary operation.
global_step: A `tf.Variable` instance for checking the current global step
value, in case users want to save summaries every N steps.
summary_interval: An integer indicating the minimum step interval between
two summaries.
"""
if summary_writer is not None:
self._summary_writer = summary_writer
self._enabled = True
else:
self._summary_writer = tf.summary.create_noop_writer()
self._enabled = False
self._summary_fn = summary_fn
if global_step is None:
self._global_step = tf.summary.experimental.get_step()
else:
self._global_step = global_step
if summary_interval is not None:
if self._global_step is None:
raise ValueError("`summary_interval` is not None, but no `global_step` "
"can be obtained ")
self._last_summary_step = self._global_step.numpy()
self._summary_interval = summary_interval
@property
def summary_interval(self):
return self._summary_interval
@property
def summary_writer(self):
"""Returns the underlying summary writer."""
return self._summary_writer
def flush(self):
"""Flush the underlying summary writer."""
if self._enabled:
tf.summary.flush(self._summary_writer)
def write_summaries(self, items, always_write=True):
"""Write a bulk of summaries.
Args:
items: a dictionary of `Tensors` for writing summaries.
always_write: An optional boolean. If `True`, the manager will always
write summaries unless the summaries have been written for the same
step. Otherwise the manager will only write the summaries if the
interval between summaries is larger than `summary_interval`.
Returns:
A boolean indicating whether the summaries were written.
"""
# TODO(rxsang): Support writing summaries with nested structure, so users
# can split the summaries into different directories for nicer visualization
# in Tensorboard, like train and eval metrics.
if not self._enabled:
return False
if self._summary_interval is not None:
current_step = self._global_step.numpy()
if current_step == self._last_summary_step:
return False
if not always_write and current_step < (self._last_summary_step +
self._summary_interval):
return False
self._last_summary_step = current_step
with self._summary_writer.as_default():
for name, tensor in items.items():
self._summary_fn(name, tensor, step=self._global_step)
return True
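# A hedged usage sketch (illustrative): write a scalar dictionary at most
# every 100 steps. `global_step` must be a `tf.Variable` so the manager can
# read its value; the directory and values are assumptions.
def _example_summary_manager_usage(log_dir, global_step):
  manager = SummaryManager(
      tf.summary.create_file_writer(log_dir),
      tf.summary.scalar,
      global_step=global_step,
      summary_interval=100)
  written = manager.write_summaries(
      {"loss": tf.constant(0.25)}, always_write=False)
  manager.flush()
  return written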
@six.add_metaclass(abc.ABCMeta)
class Trigger(object):
"""An abstract class representing a "trigger" for some event."""
@abc.abstractmethod
def __call__(self, value: float, force_trigger=False):
"""Maybe trigger the event based on the given value.
Args:
      value: The value for triggering.
      force_trigger: Whether the event is force-triggered.
Returns:
`True` if the trigger is triggered on the given `value`, and
`False` otherwise.
"""
@abc.abstractmethod
def reset(self):
"""Reset states in the trigger."""
class IntervalTrigger(Trigger):
"""Triggers on every fixed interval."""
def __init__(self, interval, start=0):
"""Constructs the IntervalTrigger.
Args:
interval: The triggering interval.
start: An initial value for the trigger.
"""
self._interval = interval
self._last_trigger_value = start
def __call__(self, value, force_trigger=False):
"""Maybe trigger the event based on the given value.
Args:
      value: The value for triggering.
      force_trigger: If True, the trigger fires unconditionally unless the
        last trigger value is equal to `value`.
Returns:
`True` if the trigger is triggered on the given `value`, and
`False` otherwise.
"""
if force_trigger and value != self._last_trigger_value:
self._last_trigger_value = value
return True
if self._interval and self._interval > 0:
if value >= self._last_trigger_value + self._interval:
self._last_trigger_value = value
return True
return False
def reset(self):
"""See base class."""
self._last_trigger_value = 0
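# Behavior sketch (illustrative, not from the original file): the trigger
# fires once the value has advanced by at least `interval` since the last
# firing, and the last trigger value is updated to the triggering value:
#
#   trigger = IntervalTrigger(interval=1000)
#   trigger(500)   # False: 500 < 0 + 1000
#   trigger(1000)  # True, last trigger value becomes 1000
#   trigger(1500)  # False: 1500 < 1000 + 1000
#   trigger(2100)  # True, last trigger value becomes 2100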
class EpochHelper(object):
"""A Helper class to handle epochs in Customized Training Loop."""
def __init__(self, epoch_steps, global_step):
"""Constructs the EpochHelper.
Args:
      epoch_steps: An integer indicating the number of steps in an epoch.
      global_step: A `tf.Variable` instance indicating the current global
        step.
"""
self._epoch_steps = epoch_steps
self._global_step = global_step
self._current_epoch = None
self._epoch_start_step = None
self._in_epoch = False
def epoch_begin(self):
"""Returns whether a new epoch should begin."""
if self._in_epoch:
return False
current_step = self._global_step.numpy()
self._epoch_start_step = current_step
self._current_epoch = current_step // self._epoch_steps
self._in_epoch = True
return True
def epoch_end(self):
"""Returns whether the current epoch should end."""
if not self._in_epoch:
raise ValueError("`epoch_end` can only be called inside an epoch")
current_step = self._global_step.numpy()
epoch = current_step // self._epoch_steps
if epoch > self._current_epoch:
self._in_epoch = False
return True
return False
@property
def batch_index(self):
"""Index of the next batch within the current epoch."""
return self._global_step.numpy() - self._epoch_start_step
@property
def current_epoch(self):
return self._current_epoch
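# Typical usage in a custom training loop (a sketch, not from the original
# file; `train_one_step` and `total_steps` are placeholders):
#
#   global_step = tf.Variable(0, dtype=tf.int64)
#   helper = EpochHelper(epoch_steps=100, global_step=global_step)
#   while global_step.numpy() < total_steps:
#     if helper.epoch_begin():
#       pass  # Per-epoch setup, e.g. resetting metrics.
#     train_one_step()  # Must increment `global_step`.
#     if helper.epoch_end():
#       pass  # Per-epoch teardown, e.g. writing epoch summaries.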
@@ -14,18 +14,16 @@
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
from absl import app
from absl import flags
from absl import logging
import orbit
import tensorflow as tf
from official.modeling import performance
from official.staging.training import controller
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
@@ -87,15 +85,6 @@ def get_num_train_iterations(flags_obj):
return train_steps, train_epochs, eval_steps
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
    raise ValueError('steps_per_loop should be a positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
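# Worked example (illustrative, not from the original diff): with
# steps_per_epoch=100, steps_per_loop=30 and steps_in_current_epoch=90,
# this returns min(30, 100 - 90) = 10, so a device loop never runs past
# the current epoch boundary.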
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
@@ -121,7 +110,6 @@ def run(flags_obj):
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
# TODO(anj-s): Set data_format without using Keras.
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
@@ -137,7 +125,14 @@ def run(flags_obj):
per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
flags_obj)
steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
if not flags_obj.steps_per_loop:
steps_per_loop = per_epoch_steps
elif flags_obj.steps_per_loop > per_epoch_steps:
steps_per_loop = per_epoch_steps
logging.warn('Setting steps_per_loop to %d to respect epoch boundary.',
steps_per_loop)
else:
steps_per_loop = flags_obj.steps_per_loop
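  # Illustrative resolution of the branches above (not from the original
  # diff): with per_epoch_steps=500, steps_per_loop=0 resolves to 500,
  # steps_per_loop=800 is clamped to 500 with a warning, and
  # steps_per_loop=100 is used unchanged.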
logging.info(
'Training %d epochs, each epoch has %d steps, '
@@ -154,8 +149,8 @@ def run(flags_obj):
eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
checkpoint_interval = (
per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None)
summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
@@ -164,20 +159,24 @@
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
resnet_controller = controller.Controller(
resnet_controller = orbit.Controller(
strategy,
runnable.train,
runnable.evaluate if not flags_obj.skip_eval else None,
runnable,
runnable if not flags_obj.skip_eval else None,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
train_steps=per_epoch_steps * train_epochs,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
eval_steps=eval_steps,
eval_interval=eval_interval)
eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))
time_callback.on_train_begin()
resnet_controller.train(evaluate=not flags_obj.skip_eval)
if not flags_obj.skip_eval:
resnet_controller.train_and_evaluate(
train_steps=per_epoch_steps * train_epochs,
eval_steps=eval_steps,
eval_interval=eval_interval)
else:
resnet_controller.train(steps=per_epoch_steps * train_epochs)
time_callback.on_train_end()
stats = build_stats(runnable, time_callback)
@@ -14,33 +14,21 @@
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import orbit
import tensorflow as tf
from official.modeling import performance
from official.staging.training import grad_utils
from official.staging.training import standard_runnable
from official.staging.training import utils
from official.utils.flags import core as flags_core
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import imagenet_preprocessing
from official.vision.image_classification.resnet import resnet_model
class ResnetRunnable(standard_runnable.StandardTrainable,
standard_runnable.StandardEvaluable):
class ResnetRunnable(orbit.StandardTrainer, orbit.StandardEvaluator):
"""Implements the training and evaluation APIs for Resnet model."""
def __init__(self, flags_obj, time_callback, epoch_steps):
standard_runnable.StandardTrainable.__init__(self,
flags_obj.use_tf_while_loop,
flags_obj.use_tf_function)
standard_runnable.StandardEvaluable.__init__(self,
flags_obj.use_tf_function)
self.strategy = tf.distribute.get_strategy()
self.flags_obj = flags_obj
self.dtype = flags_core.get_tf_dtype(flags_obj)
@@ -107,11 +95,8 @@ class ResnetRunnable(standard_runnable.StandardTrainable,
# Handling epochs.
self.epoch_steps = epoch_steps
self.epoch_helper = utils.EpochHelper(epoch_steps, self.global_step)
def build_train_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.epoch_helper = orbit.utils.EpochHelper(epoch_steps, self.global_step)
train_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=True,
@@ -122,17 +107,20 @@ class ResnetRunnable(standard_runnable.StandardTrainable,
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=True)
def build_eval_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=False,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
parse_record_fn=imagenet_preprocessing.parse_record,
dtype=self.dtype)
orbit.StandardTrainer.__init__(self, train_dataset,
flags_obj.use_tf_while_loop,
flags_obj.use_tf_function)
if not flags_obj.skip_eval:
eval_dataset = orbit.utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=False,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
parse_record_fn=imagenet_preprocessing.parse_record,
dtype=self.dtype)
orbit.StandardEvaluator.__init__(self, eval_dataset,
flags_obj.use_tf_function)
def train_loop_begin(self):
"""See base class."""
@@ -151,8 +151,10 @@ class Controller(object):
checkpoint_interval, steps_per_loop, interval_name="checkpoint")
model_restored = self.restore_checkpoint()
if not model_restored and checkpoint_interval:
# If the model is not restored from a checkpoint, save an initial
if not model_restored and (checkpoint_interval and
self.trainer is not None):
# If the model is not restored from a checkpoint, and
# `checkpoint_interval` is enabled for training, save an initial
# checkpoint.
self.save_checkpoint()
@@ -341,8 +343,8 @@ class Controller(object):
# Calculates steps to run for the next train loop.
current_step = self.global_step.numpy()
logging.info("Entering training loop at step %s of %s", current_step,
num_steps)
logging.info("Entering training loop at step %s to run %s steps",
current_step, num_steps)
current_step += num_steps
num_steps = tf.convert_to_tensor(num_steps, dtype=tf.int32)
@@ -113,17 +113,23 @@ def _get_files(data, dataset_split):
Args:
data: String, desired data ('image' or 'label').
dataset_split: String, dataset split ('train', 'val', 'test')
dataset_split: String, dataset split ('train_fine', 'val_fine', 'test_fine')
Returns:
A list of sorted file names or None when getting label for
test set.
"""
if data == 'label' and dataset_split == 'test':
return None
if dataset_split == 'train_fine':
split_dir = 'train'
elif dataset_split == 'val_fine':
split_dir = 'val'
elif dataset_split == 'test_fine':
split_dir = 'test'
else:
raise RuntimeError("Split {} is not supported".format(dataset_split))
pattern = '*%s.%s' % (_POSTFIX_MAP[data], _DATA_FORMAT_MAP[data])
search_files = os.path.join(
FLAGS.cityscapes_root, _FOLDERS_MAP[data], dataset_split, '*', pattern)
FLAGS.cityscapes_root, _FOLDERS_MAP[data], split_dir, '*', pattern)
filenames = glob.glob(search_files)
return sorted(filenames)
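# Illustrative example (a sketch; the folder and postfix maps are defined
# elsewhere in this file): with data='image' and dataset_split='train_fine',
# the glob pattern resolves to roughly
#   <cityscapes_root>/<image folder>/train/*/*<image postfix>.png
# and the matching file names are returned in sorted order.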
@@ -132,7 +138,7 @@ def _convert_dataset(dataset_split):
"""Converts the specified dataset split to TFRecord format.
Args:
dataset_split: The dataset split (e.g., train, val).
dataset_split: The dataset split (e.g., train_fine, val_fine).
Raises:
RuntimeError: If loaded image and label have different shape, or if the
@@ -142,8 +148,12 @@ def _convert_dataset(dataset_split):
label_files = _get_files('label', dataset_split)
num_images = len(image_files)
num_labels = len(label_files)
num_per_shard = int(math.ceil(num_images / _NUM_SHARDS))
if num_images != num_labels:
raise RuntimeError("The number of images and labels doesn't match: {} {}".format(num_images, num_labels))
image_reader = build_data.ImageReader('png', channels=3)
label_reader = build_data.ImageReader('png', channels=1)
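  # Illustrative sharding arithmetic (not from the original diff; the value
  # of _NUM_SHARDS is assumed): with the 2975 'train_fine' images and
  # _NUM_SHARDS=10, each TFRecord shard holds ceil(2975 / 10) = 298 images.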
@@ -179,8 +189,8 @@ def _convert_dataset(dataset_split):
def main(unused_argv):
# Only support converting 'train' and 'val' sets for now.
for dataset_split in ['train', 'val']:
# Only support converting 'train_fine', 'val_fine' and 'test_fine' sets for now.
for dataset_split in ['train_fine', 'val_fine', 'test_fine']:
_convert_dataset(dataset_split)
@@ -42,6 +42,8 @@ WORK_DIR="."
# Root path for Cityscapes dataset.
CITYSCAPES_ROOT="${WORK_DIR}/cityscapes"
export PYTHONPATH="${CITYSCAPES_ROOT}:${PYTHONPATH}"
# Create training labels.
python "${CITYSCAPES_ROOT}/cityscapesscripts/preparation/createTrainIdLabelImgs.py"
@@ -81,8 +81,8 @@ DatasetDescriptor = collections.namedtuple(
_CITYSCAPES_INFORMATION = DatasetDescriptor(
splits_to_sizes={
'train': 2975,
'val': 500,
'train_fine': 2975,
'val_fine': 500,
},
num_classes=19,
ignore_label=255,
@@ -43,7 +43,7 @@ A local training job using `xception_65` can be run with the following command:
python deeplab/train.py \
--logtostderr \
--training_number_of_steps=90000 \
--train_split="train" \
--train_split="train_fine" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
@@ -95,7 +95,7 @@ command:
# From tensorflow/models/research/
python deeplab/eval.py \
--logtostderr \
--eval_split="val" \
--eval_split="val_fine" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
@@ -121,7 +121,7 @@ command:
# From tensorflow/models/research/
python deeplab/vis.py \
--logtostderr \
--vis_split="val" \
--vis_split="val_fine" \
--model_variant="xception_65" \
--atrous_rates=6 \
--atrous_rates=12 \
@@ -68,6 +68,6 @@ Quick running the whole code on the PASCAL VOC 2012 dataset:
```bash
# From tensorflow/models/research/deeplab
sh local_test.sh
bash local_test.sh
```
@@ -19,7 +19,7 @@
#
# Usage:
# # From the tensorflow/models/research/deeplab directory.
# sh ./local_test.sh
# bash ./local_test.sh
#
#
@@ -42,7 +42,7 @@ python "${WORK_DIR}"/model_test.py
# Go to datasets folder and download PASCAL VOC 2012 segmentation dataset.
DATASET_DIR="datasets"
cd "${WORK_DIR}/${DATASET_DIR}"
sh download_and_convert_voc2012.sh
bash download_and_convert_voc2012.sh
# Go back to original directory.
cd "${CURRENT_DIR}"
@@ -12,7 +12,7 @@
"Welcome to the Eager Few Shot Object Detection Colab --- in this colab we demonstrate fine tuning of a (TF2 friendly) RetinaNet architecture on very few examples of a novel class after initializing from a pre-trained COCO checkpoint.\n",
"Training runs in eager mode.\n",
"\n",
"Estimated time to run through this colab (with GPU): < 5 minutes."
"Estimated time to run through this colab (with GPU): \u003c 5 minutes."
]
},
{
@@ -408,7 +408,7 @@
"source": [
"# Download the checkpoint and put it into models/research/object_detection/test_data/\n",
"\n",
"!wget http://download.tensorflow.org/models/object_detection/tf2/20200710/ssd_resnet50_v1_fpn_640x640_coco17_tpu-8.tar.gz\n",
"!wget http://download.tensorflow.org/models/object_detection/tf2/20200711/ssd_resnet50_v1_fpn_640x640_coco17_tpu-8.tar.gz\n",
"!tar -xf ssd_resnet50_v1_fpn_640x640_coco17_tpu-8.tar.gz\n",
"!mv ssd_resnet50_v1_fpn_640x640_coco17_tpu-8/checkpoint models/research/object_detection/test_data/"
]