Unverified commit 1f3247f4, authored by Ayushman Kumar and committed by GitHub

Merge pull request #6 from tensorflow/master

Updated
parents 370a4c8d 0265f59c
@@ -25,6 +25,7 @@ from __future__ import division
 from __future__ import print_function

 import tensorflow as tf  # pylint: disable=g-bad-import-order
+from absl import logging

 from official.utils.logs import hooks
 from official.utils.logs import logger
@@ -57,9 +58,9 @@ def get_train_hooks(name_list, use_tpu=False, **kwargs):
     return []

   if use_tpu:
-    tf.compat.v1.logging.warning('hooks_helper received name_list `{}`, but a '
-                                 'TPU is specified. No hooks will be used.'
-                                 .format(name_list))
+    logging.warning(
+        'hooks_helper received name_list `%s`, but a '
+        'TPU is specified. No hooks will be used.', name_list)
     return []

   train_hooks = []
...
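For readers skimming the diff: besides swapping the module, the change above also moves from eager `str.format` interpolation to absl's lazy %-style arguments. A minimal standalone sketch of the pattern (illustrative, not part of this commit):

```python
from absl import logging

# absl.logging defers interpolation: the %-style arguments are only
# formatted if the record passes the current verbosity threshold.
logging.set_verbosity(logging.INFO)
logging.warning('hooks_helper received name_list `%s`, but a '
                'TPU is specified. No hooks will be used.',
                ['loggingtensorhook'])
```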
@@ -21,12 +21,13 @@ from __future__ import print_function

 import time

+from absl import logging
 import tensorflow as tf  # pylint: disable=g-bad-import-order

 from official.utils.logs import hooks
 from official.utils.testing import mock_lib

-tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
+logging.set_verbosity(logging.DEBUG)


 class ExamplesPerSecondHookTest(tf.test.TestCase):
...
@@ -35,6 +35,7 @@ from six.moves import _thread as thread
 from absl import flags
 import tensorflow as tf
 from tensorflow.python.client import device_lib
+from absl import logging

 from official.utils.logs import cloud_lib
@@ -119,8 +120,7 @@ class BaseBenchmarkLogger(object):
       eval_results: dict, the result of evaluate.
     """
     if not isinstance(eval_results, dict):
-      tf.compat.v1.logging.warning(
-          "eval_results should be dictionary for logging. Got %s",
-          type(eval_results))
+      logging.warning("eval_results should be dictionary for logging. Got %s",
+                      type(eval_results))
       return

     global_step = eval_results[tf.compat.v1.GraphKeys.GLOBAL_STEP]
@@ -144,12 +144,12 @@ class BaseBenchmarkLogger(object):
     """
     metric = _process_metric_to_json(name, value, unit, global_step, extras)
     if metric:
-      tf.compat.v1.logging.info("Benchmark metric: %s", metric)
+      logging.info("Benchmark metric: %s", metric)

   def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
-    tf.compat.v1.logging.info(
-        "Benchmark run: %s", _gather_run_info(model_name, dataset_name,
-                                              run_params, test_id))
+    logging.info(
+        "Benchmark run: %s",
+        _gather_run_info(model_name, dataset_name, run_params, test_id))

   def on_finish(self, status):
     pass
@@ -187,7 +187,7 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
         self._metric_file_handler.write("\n")
         self._metric_file_handler.flush()
       except (TypeError, ValueError) as e:
-        tf.compat.v1.logging.warning(
+        logging.warning(
             "Failed to dump metric to log file: name %s, value %s, error %s",
             name, value, e)
@@ -212,8 +212,7 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
         json.dump(run_info, f)
         f.write("\n")
       except (TypeError, ValueError) as e:
-        tf.compat.v1.logging.warning(
-            "Failed to dump benchmark run info to log file: %s", e)
+        logging.warning("Failed to dump benchmark run info to log file: %s", e)

   def on_finish(self, status):
     self._metric_file_handler.flush()
@@ -322,8 +321,8 @@ def _process_metric_to_json(
     name, value, unit=None, global_step=None, extras=None):
   """Validate the metric data and generate JSON for insert."""
   if not isinstance(value, numbers.Number):
-    tf.compat.v1.logging.warning(
-        "Metric value to log should be a number. Got %s", type(value))
+    logging.warning("Metric value to log should be a number. Got %s",
+                    type(value))
     return None

   extras = _convert_to_json_dict(extras)
@@ -383,8 +382,7 @@ def _collect_cpu_info(run_info):
     run_info["machine_config"]["cpu_info"] = cpu_info
   except ImportError:
-    tf.compat.v1.logging.warn(
-        "'cpuinfo' not imported. CPU info will not be logged.")
+    logging.warn("'cpuinfo' not imported. CPU info will not be logged.")


 def _collect_memory_info(run_info):
@@ -396,8 +394,7 @@ def _collect_memory_info(run_info):
     run_info["machine_config"]["memory_total"] = vmem.total
     run_info["machine_config"]["memory_available"] = vmem.available
   except ImportError:
-    tf.compat.v1.logging.warn(
-        "'psutil' not imported. Memory info will not be logged.")
+    logging.warn("'psutil' not imported. Memory info will not be logged.")


 def _collect_test_environment(run_info):
...
@@ -28,6 +28,7 @@ import unittest

 import mock
 from absl.testing import flagsaver
 import tensorflow as tf  # pylint: disable=g-bad-import-order
+from absl import logging

 try:
   from google.cloud import bigquery
@@ -79,7 +80,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):
     mock_logger = mock.MagicMock()
     mock_config_benchmark_logger.return_value = mock_logger
     with logger.benchmark_context(None):
-      tf.compat.v1.logging.info("start benchmarking")
+      logging.info("start benchmarking")
     mock_logger.on_finish.assert_called_once_with(logger.RUN_STATUS_SUCCESS)

   @mock.patch("official.utils.logs.logger.config_benchmark_logger")
@@ -96,18 +97,18 @@ class BaseBenchmarkLoggerTest(tf.test.TestCase):

   def setUp(self):
     super(BaseBenchmarkLoggerTest, self).setUp()
-    self._actual_log = tf.compat.v1.logging.info
+    self._actual_log = logging.info
     self.logged_message = None

     def mock_log(*args, **kwargs):
       self.logged_message = args
       self._actual_log(*args, **kwargs)

-    tf.compat.v1.logging.info = mock_log
+    logging.info = mock_log

   def tearDown(self):
     super(BaseBenchmarkLoggerTest, self).tearDown()
-    tf.compat.v1.logging.info = self._actual_log
+    logging.info = self._actual_log

   def test_log_metric(self):
     log = logger.BaseBenchmarkLogger()
...
@@ -31,8 +31,9 @@ import re
 import subprocess
 import sys
 import typing
-
-import tensorflow as tf
+from absl import logging
+
+# pylint:disable=logging-format-interpolation

 _MIN_VERSION = (0, 0, 10)
 _STACK_OFFSET = 2
@@ -94,8 +95,7 @@ def get_mlperf_log():
     version = pkg_resources.get_distribution("mlperf_compliance")
     version = tuple(int(i) for i in version.version.split("."))
     if version < _MIN_VERSION:
-      tf.compat.v1.logging.warning(
-          "mlperf_compliance is version {}, must be >= {}".format(
-              ".".join([str(i) for i in version]),
-              ".".join([str(i) for i in _MIN_VERSION])))
+      logging.warning("mlperf_compliance is version {}, must be >= {}".format(
+          ".".join([str(i) for i in version]),
+          ".".join([str(i) for i in _MIN_VERSION])))
       raise ImportError
@@ -187,6 +187,6 @@ def clear_system_caches():

 if __name__ == "__main__":
-  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+  logging.set_verbosity(logging.INFO)
   with LOGGER(True):
     ncf_print(key=TAGS.RUN_START)
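The version gate in the hunk above leans on Python's element-wise tuple comparison; a standalone sketch of the same check with illustrative values:

```python
# Dotted version strings become integer tuples; tuples compare element-wise,
# so (0, 0, 9) < (0, 0, 10) behaves the way a human reader expects.
MIN_VERSION = (0, 0, 10)

def is_supported(version_string):
    return tuple(int(i) for i in version_string.split('.')) >= MIN_VERSION

assert not is_supported('0.0.9')
assert is_supported('0.1.0')
```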
@@ -22,6 +22,8 @@ import json
 import os
 import random
 import string
+
+from absl import logging
 import tensorflow.compat.v2 as tf

 from official.utils.misc import tpu_lib
@@ -252,7 +254,7 @@ class SyntheticIterator(object):

 def _monkey_patch_dataset_method(strategy):
   """Monkey-patch `strategy`'s `make_dataset_iterator` method."""
   def make_dataset(self, dataset):
-    tf.compat.v1.logging.info('Using pure synthetic data.')
+    logging.info('Using pure synthetic data.')
     with self.scope():
       if self.extended._global_batch_size:  # pylint: disable=protected-access
         return SyntheticDataset(dataset, self.num_replicas_in_sync)
...
@@ -20,8 +20,11 @@ from __future__ import print_function

 import numbers

+from absl import logging
 import tensorflow as tf
 from tensorflow.python.util import nest

+# pylint:disable=logging-format-interpolation
+

 def past_stop_threshold(stop_threshold, eval_metric):
@@ -48,8 +51,7 @@ def past_stop_threshold(stop_threshold, eval_metric):
                      "must be a number.")

   if eval_metric >= stop_threshold:
-    tf.compat.v1.logging.info(
-        "Stop threshold of {} was passed with metric value {}.".format(
-            stop_threshold, eval_metric))
+    logging.info("Stop threshold of {} was passed with metric value {}.".format(
+        stop_threshold, eval_metric))
     return True
@@ -88,6 +90,6 @@ def generate_synthetic_data(

 def apply_clean(flags_obj):
   if flags_obj.clean and tf.io.gfile.exists(flags_obj.model_dir):
-    tf.compat.v1.logging.info("--clean flag set. Removing existing model dir:"
-                              " {}".format(flags_obj.model_dir))
+    logging.info("--clean flag set. Removing existing model dir:"
+                 " {}".format(flags_obj.model_dir))
     tf.io.gfile.rmtree(flags_obj.model_dir)
@@ -20,8 +20,9 @@ from __future__ import print_function

 import os

 from absl import flags
+from absl import logging
 from absl.testing import flagsaver
-import tensorflow as tf  # pylint: disable=g-bad-import-order
+import tensorflow as tf

 FLAGS = flags.FLAGS
@@ -75,7 +76,7 @@ class PerfZeroBenchmark(tf.test.Benchmark):

   def _setup(self):
     """Sets up and resets flags before each test."""
-    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+    logging.set_verbosity(logging.INFO)
     if PerfZeroBenchmark.local_flags is None:
       for flag_method in self.flag_methods:
         flag_method()
...
@@ -87,10 +87,6 @@ BASE_CFG = {
     },
     'resnet': {
         'resnet_depth': 50,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
         'batch_norm': {
             'batch_norm_momentum': 0.997,
             'batch_norm_epsilon': 1e-4,
@@ -111,43 +107,6 @@ BASE_CFG = {
             'use_sync_bn': False,
         },
     },
-    'nasfpn': {
-        'min_level': 3,
-        'max_level': 7,
-        'fpn_feat_dims': 256,
-        'num_repeats': 5,
-        'use_separable_conv': False,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-            'use_sync_bn': False,
-        },
-    },
-    # tunable_nasfpn:strip_begin
-    'tunable_nasfpn_v1': {
-        'min_level': 3,
-        'max_level': 7,
-        'fpn_feat_dims': 256,
-        'num_repeats': 5,
-        'use_separable_conv': False,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-            'use_sync_bn': False,
-        },
-        'nodes': None
-    },
-    # tunable_nasfpn:strip_end
     'postprocess': {
         'use_batched_nms': False,
         'max_total_size': 100,
...
@@ -106,10 +106,6 @@ RETINANET_CFG = {
     },
     'resnet': {
         'resnet_depth': 50,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
         'batch_norm': {
             'batch_norm_momentum': 0.997,
             'batch_norm_epsilon': 1e-4,
@@ -128,22 +124,6 @@ RETINANET_CFG = {
             'batch_norm_trainable': True,
         },
     },
-    'nasfpn': {
-        'min_level': 3,
-        'max_level': 7,
-        'fpn_feat_dims': 256,
-        'num_repeats': 5,
-        'use_separable_conv': False,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-        },
-    },
     'retinanet_head': {
         'min_level': 3,
         'max_level': 7,
...
@@ -52,7 +52,7 @@ flags.DEFINE_string(
 flags.DEFINE_string(
     'model', default='retinanet',
-    help='Model to run: `retinanet` or `shapemask`.')
+    help='Model to run: `retinanet` or `mask_rcnn`.')

 flags.DEFINE_string('training_file_pattern', None,
                     'Location of the train data.')
...
@@ -37,19 +37,12 @@ def batch_norm_relu_generator(params):
   return _batch_norm_op


-def dropblock_generator(params):
-  return nn_ops.Dropblock(
-      dropblock_keep_prob=params.dropblock_keep_prob,
-      dropblock_size=params.dropblock_size)
-
-
 def backbone_generator(params):
   """Generator function for various backbone models."""
   if params.architecture.backbone == 'resnet':
     resnet_params = params.resnet
     backbone_fn = resnet.Resnet(
         resnet_depth=resnet_params.resnet_depth,
-        dropblock=dropblock_generator(resnet_params.dropblock),
         batch_norm_relu=batch_norm_relu_generator(resnet_params.batch_norm))
   else:
     raise ValueError('Backbone model %s is not supported.' %
...
@@ -84,88 +84,3 @@ class BatchNormRelu(tf.keras.layers.Layer):
       inputs = tf.nn.relu(inputs)
     return inputs
-
-
-class Dropblock(object):
-  """DropBlock: a regularization method for convolutional neural networks.
-
-  DropBlock is a form of structured dropout, where units in a contiguous
-  region of a feature map are dropped together. DropBlock works better than
-  dropout on convolutional layers due to the fact that activation units in
-  convolutional layers are spatially correlated.
-  See https://arxiv.org/pdf/1810.12890.pdf for details.
-  """
-
-  def __init__(self,
-               dropblock_keep_prob=None,
-               dropblock_size=None,
-               data_format='channels_last'):
-    self._dropblock_keep_prob = dropblock_keep_prob
-    self._dropblock_size = dropblock_size
-    self._data_format = data_format
-
-  def __call__(self, net, is_training=False):
-    """Builds Dropblock layer.
-
-    Args:
-      net: `Tensor` input tensor.
-      is_training: `bool` if True, the model is in training mode.
-
-    Returns:
-      A version of input tensor with DropBlock applied.
-    """
-    if not is_training or self._dropblock_keep_prob is None:
-      return net
-
-    logging.info('Applying DropBlock: dropblock_size {}, net.shape {}'.format(
-        self._dropblock_size, net.shape))
-
-    if self._data_format == 'channels_last':
-      _, height, width, _ = net.get_shape().as_list()
-    else:
-      _, _, height, width = net.get_shape().as_list()
-
-    total_size = width * height
-    dropblock_size = min(self._dropblock_size, min(width, height))
-    # Seed_drop_rate is the gamma parameter of DropBlock.
-    seed_drop_rate = (
-        1.0 - self._dropblock_keep_prob) * total_size / dropblock_size**2 / (
-            (width - self._dropblock_size + 1) *
-            (height - self._dropblock_size + 1))
-
-    # Forces the block to be inside the feature map.
-    w_i, h_i = tf.meshgrid(tf.range(width), tf.range(height))
-    valid_block = tf.logical_and(
-        tf.logical_and(w_i >= int(dropblock_size // 2),
-                       w_i < width - (dropblock_size - 1) // 2),
-        tf.logical_and(h_i >= int(dropblock_size // 2),
-                       h_i < width - (dropblock_size - 1) // 2))
-
-    if self._data_format == 'channels_last':
-      valid_block = tf.reshape(valid_block, [1, height, width, 1])
-    else:
-      valid_block = tf.reshape(valid_block, [1, 1, height, width])
-
-    randnoise = tf.random.uniform(net.shape, dtype=tf.float32)
-    valid_block = tf.cast(valid_block, dtype=tf.float32)
-    seed_keep_rate = tf.cast(1 - seed_drop_rate, dtype=tf.float32)
-    block_pattern = (1 - valid_block + seed_keep_rate + randnoise) >= 1
-    block_pattern = tf.cast(block_pattern, dtype=tf.float32)
-
-    if self._data_format == 'channels_last':
-      ksize = [1, self._dropblock_size, self._dropblock_size, 1]
-    else:
-      ksize = [1, 1, self._dropblock_size, self._dropblock_size]
-    block_pattern = -tf.nn.max_pool2d(
-        -block_pattern,
-        ksize=ksize,
-        strides=[1, 1, 1, 1],
-        padding='SAME',
-        data_format='NHWC' if self._data_format == 'channels_last' else 'NCHW')
-
-    percent_ones = tf.cast(
-        tf.reduce_sum(input_tensor=block_pattern), tf.float32) / tf.cast(
-            tf.size(input=block_pattern), tf.float32)
-
-    net = net / tf.cast(percent_ones, net.dtype) * tf.cast(
-        block_pattern, net.dtype)
-    return net
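For reference, the `seed_drop_rate` computed in the deleted class is the gamma parameter from the DropBlock paper (https://arxiv.org/pdf/1810.12890.pdf): the per-seed drop probability chosen so that, once each seed grows into a `block_size` x `block_size` square, roughly `1 - keep_prob` of the feature map is dropped. A standalone sketch of that arithmetic with illustrative numbers, not part of the commit:

```python
# gamma = (1 - keep_prob) * H * W / block_size**2
#         / ((W - block_size + 1) * (H - block_size + 1))
def dropblock_gamma(keep_prob, block_size, height, width):
    total_size = height * width
    # Number of positions where a full block fits inside the feature map.
    valid_seed_positions = (width - block_size + 1) * (height - block_size + 1)
    return (1.0 - keep_prob) * total_size / block_size**2 / valid_seed_positions

# e.g. a 28x28 feature map with 7x7 blocks and keep_prob=0.9:
print(dropblock_gamma(keep_prob=0.9, block_size=7, height=28, width=28))
```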
@@ -34,14 +34,12 @@ class Resnet(object):

   def __init__(self,
                resnet_depth,
-               dropblock=nn_ops.Dropblock(),
                batch_norm_relu=nn_ops.BatchNormRelu,
                data_format='channels_last'):
     """ResNet initialization function.

     Args:
       resnet_depth: `int` depth of ResNet backbone model.
-      dropblock: a dropblock layer.
       batch_norm_relu: an operation that includes a batch normalization layer
         followed by a relu layer(optional).
       data_format: `str` either "channels_first" for `[batch, channels, height,
@@ -49,7 +47,6 @@ class Resnet(object):
     """
     self._resnet_depth = resnet_depth
-    self._dropblock = dropblock
     self._batch_norm_relu = batch_norm_relu
     self._data_format = data_format
@@ -219,24 +216,20 @@ class Resnet(object):
           inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
       shortcut = self._batch_norm_relu(relu=False)(
           shortcut, is_training=is_training)
-      shortcut = self._dropblock(shortcut, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=1, strides=1)
     inputs = self._batch_norm_relu()(inputs, is_training=is_training)
-    inputs = self._dropblock(inputs, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=3, strides=strides)
     inputs = self._batch_norm_relu()(inputs, is_training=is_training)
-    inputs = self._dropblock(inputs, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
     inputs = self._batch_norm_relu(
         relu=False, init_zero=True)(
             inputs, is_training=is_training)
-    inputs = self._dropblock(inputs, is_training=is_training)

     return tf.nn.relu(inputs + shortcut)
...
 # Image Classification

-This folder contains the TF 2.0 model examples for image classification:
+This folder contains TF 2.0 model examples for image classification:

-* [ResNet](#resnet)
 * [MNIST](#mnist)
+* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras
+  compile/fit methods for image classification models, including:
+  * ResNet
+  * EfficientNet[^1]
+
+[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet).

 For more information about other types of models, please refer to this
 [README file](../../README.md).

-## ResNet
-
-Similar to the [estimator implementation](../../r1/resnet), the Keras
-implementation has code for the ImageNet dataset. The ImageNet
-version uses a ResNet50 model implemented in
-[`resnet_model.py`](./resnet/resnet_model.py).
+## Before you begin

 Please make sure that you have the latest version of TensorFlow
 installed and
 [add the models folder to your Python path](/official/#running-the-models).

-### Pretrained Models
-
-* [ResNet50 Checkpoints](https://storage.googleapis.com/cloud-tpu-checkpoints/resnet/resnet50.tar.gz)
-* ResNet50 TFHub: [feature vector](https://tfhub.dev/tensorflow/resnet_50/feature_vector/1)
-  and [classification](https://tfhub.dev/tensorflow/resnet_50/classification/1)
-
-### ImageNet Training
+### ImageNet preparation

 Download the ImageNet dataset and convert it to TFRecord format.
 The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
 and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
 provide a few options.

-Once your dataset is ready, you can begin training the model as follows:
-
-```bash
-python resnet/resnet_imagenet_main.py
-```
-
-Again, if you did not download the data to the default directory, specify the
-location with the `--data_dir` flag:
-
-```bash
-python resnet/resnet_imagenet_main.py --data_dir=/path/to/imagenet
-```
-
-There are more flag options you can specify. Here are some examples:
-
-- `--use_synthetic_data`: when set to true, synthetic data, rather than real
-  data, are used;
-- `--batch_size`: the batch size used for the model;
-- `--model_dir`: the directory to save the model checkpoint;
-- `--train_epochs`: number of epochs to run for training the model;
-- `--train_steps`: number of steps to run for training the model. We now only
-  support a number that is smaller than the number of batches in an epoch.
-- `--skip_eval`: when set to true, evaluation as well as validation during
-  training is skipped
-
-For example, this is a typical command line to run with ImageNet data with
-batch size 128 per GPU:
-
-```bash
-python -m resnet/resnet_imagenet_main.py \
-    --model_dir=/tmp/model_dir/something \
-    --num_gpus=2 \
-    --batch_size=128 \
-    --train_epochs=90 \
-    --train_steps=10 \
-    --use_synthetic_data=false
-```
-
-See [`common.py`](common.py) for full list of options.
-
-### Using multiple GPUs
-
-You can train these models on multiple GPUs using `tf.distribute.Strategy` API.
-You can read more about them in this
-[guide](https://www.tensorflow.org/guide/distribute_strategy).
-
-In this example, we have made it easier to use is with just a command line flag
-`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA,
-and 0 otherwise.
-
-- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device.
-- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device.
-- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous
-  distributed training across the GPUs.
-
-If you wish to run without `tf.distribute.Strategy`, you can do so by setting
-`--distribution_strategy=off`.
-
-### Running on multiple GPU hosts
-
-You can also train these models on multiple hosts, each with GPUs, using
-`tf.distribute.Strategy`.
-
-The easiest way to run multi-host benchmarks is to set the
-[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
-appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on
-2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
-host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
-"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
-available GPUs at each host.
-
 ### Running on Cloud TPUs

-Note: This model will **not** work with TPUs on Colab.
+Note: These models will **not** work with TPUs on Colab.

-You can train the ResNet CTL model on Cloud TPUs using
+You can train image classification models on Cloud TPUs using
 `tf.distribute.TPUStrategy`. If you are not familiar with Cloud TPUs, it is
 strongly recommended that you go through the
 [quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
 create a TPU and GCE VM.

-To run ResNet model on a TPU, you must set `--distribution_strategy=tpu` and
-`--tpu=$TPU_NAME`, where `$TPU_NAME` the name of your TPU in the Cloud Console.
-From a GCE VM, you can run the following command to train ResNet for one epoch
-on a v2-8 or v3-8 TPU:
-
-```bash
-python resnet/resnet_ctl_imagenet_main.py \
-    --tpu=$TPU_NAME \
-    --model_dir=$MODEL_DIR \
-    --data_dir=$DATA_DIR \
-    --batch_size=1024 \
-    --steps_per_loop=500 \
-    --train_epochs=1 \
-    --use_synthetic_data=false \
-    --dtype=fp32 \
-    --enable_eager=true \
-    --enable_tensorboard=true \
-    --distribution_strategy=tpu \
-    --log_steps=50 \
-    --single_l2_loss_op=true \
-    --use_tf_function=true
-```
-
-To train the ResNet to convergence, run it for 90 epochs:
-
-```bash
-python resnet/resnet_ctl_imagenet_main.py \
-    --tpu=$TPU_NAME \
-    --model_dir=$MODEL_DIR \
-    --data_dir=$DATA_DIR \
-    --batch_size=1024 \
-    --steps_per_loop=500 \
-    --train_epochs=90 \
-    --use_synthetic_data=false \
-    --dtype=fp32 \
-    --enable_eager=true \
-    --enable_tensorboard=true \
-    --distribution_strategy=tpu \
-    --log_steps=50 \
-    --single_l2_loss_op=true \
-    --use_tf_function=true
-```
-
-Note: `$MODEL_DIR` and `$DATA_DIR` must be GCS paths.
-
 ## MNIST

 To download the data and run the MNIST sample model locally for the first time,
-run one of the following command:
+run the following command:

 ```bash
-python mnist_main.py \
+python3 mnist_main.py \
     --model_dir=$MODEL_DIR \
     --data_dir=$DATA_DIR \
     --train_epochs=10 \
     --distribution_strategy=one_device \
     --num_gpus=$NUM_GPUS \
     --download
 ```

 To train the model on a Cloud TPU, run the following command:

 ```bash
-python mnist_main.py \
+python3 mnist_main.py \
     --tpu=$TPU_NAME \
     --model_dir=$MODEL_DIR \
     --data_dir=$DATA_DIR \
     --train_epochs=10 \
     --distribution_strategy=tpu \
     --download
 ```

 Note: the `--download` flag is only required the first time you run the model.

+## Classifier Trainer
+
+The classifier trainer is a unified framework for running image classification
+models using Keras's compile/fit methods. Experiments should be provided in the
+form of YAML files; some examples are included within the configs/examples
+folder. Please see [configs/examples](./configs/examples) for more example
+configurations.
+
+The provided configuration files use a per-replica batch size that is scaled
+by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
+the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
+would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
+be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
+
+### ResNet50
+
+#### On GPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=resnet \
+    --dataset=imagenet \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/resnet/imagenet/gpu.yaml \
+    --params_override='runtime.num_gpus=$NUM_GPUS'
+```
+
+#### On TPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=resnet \
+    --dataset=imagenet \
+    --tpu=$TPU_NAME \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/resnet/imagenet/tpu.yaml
+```
+
+### EfficientNet
+
+**Note: EfficientNet development is a work in progress.**
+
+#### On GPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=efficientnet \
+    --dataset=imagenet \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
+    --params_override='runtime.num_gpus=$NUM_GPUS'
+```
+
+#### On TPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=efficientnet \
+    --dataset=imagenet \
+    --tpu=$TPU_NAME \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
+```
+
+Note that the number of GPU devices can be overridden in the command line using
+`--params_override`. The TPU does not need this override as the device is fixed
+by providing the TPU address or name with the `--tpu` flag.
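As a usage note on `--params_override` above: overrides are layered on top of the YAML given by `--config_file` (see `_get_params_from_flags` in `classifier_trainer.py` below, which applies the config file, then the override string, then flag-derived values, in that order, with later sources winning). A toy sketch of that layering semantics, with a hypothetical `deep_update` helper standing in for `params_dict.override_params_dict`:

```python
# Hypothetical illustration of layered config overrides: later sources win,
# and nested dicts are merged key by key rather than replaced wholesale.
def deep_update(base, override):
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

params = {'runtime': {'num_gpus': 1, 'tpu': None}}           # base config
params = deep_update(params, {'runtime': {'num_gpus': 8}})   # --params_override
assert params['runtime'] == {'num_gpus': 8, 'tpu': None}
```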
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for autoaugment."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl.testing import parameterized
import tensorflow.compat.v2 as tf
from official.vision.image_classification import augment
def get_dtype_test_cases():
return [
('uint8', tf.uint8),
('int32', tf.int32),
('float16', tf.float16),
('float32', tf.float32),
]
@parameterized.named_parameters(get_dtype_test_cases())
class TransformsTest(parameterized.TestCase, tf.test.TestCase):
"""Basic tests for fundamental transformations."""
def test_to_from_4d(self, dtype):
for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]:
original_ndims = len(shape)
image = tf.zeros(shape, dtype=dtype)
image_4d = augment.to_4d(image)
self.assertEqual(4, tf.rank(image_4d))
self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims))
def test_transform(self, dtype):
image = tf.constant([[1, 2], [3, 4]], dtype=dtype)
self.assertAllEqual(augment.transform(image, transforms=[1]*8),
[[4, 4], [4, 4]])
def disable_test_translate(self, dtype):
image = tf.constant(
[[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]],
dtype=dtype)
translations = [-1, -1]
translated = augment.translate(image=image,
translations=translations)
expected = [[1, 0, 1, 0], [0, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 0]]
self.assertAllEqual(translated, expected)
def test_translate_shapes(self, dtype):
translation = [0, 0]
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.translate(image, translation))
def test_translate_invalid_translation(self, dtype):
image = tf.zeros((1, 1), dtype=dtype)
invalid_translation = [[[1, 1]]]
with self.assertRaisesRegex(TypeError, 'rank 1 or 2'):
_ = augment.translate(image, invalid_translation)
def test_rotate(self, dtype):
image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3))
rotation = 90.
transformed = augment.rotate(image=image, degrees=rotation)
expected = [[2, 5, 8],
[1, 4, 7],
[0, 3, 6]]
self.assertAllEqual(transformed, expected)
def test_rotate_shapes(self, dtype):
degrees = 0.
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.rotate(image, degrees))
class AutoaugmentTest(tf.test.TestCase):
def test_autoaugment(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.AutoAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_randaug(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.RandAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_all_policy_ops(self):
"""Smoke test to be sure all augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((224, 224, 3), dtype=tf.uint8)
for op_name in augment.NAME_TO_FUNC:
func, _, args = augment._parse_policy_info(op_name,
prob,
magnitude,
replace_value,
cutout_const,
translate_const)
image = func(image, *args)
self.assertEqual((224, 224, 3), image.shape)
if __name__ == '__main__':
assert tf.version.VERSION.startswith('2.')
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common modules for callbacks."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
from typing import Any, List, MutableMapping, Text
def get_callbacks(model_checkpoint: bool = True,
include_tensorboard: bool = True,
track_lr: bool = True,
write_model_weights: bool = True,
initial_step: int = 0,
model_dir: Text = None) -> List[tf.keras.callbacks.Callback]:
"""Get all callbacks."""
model_dir = model_dir or ''
callbacks = []
if model_checkpoint:
ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True, verbose=1))
if include_tensorboard:
callbacks.append(CustomTensorBoard(
log_dir=model_dir,
track_lr=track_lr,
initial_step=initial_step,
write_images=write_model_weights))
return callbacks
def get_scalar_from_tensor(t: tf.Tensor) -> int:
"""Utility function to convert a Tensor to a scalar."""
t = tf.keras.backend.get_value(t)
if callable(t):
return t()
else:
return t
class CustomTensorBoard(tf.keras.callbacks.TensorBoard):
"""A customized TensorBoard callback that tracks additional datapoints.
Metrics tracked:
- Global learning rate
Attributes:
log_dir: the path of the directory where to save the log files to be
parsed by TensorBoard.
track_lr: `bool`, whether or not to track the global learning rate.
initial_step: the initial step, used for preemption recovery.
**kwargs: Additional arguments for backwards compatibility. Possible key
is `period`.
"""
# TODO(b/146499062): track params, flops, log lr, l2 loss,
# classification loss
def __init__(self,
log_dir: Text,
track_lr: bool = False,
initial_step: int = 0,
**kwargs):
super(CustomTensorBoard, self).__init__(log_dir=log_dir, **kwargs)
self.step = initial_step
self._track_lr = track_lr
def on_batch_begin(self,
epoch: int,
logs: MutableMapping[Text, Any] = None) -> None:
self.step += 1
if logs is None:
logs = {}
logs.update(self._calculate_metrics())
super(CustomTensorBoard, self).on_batch_begin(epoch, logs)
def on_epoch_begin(self,
epoch: int,
logs: MutableMapping[Text, Any] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
for k, v in metrics.items():
logging.info('Current %s: %f', k, v)
super(CustomTensorBoard, self).on_epoch_begin(epoch, logs)
def on_epoch_end(self,
epoch: int,
logs: MutableMapping[Text, Any] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
super(CustomTensorBoard, self).on_epoch_end(epoch, logs)
def _calculate_metrics(self) -> MutableMapping[Text, Any]:
logs = {}
if self._track_lr:
logs['learning_rate'] = self._calculate_lr()
return logs
def _calculate_lr(self) -> int:
"""Calculates the learning rate given the current step."""
lr = self._get_base_optimizer().lr
if callable(lr):
lr = lr(self.step)
return get_scalar_from_tensor(lr)
def _get_base_optimizer(self) -> tf.keras.optimizers.Optimizer:
"""Get the base optimizer used by the current model."""
optimizer = self.model.optimizer
# The optimizer might be wrapped by another class, so unwrap it
while hasattr(optimizer, '_optimizer'):
optimizer = optimizer._optimizer # pylint:disable=protected-access
return optimizer
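A minimal usage sketch for the callbacks module above, wiring `get_callbacks` into a Keras fit loop; the toy model and data below are placeholders, not part of this commit:

```python
import tensorflow as tf
from official.vision.image_classification import callbacks  # the module above

# Tiny stand-in model; any compiled Keras model works here.
model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(4,))])
model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy')

cbs = callbacks.get_callbacks(
    model_checkpoint=True,
    include_tensorboard=True,
    track_lr=True,              # CustomTensorBoard logs the learning rate
    write_model_weights=False,
    model_dir='/tmp/model_dir')

x = tf.zeros([8, 4])
y = tf.zeros([8], dtype=tf.int32)
model.fit(x, y, epochs=1, callbacks=cbs)
```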
# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs an Image Classification model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pprint
from typing import Any, Tuple, Text, Optional, Mapping
from absl import app
from absl import flags
from absl import logging
import tensorflow.compat.v2 as tf
from official.modeling import performance
from official.modeling.hyperparams import params_dict
from official.utils import hyperparams_flags
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
from official.vision.image_classification import callbacks as custom_callbacks
from official.vision.image_classification import dataset_factory
from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs
from official.vision.image_classification.configs import configs
from official.vision.image_classification.efficientnet import efficientnet_model
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import resnet_model
MODELS = {
'efficientnet': efficientnet_model.EfficientNet.from_name,
'resnet': resnet_model.resnet50,
}
def _get_metrics(one_hot: bool) -> Mapping[Text, Any]:
"""Get a dict of available metrics to track."""
if one_hot:
return {
# (name, metric_fn)
'acc': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'accuracy': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_1': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_5': tf.keras.metrics.TopKCategoricalAccuracy(
k=5,
name='top_5_accuracy'),
}
else:
return {
# (name, metric_fn)
'acc': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'accuracy': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_1': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_5': tf.keras.metrics.SparseTopKCategoricalAccuracy(
k=5,
name='top_5_accuracy'),
}
def get_image_size_from_model(
params: base_configs.ExperimentConfig) -> Optional[int]:
"""If the given model has a preferred image size, return it."""
if params.model_name == 'efficientnet':
efficientnet_name = params.model.model_params.model_name
if efficientnet_name in efficientnet_model.MODEL_CONFIGS:
return efficientnet_model.MODEL_CONFIGS[efficientnet_name].resolution
return None
def _get_dataset_builders(params: base_configs.ExperimentConfig,
strategy: tf.distribute.Strategy,
one_hot: bool
) -> Tuple[Any, Any, Any]:
"""Create and return train, validation, and test dataset builders."""
if one_hot:
logging.warning('label_smoothing > 0, so datasets will be one hot encoded.')
else:
logging.warning('label_smoothing not applied, so datasets will not be one '
'hot encoded.')
num_devices = strategy.num_replicas_in_sync
image_size = get_image_size_from_model(params)
dataset_configs = [
params.train_dataset, params.validation_dataset, params.test_dataset
]
builders = []
for config in dataset_configs:
if config is not None and config.has_data:
builder = dataset_factory.DatasetBuilder(
config,
image_size=image_size or config.image_size,
num_devices=num_devices,
one_hot=one_hot)
else:
builder = None
builders.append(builder)
return builders
def get_loss_scale(params: base_configs.ExperimentConfig,
fp16_default: float = 128.) -> float:
"""Returns the loss scale for initializations."""
loss_scale = params.model.loss.loss_scale
if loss_scale == 'dynamic':
return loss_scale
elif loss_scale is not None:
return float(loss_scale)
elif params.train_dataset.dtype == 'float32':
return 1.
else:
assert params.train_dataset.dtype == 'float16'
return fp16_default
def _get_params_from_flags(flags_obj: flags.FlagValues):
"""Get ParamsDict from flags."""
model = flags_obj.model_type.lower()
dataset = flags_obj.dataset.lower()
params = configs.get_config(model=model, dataset=dataset)
flags_overrides = {
'model_dir': flags_obj.model_dir,
'mode': flags_obj.mode,
'model': {
'name': model,
},
'runtime': {
'enable_eager': flags_obj.enable_eager,
'tpu': flags_obj.tpu,
},
'train_dataset': {
'data_dir': flags_obj.data_dir,
},
'validation_dataset': {
'data_dir': flags_obj.data_dir,
},
'test_dataset': {
'data_dir': flags_obj.data_dir,
},
}
overriding_configs = (flags_obj.config_file,
flags_obj.params_override,
flags_overrides)
pp = pprint.PrettyPrinter()
logging.info('Base params: %s', pp.pformat(params.as_dict()))
for param in overriding_configs:
logging.info('Overriding params: %s', param)
# Set is_strict to false because we can have dynamic dict parameters.
params = params_dict.override_params_dict(params, param, is_strict=False)
params.validate()
params.lock()
logging.info('Final model parameters: %s', pp.pformat(params.as_dict()))
return params
def resume_from_checkpoint(model: tf.keras.Model,
model_dir: str,
train_steps: int) -> int:
"""Resumes from the latest checkpoint, if possible.
Loads the model weights and optimizer settings from a checkpoint.
This function should be used in case of preemption recovery.
Args:
model: The model whose weights should be restored.
model_dir: The directory where model weights were saved.
train_steps: The number of steps to train.
Returns:
The epoch of the latest checkpoint, or 0 if not restoring.
"""
logging.info('Load from checkpoint is enabled.')
latest_checkpoint = tf.train.latest_checkpoint(model_dir)
logging.info('latest_checkpoint: %s', latest_checkpoint)
if not latest_checkpoint:
logging.info('No checkpoint detected.')
return 0
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint)
model.load_weights(latest_checkpoint)
initial_epoch = model.optimizer.iterations // train_steps
logging.info('Completed loading from checkpoint.')
logging.info('Resuming from epoch %d', initial_epoch)
return int(initial_epoch)
def initialize(params: base_configs.ExperimentConfig):
"""Initializes backend related initializations."""
keras_utils.set_session_config(
enable_eager=params.runtime.enable_eager,
enable_xla=params.runtime.enable_xla)
if params.runtime.gpu_threads_enabled:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=params.runtime.per_gpu_thread_count,
gpu_thread_mode=params.runtime.gpu_thread_mode,
num_gpus=params.runtime.num_gpus,
datasets_num_private_threads=params.runtime.dataset_num_private_threads)
dataset = params.train_dataset or params.validation_dataset
performance.set_mixed_precision_policy(dataset.dtype)
if dataset.data_format:
data_format = dataset.data_format
elif tf.config.list_physical_devices('GPU'):
data_format = 'channels_first'
else:
data_format = 'channels_last'
tf.keras.backend.set_image_data_format(data_format)
distribution_utils.configure_cluster(
params.runtime.worker_hosts,
params.runtime.task_index)
if params.runtime.enable_eager:
# Enable eager execution to allow step-by-step debugging
tf.config.experimental_run_functions_eagerly(True)
def define_classifier_flags():
"""Defines common flags for image classification."""
hyperparams_flags.initialize_common_flags()
flags.DEFINE_string(
'data_dir',
default=None,
help='The location of the input data.')
flags.DEFINE_string(
'mode',
default=None,
help='Mode to run: `train`, `eval`, `train_and_eval` or `export`.')
flags.DEFINE_bool(
'enable_eager',
default=None,
help='Use eager execution and disable autograph for debugging.')
flags.DEFINE_string(
'model_type',
default=None,
help='The type of the model, e.g. EfficientNet, etc.')
flags.DEFINE_string(
'dataset',
default=None,
help='The name of the dataset, e.g. ImageNet, etc.')
def serialize_config(params: base_configs.ExperimentConfig,
model_dir: str):
"""Serializes and saves the experiment config."""
params_save_path = os.path.join(model_dir, 'params.yaml')
logging.info('Saving experiment configuration to %s', params_save_path)
tf.io.gfile.makedirs(model_dir)
params_dict.save_params_dict_to_yaml(params, params_save_path)
def train_and_eval(
params: base_configs.ExperimentConfig,
strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
"""Runs the train and eval path using compile/fit."""
logging.info('Running train and eval.')
# Note: for TPUs, strategy and scope should be created before the dataset
strategy = strategy_override or distribution_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
strategy_scope = distribution_utils.get_strategy_scope(strategy)
logging.info('Detected %d devices.', strategy.num_replicas_in_sync)
label_smoothing = params.model.loss.label_smoothing
one_hot = label_smoothing and label_smoothing > 0
builders = _get_dataset_builders(params, strategy, one_hot)
datasets = [builder.build() if builder else None for builder in builders]
# Unpack datasets and builders based on train/val/test splits
train_builder, validation_builder, test_builder = builders # pylint: disable=unbalanced-tuple-unpacking
train_dataset, validation_dataset, test_dataset = datasets
train_epochs = params.train.epochs
train_steps = params.train.steps or train_builder.num_steps
validation_steps = params.evaluation.steps or validation_builder.num_steps
logging.info('Global batch size: %d', train_builder.global_batch_size)
with strategy_scope:
model_params = params.model.model_params.as_dict()
model = MODELS[params.model.name](**model_params)
learning_rate = optimizer_factory.build_learning_rate(
params=params.model.learning_rate,
batch_size=train_builder.global_batch_size,
train_steps=train_steps)
optimizer = optimizer_factory.build_optimizer(
optimizer_name=params.model.optimizer.name,
base_learning_rate=learning_rate,
params=params.model.optimizer.as_dict())
metrics_map = _get_metrics(one_hot)
metrics = [metrics_map[metric] for metric in params.train.metrics]
if one_hot:
loss_obj = tf.keras.losses.CategoricalCrossentropy(
label_smoothing=params.model.loss.label_smoothing)
else:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer,
loss=loss_obj,
metrics=metrics,
run_eagerly=params.runtime.enable_eager)
initial_epoch = 0
if params.train.resume_checkpoint:
initial_epoch = resume_from_checkpoint(model=model,
model_dir=params.model_dir,
train_steps=train_steps)
serialize_config(params=params, model_dir=params.model_dir)
# TODO(dankondratyuk): callbacks significantly slow down training
callbacks = custom_callbacks.get_callbacks(
model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
include_tensorboard=params.train.callbacks.enable_tensorboard,
track_lr=params.train.tensorboard.track_lr,
write_model_weights=params.train.tensorboard.write_model_weights,
initial_step=initial_epoch * train_steps,
model_dir=params.model_dir)
history = model.fit(
train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
validation_data=validation_dataset,
validation_steps=validation_steps,
validation_freq=params.evaluation.epochs_between_evals)
validation_output = model.evaluate(
validation_dataset, steps=validation_steps, verbose=2)
# TODO(dankondratyuk): eval and save final test accuracy
stats = common.build_stats(history,
validation_output,
callbacks)
return stats
def export(params: base_configs.ExperimentConfig):
"""Runs the model export functionality."""
logging.info('Exporting model.')
model_params = params.model.model_params.as_dict()
model = MODELS[params.model.name](**model_params)
checkpoint = params.export.checkpoint
if checkpoint is None:
logging.info('No export checkpoint was provided. Using the latest '
'checkpoint from model_dir.')
checkpoint = tf.train.latest_checkpoint(params.model_dir)
model.load_weights(checkpoint)
model.save(params.export.destination)
def run(flags_obj: flags.FlagValues,
strategy_override: tf.distribute.Strategy = None) -> Mapping[str, Any]:
"""Runs Image Classification model using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
strategy_override: A `tf.distribute.Strategy` object to use for model.
Returns:
Dictionary of training/eval stats
"""
params = _get_params_from_flags(flags_obj)
initialize(params)
if params.mode == 'train_and_eval':
return train_and_eval(params, strategy_override)
elif params.mode == 'export_only':
export(params)
else:
raise ValueError('{} is not a valid mode.'.format(params.mode))
def main(_):
with logger.benchmark_context(flags.FLAGS):
stats = run(flags.FLAGS)
if stats:
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_classifier_flags()
flags.mark_flag_as_required('data_dir')
flags.mark_flag_as_required('mode')
flags.mark_flag_as_required('model_type')
flags.mark_flag_as_required('dataset')
assert tf.version.VERSION.startswith('2.')
app.run(main)
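One detail worth calling out from `resume_from_checkpoint` above: the optimizer's `iterations` counter is global across the whole run, so integer division by steps-per-epoch recovers the epoch to resume from. A standalone sketch of that arithmetic:

```python
# optimizer.iterations counts every completed train step, so dividing by the
# number of steps per epoch yields the number of fully completed epochs.
def resume_epoch(optimizer_iterations, train_steps_per_epoch):
    return int(optimizer_iterations // train_steps_per_epoch)

assert resume_epoch(0, 100) == 0    # fresh run: start at epoch 0
assert resume_epoch(250, 100) == 2  # checkpoint mid-epoch: redo epoch 2
```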
# Lint as: python3
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Unit tests for the classifier trainer models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import functools
import json
import os
import sys

from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional, Tuple

from absl import flags
from absl.testing import parameterized

import tensorflow.compat.v2 as tf

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations

from official.utils.flags import core as flags_core
from official.vision.image_classification import classifier_trainer
from official.vision.image_classification import dataset_factory
from official.vision.image_classification import test_utils
from official.vision.image_classification.configs import base_configs

classifier_trainer.define_classifier_flags()

def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
"""Returns the combinations of end-to-end tests to run."""
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
model=[
'efficientnet',
'resnet',
],
mode='eager',
dataset=[
'imagenet',
],
)
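

# `combinations.combine` expands its keyword options into a cross-product, so
# the block above yields 3 strategies x 2 models x 1 dataset = 6 eager-mode
# test parameterizations.
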
def get_params_override(params_override: Mapping[str, Any]) -> str:
"""Converts params_override dict to string command."""
return '--params_override=' + json.dumps(params_override)
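

# For example (illustrative only):
#
#   get_params_override({'train': {'epochs': 1}})
#   # -> '--params_override={"train": {"epochs": 1}}'
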
def basic_params_override() -> MutableMapping[str, Any]:
"""Returns a basic parameter configuration for testing."""
return {
'train_dataset': {
'builder': 'synthetic',
'use_per_replica_batch_size': True,
'batch_size': 1,
'image_size': 224,
},
'validation_dataset': {
'builder': 'synthetic',
'batch_size': 1,
'use_per_replica_batch_size': True,
'image_size': 224,
},
'test_dataset': {
'builder': 'synthetic',
'batch_size': 1,
'use_per_replica_batch_size': True,
'image_size': 224,
},
'train': {
'steps': 1,
'epochs': 1,
'callbacks': {
'enable_checkpoint_and_export': True,
'enable_tensorboard': False,
},
},
'evaluation': {
'steps': 1,
},
}
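

# Tests typically copy and tweak this mapping before serializing it, e.g.:
#
#   params = basic_params_override()
#   params['train']['steps'] = 2  # hypothetical tweak
#   flag = get_params_override(params)
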
def get_trivial_model(num_classes: int) -> tf.keras.Model:
"""Creates and compiles trivial model for ImageNet dataset."""
model = test_utils.trivial_model(num_classes=num_classes)
lr = 0.01
optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer,
loss=loss_obj,
run_eagerly=True)
return model
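

# A quick smoke test of the compiled model (input shape assumed to match the
# ImageNet-sized synthetic data below):
#
#   model = get_trivial_model(num_classes=10)
#   logits = model(tf.zeros([1, 224, 224, 3]))  # expected shape: (1, 10)
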
def get_trivial_data() -> tf.data.Dataset:
"""Gets trivial data in the ImageNet size."""
def generate_data(_) -> tf.data.Dataset:
image = tf.zeros(shape=(224, 224, 3), dtype=tf.float32)
label = tf.zeros([1], dtype=tf.int32)
return image, label
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(generate_data,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(1).prefetch(buffer_size=1)
return dataset
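

# Peeking at one element of the pipeline (shapes follow from the zeros
# generated above after batching by 1):
#
#   images, labels = next(iter(get_trivial_data()))
#   # images: float32, shape (1, 224, 224, 3); labels: int32, shape (1, 1)
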
def run_end_to_end(main: Callable[[Any], None],
extra_flags: Optional[Iterable[str]] = None,
model_dir: Optional[str] = None):
"""Runs the classifier trainer end-to-end."""
extra_flags = [] if extra_flags is None else extra_flags
args = [sys.argv[0], '--model_dir', model_dir] + extra_flags
flags_core.parse_flags(argv=args)
main(flags.FLAGS)
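

# For instance, the end-to-end tests below invoke it roughly as:
#
#   run = functools.partial(classifier_trainer.run, strategy_override=dist)
#   run_end_to_end(main=run,
#                  extra_flags=['--mode=train_and_eval'],  # plus model/data flags
#                  model_dir='/tmp/model_dir')  # hypothetical directory
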
class ClassifierTest(tf.test.TestCase, parameterized.TestCase):
"""Unit tests for Keras models."""
_tempdir = None

  @classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(ClassifierTest, cls).setUpClass()

  def tearDown(self):
super(ClassifierTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())

  @combinations.generate(distribution_strategy_combinations())
def test_end_to_end_train_and_eval_export(self, distribution, model, dataset):
"""Test train_and_eval and export for Keras classifier models."""
    # Some parameters are not defined as flags (e.g. one cannot run
    # `classifier_trainer.py --batch_size=...`) by design, so use
    # `--params_override=...` instead.
model_dir = self.get_temp_dir()
base_flags = [
'--data_dir=not_used',
'--model_type=' + model,
'--dataset=' + dataset,
]
train_and_eval_flags = base_flags + [
get_params_override(basic_params_override()),
'--mode=train_and_eval',
]
export_params = basic_params_override()
export_path = os.path.join(model_dir, 'export')
export_params['export'] = {}
export_params['export']['destination'] = export_path
export_flags = base_flags + [
'--mode=export_only',
get_params_override(export_params)
]
run = functools.partial(classifier_trainer.run,
strategy_override=distribution)
run_end_to_end(main=run,
extra_flags=train_and_eval_flags,
model_dir=model_dir)
run_end_to_end(main=run,
extra_flags=export_flags,
model_dir=model_dir)
self.assertTrue(os.path.exists(export_path))

  @combinations.generate(distribution_strategy_combinations())
  def test_end_to_end_invalid_mode(self, distribution, model, dataset):
    """Tests that an invalid mode raises a ValueError."""
model_dir = self.get_temp_dir()
extra_flags = [
'--data_dir=not_used',
'--mode=invalid_mode',
'--model_type=' + model,
'--dataset=' + dataset,
get_params_override(basic_params_override()),
]
run = functools.partial(classifier_trainer.run,
strategy_override=distribution)
with self.assertRaises(ValueError):
run_end_to_end(main=run, extra_flags=extra_flags, model_dir=model_dir)


class UtilTests(parameterized.TestCase, tf.test.TestCase):
"""Tests for individual utility functions within classifier_trainer.py."""

  @parameterized.named_parameters(
('efficientnet-b0', 'efficientnet', 'efficientnet-b0', 224),
('efficientnet-b1', 'efficientnet', 'efficientnet-b1', 240),
('efficientnet-b2', 'efficientnet', 'efficientnet-b2', 260),
('efficientnet-b3', 'efficientnet', 'efficientnet-b3', 300),
('efficientnet-b4', 'efficientnet', 'efficientnet-b4', 380),
('efficientnet-b5', 'efficientnet', 'efficientnet-b5', 456),
('efficientnet-b6', 'efficientnet', 'efficientnet-b6', 528),
('efficientnet-b7', 'efficientnet', 'efficientnet-b7', 600),
('resnet', 'resnet', '', None),
)
  def test_get_model_size(self, model, model_name, expected):
    """Tests that the expected image size is derived from the model config."""
config = base_configs.ExperimentConfig(
model_name=model,
model=base_configs.ModelConfig(
model_params={
'model_name': model_name,
},
)
)
size = classifier_trainer.get_image_size_from_model(config)
self.assertEqual(size, expected)

  @parameterized.named_parameters(
('dynamic', 'dynamic', None, 'dynamic'),
('scalar', 128., None, 128.),
('float32', None, 'float32', 1),
('float16', None, 'float16', 128),
)
  def test_get_loss_scale(self, loss_scale, dtype, expected):
    """Tests get_loss_scale over loss_scale/dtype combinations."""
config = base_configs.ExperimentConfig(
model=base_configs.ModelConfig(
loss=base_configs.LossConfig(loss_scale=loss_scale)),
train_dataset=dataset_factory.DatasetConfig(dtype=dtype))
ls = classifier_trainer.get_loss_scale(config, fp16_default=128)
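    # Expected mapping exercised above: an explicit 'dynamic' or scalar
    # loss_scale passes through unchanged; otherwise float16 data falls back
    # to fp16_default, and float32 disables scaling (scale of 1).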
self.assertEqual(ls, expected)

  @parameterized.named_parameters(
('float16', 'float16'),
('bfloat16', 'bfloat16')
)
  def test_initialize(self, dtype):
    """Smoke-tests initialize() with a mixed-precision runtime config."""
config = base_configs.ExperimentConfig(
runtime=base_configs.RuntimeConfig(
enable_eager=False,
enable_xla=False,
gpu_threads_enabled=True,
per_gpu_thread_count=1,
gpu_thread_mode='gpu_private',
num_gpus=1,
dataset_num_private_threads=1,
),
train_dataset=dataset_factory.DatasetConfig(dtype=dtype),
model=base_configs.ModelConfig(
loss=base_configs.LossConfig(loss_scale='dynamic')),
)
classifier_trainer.initialize(config)

  def test_resume_from_checkpoint(self):
    """Tests resuming training from the latest checkpoint in model_dir."""
# Set the keras policy
policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
tf.keras.mixed_precision.experimental.set_policy(policy)
    # Build and compile the trivial model.
model = get_trivial_model(10)
# Create the checkpoint
model_dir = self.get_temp_dir()
train_epochs = 1
train_steps = 10
ds = get_trivial_data()
callbacks = [
tf.keras.callbacks.ModelCheckpoint(
os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
save_weights_only=True)
]
model.fit(
ds,
callbacks=callbacks,
epochs=train_epochs,
steps_per_epoch=train_steps)
# Test load from checkpoint
clean_model = get_trivial_model(10)
weights_before_load = copy.deepcopy(clean_model.get_weights())
initial_epoch = classifier_trainer.resume_from_checkpoint(
model=clean_model,
model_dir=model_dir,
train_steps=train_steps)
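    # One epoch of 10 steps leaves the optimizer at iteration 10, so the
    # resumed epoch is expected to be 10 // train_steps == 1 (assuming
    # resume_from_checkpoint derives it from the optimizer's iteration count).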
self.assertEqual(initial_epoch, 1)
self.assertNotAllClose(weights_before_load, clean_model.get_weights())
tf.io.gfile.rmtree(model_dir)

  def test_serialize_config(self):
    """Tests that the experiment config is serialized to params.yaml."""
config = base_configs.ExperimentConfig()
model_dir = self.get_temp_dir()
classifier_trainer.serialize_config(params=config, model_dir=model_dir)
saved_params_path = os.path.join(model_dir, 'params.yaml')
self.assertTrue(os.path.exists(saved_params_path))
tf.io.gfile.rmtree(model_dir)


if __name__ == '__main__':
assert tf.version.VERSION.startswith('2.')
tf.test.main()