Unverified commit 1f3247f4, authored by Ayushman Kumar and committed by GitHub

Merge pull request #6 from tensorflow/master

Updated
parents 370a4c8d 0265f59c
@@ -25,6 +25,7 @@ from __future__ import division
 from __future__ import print_function

 import tensorflow as tf  # pylint: disable=g-bad-import-order
+from absl import logging

 from official.utils.logs import hooks
 from official.utils.logs import logger
@@ -57,9 +58,9 @@ def get_train_hooks(name_list, use_tpu=False, **kwargs):
     return []

   if use_tpu:
-    tf.compat.v1.logging.warning('hooks_helper received name_list `{}`, but a '
-                                 'TPU is specified. No hooks will be used.'
-                                 .format(name_list))
+    logging.warning(
+        'hooks_helper received name_list `%s`, but a '
+        'TPU is specified. No hooks will be used.', name_list)
     return []

   train_hooks = []
...
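For readers skimming the diff: besides swapping the module, the change above also moves from eager `str.format` interpolation to absl's lazy %-style arguments. A minimal standalone sketch of the pattern (illustrative, not part of this commit):

```python
from absl import logging

# absl.logging defers interpolation: the %-style arguments are only
# formatted if the record passes the current verbosity threshold.
logging.set_verbosity(logging.INFO)
logging.warning('hooks_helper received name_list `%s`, but a '
                'TPU is specified. No hooks will be used.',
                ['loggingtensorhook'])
```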
@@ -21,12 +21,13 @@ from __future__ import print_function

 import time

+from absl import logging
 import tensorflow as tf  # pylint: disable=g-bad-import-order

 from official.utils.logs import hooks
 from official.utils.testing import mock_lib

-tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
+logging.set_verbosity(logging.DEBUG)


 class ExamplesPerSecondHookTest(tf.test.TestCase):
...
@@ -35,6 +35,7 @@ from six.moves import _thread as thread
 from absl import flags
 import tensorflow as tf
 from tensorflow.python.client import device_lib
+from absl import logging

 from official.utils.logs import cloud_lib
@@ -119,8 +120,7 @@ class BaseBenchmarkLogger(object):
       eval_results: dict, the result of evaluate.
     """
     if not isinstance(eval_results, dict):
-      tf.compat.v1.logging.warning(
-          "eval_results should be dictionary for logging. Got %s",
-          type(eval_results))
+      logging.warning("eval_results should be dictionary for logging. Got %s",
+                      type(eval_results))
       return

     global_step = eval_results[tf.compat.v1.GraphKeys.GLOBAL_STEP]
@@ -144,12 +144,12 @@ class BaseBenchmarkLogger(object):
     """
     metric = _process_metric_to_json(name, value, unit, global_step, extras)
     if metric:
-      tf.compat.v1.logging.info("Benchmark metric: %s", metric)
+      logging.info("Benchmark metric: %s", metric)

   def log_run_info(self, model_name, dataset_name, run_params, test_id=None):
-    tf.compat.v1.logging.info(
-        "Benchmark run: %s", _gather_run_info(model_name, dataset_name,
-                                              run_params, test_id))
+    logging.info(
+        "Benchmark run: %s",
+        _gather_run_info(model_name, dataset_name, run_params, test_id))

   def on_finish(self, status):
     pass
@@ -187,7 +187,7 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
         self._metric_file_handler.write("\n")
         self._metric_file_handler.flush()
       except (TypeError, ValueError) as e:
-        tf.compat.v1.logging.warning(
+        logging.warning(
             "Failed to dump metric to log file: name %s, value %s, error %s",
             name, value, e)
@@ -212,8 +212,7 @@ class BenchmarkFileLogger(BaseBenchmarkLogger):
         json.dump(run_info, f)
         f.write("\n")
       except (TypeError, ValueError) as e:
-        tf.compat.v1.logging.warning(
-            "Failed to dump benchmark run info to log file: %s", e)
+        logging.warning("Failed to dump benchmark run info to log file: %s", e)

   def on_finish(self, status):
     self._metric_file_handler.flush()
@@ -322,8 +321,8 @@ def _process_metric_to_json(
     name, value, unit=None, global_step=None, extras=None):
   """Validate the metric data and generate JSON for insert."""
   if not isinstance(value, numbers.Number):
-    tf.compat.v1.logging.warning(
-        "Metric value to log should be a number. Got %s", type(value))
+    logging.warning("Metric value to log should be a number. Got %s",
+                    type(value))
     return None

   extras = _convert_to_json_dict(extras)
@@ -383,8 +382,7 @@ def _collect_cpu_info(run_info):
     run_info["machine_config"]["cpu_info"] = cpu_info
   except ImportError:
-    tf.compat.v1.logging.warn(
-        "'cpuinfo' not imported. CPU info will not be logged.")
+    logging.warn("'cpuinfo' not imported. CPU info will not be logged.")


 def _collect_memory_info(run_info):
@@ -396,8 +394,7 @@ def _collect_memory_info(run_info):
     run_info["machine_config"]["memory_total"] = vmem.total
     run_info["machine_config"]["memory_available"] = vmem.available
   except ImportError:
-    tf.compat.v1.logging.warn(
-        "'psutil' not imported. Memory info will not be logged.")
+    logging.warn("'psutil' not imported. Memory info will not be logged.")


 def _collect_test_environment(run_info):
...
@@ -28,6 +28,7 @@ import unittest

 import mock
 from absl.testing import flagsaver
 import tensorflow as tf  # pylint: disable=g-bad-import-order
+from absl import logging

 try:
   from google.cloud import bigquery
@@ -79,7 +80,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):
     mock_logger = mock.MagicMock()
     mock_config_benchmark_logger.return_value = mock_logger
     with logger.benchmark_context(None):
-      tf.compat.v1.logging.info("start benchmarking")
+      logging.info("start benchmarking")
     mock_logger.on_finish.assert_called_once_with(logger.RUN_STATUS_SUCCESS)

   @mock.patch("official.utils.logs.logger.config_benchmark_logger")
@@ -96,18 +97,18 @@ class BaseBenchmarkLoggerTest(tf.test.TestCase):

   def setUp(self):
     super(BaseBenchmarkLoggerTest, self).setUp()
-    self._actual_log = tf.compat.v1.logging.info
+    self._actual_log = logging.info
     self.logged_message = None

     def mock_log(*args, **kwargs):
       self.logged_message = args
       self._actual_log(*args, **kwargs)

-    tf.compat.v1.logging.info = mock_log
+    logging.info = mock_log

   def tearDown(self):
     super(BaseBenchmarkLoggerTest, self).tearDown()
-    tf.compat.v1.logging.info = self._actual_log
+    logging.info = self._actual_log

   def test_log_metric(self):
     log = logger.BaseBenchmarkLogger()
...
@@ -31,8 +31,9 @@ import re
 import subprocess
 import sys
 import typing
-
-import tensorflow as tf
+from absl import logging
+
+# pylint:disable=logging-format-interpolation

 _MIN_VERSION = (0, 0, 10)
 _STACK_OFFSET = 2
@@ -94,8 +95,7 @@ def get_mlperf_log():
     version = pkg_resources.get_distribution("mlperf_compliance")
     version = tuple(int(i) for i in version.version.split("."))
     if version < _MIN_VERSION:
-      tf.compat.v1.logging.warning(
-          "mlperf_compliance is version {}, must be >= {}".format(
-              ".".join([str(i) for i in version]),
-              ".".join([str(i) for i in _MIN_VERSION])))
+      logging.warning("mlperf_compliance is version {}, must be >= {}".format(
+          ".".join([str(i) for i in version]),
+          ".".join([str(i) for i in _MIN_VERSION])))
       raise ImportError
@@ -187,6 +187,6 @@ def clear_system_caches():

 if __name__ == "__main__":
-  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+  logging.set_verbosity(logging.INFO)
   with LOGGER(True):
     ncf_print(key=TAGS.RUN_START)
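The version gate in the hunk above leans on Python's element-wise tuple comparison; a standalone sketch of the same check with illustrative values:

```python
# Dotted version strings become integer tuples; tuples compare element-wise,
# so (0, 0, 9) < (0, 0, 10) behaves the way a human reader expects.
MIN_VERSION = (0, 0, 10)

def is_supported(version_string):
    return tuple(int(i) for i in version_string.split('.')) >= MIN_VERSION

assert not is_supported('0.0.9')
assert is_supported('0.1.0')
```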
@@ -22,6 +22,8 @@ import json
 import os
 import random
 import string
+
+from absl import logging
 import tensorflow.compat.v2 as tf

 from official.utils.misc import tpu_lib
@@ -252,7 +254,7 @@ class SyntheticIterator(object):

 def _monkey_patch_dataset_method(strategy):
   """Monkey-patch `strategy`'s `make_dataset_iterator` method."""
   def make_dataset(self, dataset):
-    tf.compat.v1.logging.info('Using pure synthetic data.')
+    logging.info('Using pure synthetic data.')
     with self.scope():
       if self.extended._global_batch_size:  # pylint: disable=protected-access
         return SyntheticDataset(dataset, self.num_replicas_in_sync)
...
@@ -20,8 +20,11 @@ from __future__ import print_function

 import numbers

+from absl import logging
 import tensorflow as tf
 from tensorflow.python.util import nest

+# pylint:disable=logging-format-interpolation
+

 def past_stop_threshold(stop_threshold, eval_metric):
@@ -48,8 +51,7 @@ def past_stop_threshold(stop_threshold, eval_metric):
                      "must be a number.")

   if eval_metric >= stop_threshold:
-    tf.compat.v1.logging.info(
-        "Stop threshold of {} was passed with metric value {}.".format(
-            stop_threshold, eval_metric))
+    logging.info("Stop threshold of {} was passed with metric value {}.".format(
+        stop_threshold, eval_metric))
     return True
@@ -88,6 +90,6 @@ def generate_synthetic_data(

 def apply_clean(flags_obj):
   if flags_obj.clean and tf.io.gfile.exists(flags_obj.model_dir):
-    tf.compat.v1.logging.info("--clean flag set. Removing existing model dir:"
-                              " {}".format(flags_obj.model_dir))
+    logging.info("--clean flag set. Removing existing model dir:"
+                 " {}".format(flags_obj.model_dir))
     tf.io.gfile.rmtree(flags_obj.model_dir)
@@ -20,8 +20,9 @@ from __future__ import print_function

 import os

 from absl import flags
+from absl import logging
 from absl.testing import flagsaver
-import tensorflow as tf  # pylint: disable=g-bad-import-order
+import tensorflow as tf

 FLAGS = flags.FLAGS
@@ -75,7 +76,7 @@ class PerfZeroBenchmark(tf.test.Benchmark):

   def _setup(self):
     """Sets up and resets flags before each test."""
-    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
+    logging.set_verbosity(logging.INFO)
     if PerfZeroBenchmark.local_flags is None:
       for flag_method in self.flag_methods:
         flag_method()
...
@@ -87,10 +87,6 @@ BASE_CFG = {
     },
     'resnet': {
         'resnet_depth': 50,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
         'batch_norm': {
             'batch_norm_momentum': 0.997,
             'batch_norm_epsilon': 1e-4,
@@ -111,43 +107,6 @@ BASE_CFG = {
             'use_sync_bn': False,
         },
     },
-    'nasfpn': {
-        'min_level': 3,
-        'max_level': 7,
-        'fpn_feat_dims': 256,
-        'num_repeats': 5,
-        'use_separable_conv': False,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-            'use_sync_bn': False,
-        },
-    },
-    # tunable_nasfpn:strip_begin
-    'tunable_nasfpn_v1': {
-        'min_level': 3,
-        'max_level': 7,
-        'fpn_feat_dims': 256,
-        'num_repeats': 5,
-        'use_separable_conv': False,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-            'use_sync_bn': False,
-        },
-        'nodes': None
-    },
-    # tunable_nasfpn:strip_end
     'postprocess': {
         'use_batched_nms': False,
         'max_total_size': 100,
...
@@ -106,10 +106,6 @@ RETINANET_CFG = {
     },
     'resnet': {
         'resnet_depth': 50,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
         'batch_norm': {
             'batch_norm_momentum': 0.997,
             'batch_norm_epsilon': 1e-4,
@@ -128,22 +124,6 @@ RETINANET_CFG = {
             'batch_norm_trainable': True,
         },
     },
-    'nasfpn': {
-        'min_level': 3,
-        'max_level': 7,
-        'fpn_feat_dims': 256,
-        'num_repeats': 5,
-        'use_separable_conv': False,
-        'dropblock': {
-            'dropblock_keep_prob': None,
-            'dropblock_size': None,
-        },
-        'batch_norm': {
-            'batch_norm_momentum': 0.997,
-            'batch_norm_epsilon': 1e-4,
-            'batch_norm_trainable': True,
-        },
-    },
     'retinanet_head': {
         'min_level': 3,
         'max_level': 7,
...
@@ -52,7 +52,7 @@ flags.DEFINE_string(
 flags.DEFINE_string(
     'model', default='retinanet',
-    help='Model to run: `retinanet` or `shapemask`.')
+    help='Model to run: `retinanet` or `mask_rcnn`.')

 flags.DEFINE_string('training_file_pattern', None,
                     'Location of the train data.')
...
@@ -37,19 +37,12 @@ def batch_norm_relu_generator(params):
   return _batch_norm_op


-def dropblock_generator(params):
-  return nn_ops.Dropblock(
-      dropblock_keep_prob=params.dropblock_keep_prob,
-      dropblock_size=params.dropblock_size)
-
-
 def backbone_generator(params):
   """Generator function for various backbone models."""
   if params.architecture.backbone == 'resnet':
     resnet_params = params.resnet
     backbone_fn = resnet.Resnet(
         resnet_depth=resnet_params.resnet_depth,
-        dropblock=dropblock_generator(resnet_params.dropblock),
         batch_norm_relu=batch_norm_relu_generator(resnet_params.batch_norm))
   else:
     raise ValueError('Backbone model %s is not supported.' %
...
@@ -84,88 +84,3 @@ class BatchNormRelu(tf.keras.layers.Layer):
       inputs = tf.nn.relu(inputs)
     return inputs
-
-
-class Dropblock(object):
-  """DropBlock: a regularization method for convolutional neural networks.
-
-  DropBlock is a form of structured dropout, where units in a contiguous
-  region of a feature map are dropped together. DropBlock works better than
-  dropout on convolutional layers due to the fact that activation units in
-  convolutional layers are spatially correlated.
-  See https://arxiv.org/pdf/1810.12890.pdf for details.
-  """
-
-  def __init__(self,
-               dropblock_keep_prob=None,
-               dropblock_size=None,
-               data_format='channels_last'):
-    self._dropblock_keep_prob = dropblock_keep_prob
-    self._dropblock_size = dropblock_size
-    self._data_format = data_format
-
-  def __call__(self, net, is_training=False):
-    """Builds Dropblock layer.
-
-    Args:
-      net: `Tensor` input tensor.
-      is_training: `bool` if True, the model is in training mode.
-
-    Returns:
-      A version of input tensor with DropBlock applied.
-    """
-    if not is_training or self._dropblock_keep_prob is None:
-      return net
-
-    logging.info('Applying DropBlock: dropblock_size {}, net.shape {}'.format(
-        self._dropblock_size, net.shape))
-
-    if self._data_format == 'channels_last':
-      _, height, width, _ = net.get_shape().as_list()
-    else:
-      _, _, height, width = net.get_shape().as_list()
-
-    total_size = width * height
-    dropblock_size = min(self._dropblock_size, min(width, height))
-    # Seed_drop_rate is the gamma parameter of DropBlock.
-    seed_drop_rate = (
-        1.0 - self._dropblock_keep_prob) * total_size / dropblock_size**2 / (
-            (width - self._dropblock_size + 1) *
-            (height - self._dropblock_size + 1))
-
-    # Forces the block to be inside the feature map.
-    w_i, h_i = tf.meshgrid(tf.range(width), tf.range(height))
-    valid_block = tf.logical_and(
-        tf.logical_and(w_i >= int(dropblock_size // 2),
-                       w_i < width - (dropblock_size - 1) // 2),
-        tf.logical_and(h_i >= int(dropblock_size // 2),
-                       h_i < width - (dropblock_size - 1) // 2))
-
-    if self._data_format == 'channels_last':
-      valid_block = tf.reshape(valid_block, [1, height, width, 1])
-    else:
-      valid_block = tf.reshape(valid_block, [1, 1, height, width])
-
-    randnoise = tf.random.uniform(net.shape, dtype=tf.float32)
-    valid_block = tf.cast(valid_block, dtype=tf.float32)
-    seed_keep_rate = tf.cast(1 - seed_drop_rate, dtype=tf.float32)
-    block_pattern = (1 - valid_block + seed_keep_rate + randnoise) >= 1
-    block_pattern = tf.cast(block_pattern, dtype=tf.float32)
-
-    if self._data_format == 'channels_last':
-      ksize = [1, self._dropblock_size, self._dropblock_size, 1]
-    else:
-      ksize = [1, 1, self._dropblock_size, self._dropblock_size]
-    block_pattern = -tf.nn.max_pool2d(
-        -block_pattern,
-        ksize=ksize,
-        strides=[1, 1, 1, 1],
-        padding='SAME',
-        data_format='NHWC' if self._data_format == 'channels_last' else 'NCHW')
-
-    percent_ones = tf.cast(
-        tf.reduce_sum(input_tensor=block_pattern), tf.float32) / tf.cast(
-            tf.size(input=block_pattern), tf.float32)
-
-    net = net / tf.cast(percent_ones, net.dtype) * tf.cast(
-        block_pattern, net.dtype)
-    return net
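For reference, the `seed_drop_rate` computed in the deleted class is the gamma parameter from the DropBlock paper (https://arxiv.org/pdf/1810.12890.pdf): the per-seed drop probability chosen so that, once each seed grows into a `block_size` x `block_size` square, roughly `1 - keep_prob` of the feature map is dropped. A standalone sketch of that arithmetic with illustrative numbers, not part of the commit:

```python
# gamma = (1 - keep_prob) * H * W / block_size**2
#         / ((W - block_size + 1) * (H - block_size + 1))
def dropblock_gamma(keep_prob, block_size, height, width):
    total_size = height * width
    # Number of positions where a full block fits inside the feature map.
    valid_seed_positions = (width - block_size + 1) * (height - block_size + 1)
    return (1.0 - keep_prob) * total_size / block_size**2 / valid_seed_positions

# e.g. a 28x28 feature map with 7x7 blocks and keep_prob=0.9:
print(dropblock_gamma(keep_prob=0.9, block_size=7, height=28, width=28))
```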
@@ -34,14 +34,12 @@ class Resnet(object):

   def __init__(self,
                resnet_depth,
-               dropblock=nn_ops.Dropblock(),
                batch_norm_relu=nn_ops.BatchNormRelu,
                data_format='channels_last'):
     """ResNet initialization function.

     Args:
       resnet_depth: `int` depth of ResNet backbone model.
-      dropblock: a dropblock layer.
       batch_norm_relu: an operation that includes a batch normalization layer
         followed by a relu layer(optional).
       data_format: `str` either "channels_first" for `[batch, channels, height,
@@ -49,7 +47,6 @@ class Resnet(object):
     """
     self._resnet_depth = resnet_depth
-    self._dropblock = dropblock
     self._batch_norm_relu = batch_norm_relu
     self._data_format = data_format
@@ -219,24 +216,20 @@ class Resnet(object):
           inputs=inputs, filters=filters_out, kernel_size=1, strides=strides)
       shortcut = self._batch_norm_relu(relu=False)(
           shortcut, is_training=is_training)
-      shortcut = self._dropblock(shortcut, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=1, strides=1)
     inputs = self._batch_norm_relu()(inputs, is_training=is_training)
-    inputs = self._dropblock(inputs, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=filters, kernel_size=3, strides=strides)
     inputs = self._batch_norm_relu()(inputs, is_training=is_training)
-    inputs = self._dropblock(inputs, is_training=is_training)

     inputs = self.conv2d_fixed_padding(
         inputs=inputs, filters=4 * filters, kernel_size=1, strides=1)
     inputs = self._batch_norm_relu(
         relu=False, init_zero=True)(
             inputs, is_training=is_training)
-    inputs = self._dropblock(inputs, is_training=is_training)

     return tf.nn.relu(inputs + shortcut)
...
 # Image Classification

-This folder contains the TF 2.0 model examples for image classification:
+This folder contains TF 2.0 model examples for image classification:

-* [ResNet](#resnet)
 * [MNIST](#mnist)
+* [Classifier Trainer](#classifier-trainer), a framework that uses the Keras
+  compile/fit methods for image classification models, including:
+  * ResNet
+  * EfficientNet[^1]
+
+[^1]: Currently a work in progress. We cannot match "AutoAugment (AA)" in [the original version](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet).

 For more information about other types of models, please refer to this
 [README file](../../README.md).

-## ResNet
-
-Similar to the [estimator implementation](../../r1/resnet), the Keras
-implementation has code for the ImageNet dataset. The ImageNet
-version uses a ResNet50 model implemented in
-[`resnet_model.py`](./resnet/resnet_model.py).
+## Before you begin

 Please make sure that you have the latest version of TensorFlow
 installed and
 [add the models folder to your Python path](/official/#running-the-models).

-### Pretrained Models
-
-* [ResNet50 Checkpoints](https://storage.googleapis.com/cloud-tpu-checkpoints/resnet/resnet50.tar.gz)
-* ResNet50 TFHub: [feature vector](https://tfhub.dev/tensorflow/resnet_50/feature_vector/1)
-  and [classification](https://tfhub.dev/tensorflow/resnet_50/classification/1)
-
-### ImageNet Training
+### ImageNet preparation

 Download the ImageNet dataset and convert it to TFRecord format.
 The following [script](https://github.com/tensorflow/tpu/blob/master/tools/datasets/imagenet_to_gcs.py)
 and [README](https://github.com/tensorflow/tpu/tree/master/tools/datasets#imagenet_to_gcspy)
 provide a few options.

-Once your dataset is ready, you can begin training the model as follows:
-
-```bash
-python resnet/resnet_imagenet_main.py
-```
-
-Again, if you did not download the data to the default directory, specify the
-location with the `--data_dir` flag:
-
-```bash
-python resnet/resnet_imagenet_main.py --data_dir=/path/to/imagenet
-```
-
-There are more flag options you can specify. Here are some examples:
-
-- `--use_synthetic_data`: when set to true, synthetic data, rather than real
-  data, are used;
-- `--batch_size`: the batch size used for the model;
-- `--model_dir`: the directory to save the model checkpoint;
-- `--train_epochs`: number of epochs to run for training the model;
-- `--train_steps`: number of steps to run for training the model. We now only
-  support a number that is smaller than the number of batches in an epoch.
-- `--skip_eval`: when set to true, evaluation as well as validation during
-  training is skipped
-
-For example, this is a typical command line to run with ImageNet data with
-batch size 128 per GPU:
-
-```bash
-python -m resnet/resnet_imagenet_main.py \
-    --model_dir=/tmp/model_dir/something \
-    --num_gpus=2 \
-    --batch_size=128 \
-    --train_epochs=90 \
-    --train_steps=10 \
-    --use_synthetic_data=false
-```
-
-See [`common.py`](common.py) for full list of options.
-
-### Using multiple GPUs
-
-You can train these models on multiple GPUs using `tf.distribute.Strategy` API.
-You can read more about them in this
-[guide](https://www.tensorflow.org/guide/distribute_strategy).
-
-In this example, we have made it easier to use is with just a command line flag
-`--num_gpus`. By default this flag is 1 if TensorFlow is compiled with CUDA,
-and 0 otherwise.
-
-- --num_gpus=0: Uses tf.distribute.OneDeviceStrategy with CPU as the device.
-- --num_gpus=1: Uses tf.distribute.OneDeviceStrategy with GPU as the device.
-- --num_gpus=2+: Uses tf.distribute.MirroredStrategy to run synchronous
-  distributed training across the GPUs.
-
-If you wish to run without `tf.distribute.Strategy`, you can do so by setting
-`--distribution_strategy=off`.
-
-### Running on multiple GPU hosts
-
-You can also train these models on multiple hosts, each with GPUs, using
-`tf.distribute.Strategy`.
-
-The easiest way to run multi-host benchmarks is to set the
-[`TF_CONFIG`](https://www.tensorflow.org/guide/distributed_training#TF_CONFIG)
-appropriately at each host. e.g., to run using `MultiWorkerMirroredStrategy` on
-2 hosts, the `cluster` in `TF_CONFIG` should have 2 `host:port` entries, and
-host `i` should have the `task` in `TF_CONFIG` set to `{"type": "worker",
-"index": i}`. `MultiWorkerMirroredStrategy` will automatically use all the
-available GPUs at each host.
-
 ### Running on Cloud TPUs

-Note: This model will **not** work with TPUs on Colab.
+Note: These models will **not** work with TPUs on Colab.

-You can train the ResNet CTL model on Cloud TPUs using
+You can train image classification models on Cloud TPUs using
 `tf.distribute.TPUStrategy`. If you are not familiar with Cloud TPUs, it is
 strongly recommended that you go through the
 [quickstart](https://cloud.google.com/tpu/docs/quickstart) to learn how to
 create a TPU and GCE VM.

-To run ResNet model on a TPU, you must set `--distribution_strategy=tpu` and
-`--tpu=$TPU_NAME`, where `$TPU_NAME` the name of your TPU in the Cloud Console.
-From a GCE VM, you can run the following command to train ResNet for one epoch
-on a v2-8 or v3-8 TPU:
-
-```bash
-python resnet/resnet_ctl_imagenet_main.py \
-    --tpu=$TPU_NAME \
-    --model_dir=$MODEL_DIR \
-    --data_dir=$DATA_DIR \
-    --batch_size=1024 \
-    --steps_per_loop=500 \
-    --train_epochs=1 \
-    --use_synthetic_data=false \
-    --dtype=fp32 \
-    --enable_eager=true \
-    --enable_tensorboard=true \
-    --distribution_strategy=tpu \
-    --log_steps=50 \
-    --single_l2_loss_op=true \
-    --use_tf_function=true
-```
-
-To train the ResNet to convergence, run it for 90 epochs:
-
-```bash
-python resnet/resnet_ctl_imagenet_main.py \
-    --tpu=$TPU_NAME \
-    --model_dir=$MODEL_DIR \
-    --data_dir=$DATA_DIR \
-    --batch_size=1024 \
-    --steps_per_loop=500 \
-    --train_epochs=90 \
-    --use_synthetic_data=false \
-    --dtype=fp32 \
-    --enable_eager=true \
-    --enable_tensorboard=true \
-    --distribution_strategy=tpu \
-    --log_steps=50 \
-    --single_l2_loss_op=true \
-    --use_tf_function=true
-```
-
-Note: `$MODEL_DIR` and `$DATA_DIR` must be GCS paths.
-
 ## MNIST

 To download the data and run the MNIST sample model locally for the first time,
-run one of the following command:
+run the following command:

 ```bash
-python mnist_main.py \
+python3 mnist_main.py \
     --model_dir=$MODEL_DIR \
     --data_dir=$DATA_DIR \
     --train_epochs=10 \
     --distribution_strategy=one_device \
     --num_gpus=$NUM_GPUS \
     --download
 ```

 To train the model on a Cloud TPU, run the following command:

 ```bash
-python mnist_main.py \
+python3 mnist_main.py \
     --tpu=$TPU_NAME \
     --model_dir=$MODEL_DIR \
     --data_dir=$DATA_DIR \
     --train_epochs=10 \
     --distribution_strategy=tpu \
     --download
 ```

 Note: the `--download` flag is only required the first time you run the model.

+## Classifier Trainer
+
+The classifier trainer is a unified framework for running image classification
+models using Keras's compile/fit methods. Experiments should be provided in the
+form of YAML files; some examples are included within the configs/examples
+folder. Please see [configs/examples](./configs/examples) for more example
+configurations.
+
+The provided configuration files use a per-replica batch size that is scaled
+by the number of devices. For instance, if `batch size` = 64, then for 1 GPU
+the global batch size would be 64 * 1 = 64. For 8 GPUs, the global batch size
+would be 64 * 8 = 512. Similarly, for a v3-8 TPU, the global batch size would
+be 64 * 8 = 512, and for a v3-32, the global batch size is 64 * 32 = 2048.
+
+### ResNet50
+
+#### On GPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=resnet \
+    --dataset=imagenet \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/resnet/imagenet/gpu.yaml \
+    --params_override='runtime.num_gpus=$NUM_GPUS'
+```
+
+#### On TPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=resnet \
+    --dataset=imagenet \
+    --tpu=$TPU_NAME \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/resnet/imagenet/tpu.yaml
+```
+
+### EfficientNet
+
+**Note: EfficientNet development is a work in progress.**
+
+#### On GPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=efficientnet \
+    --dataset=imagenet \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-gpu.yaml \
+    --params_override='runtime.num_gpus=$NUM_GPUS'
+```
+
+#### On TPU:
+
+```bash
+python3 classifier_trainer.py \
+    --mode=train_and_eval \
+    --model_type=efficientnet \
+    --dataset=imagenet \
+    --tpu=$TPU_NAME \
+    --model_dir=$MODEL_DIR \
+    --data_dir=$DATA_DIR \
+    --config_file=configs/examples/efficientnet/imagenet/efficientnet-b0-tpu.yaml
+```
+
+Note that the number of GPU devices can be overridden in the command line using
+`--params_override`. The TPU does not need this override as the device is fixed
+by providing the TPU address or name with the `--tpu` flag.
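As a usage note on `--params_override` above: overrides are layered on top of the YAML given by `--config_file` (see `_get_params_from_flags` in `classifier_trainer.py` below, which applies the config file, then the override string, then flag-derived values, in that order, with later sources winning). A toy sketch of that layering semantics, with a hypothetical `deep_update` helper standing in for `params_dict.override_params_dict`:

```python
# Hypothetical illustration of layered config overrides: later sources win,
# and nested dicts are merged key by key rather than replaced wholesale.
def deep_update(base, override):
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            deep_update(base[key], value)
        else:
            base[key] = value
    return base

params = {'runtime': {'num_gpus': 1, 'tpu': None}}           # base config
params = deep_update(params, {'runtime': {'num_gpus': 8}})   # --params_override
assert params['runtime'] == {'num_gpus': 8, 'tpu': None}
```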
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for autoaugment."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
from absl.testing import parameterized
import tensorflow.compat.v2 as tf
from official.vision.image_classification import augment
def get_dtype_test_cases():
return [
('uint8', tf.uint8),
('int32', tf.int32),
('float16', tf.float16),
('float32', tf.float32),
]
@parameterized.named_parameters(get_dtype_test_cases())
class TransformsTest(parameterized.TestCase, tf.test.TestCase):
"""Basic tests for fundamental transformations."""
def test_to_from_4d(self, dtype):
for shape in [(10, 10), (10, 10, 10), (10, 10, 10, 10)]:
original_ndims = len(shape)
image = tf.zeros(shape, dtype=dtype)
image_4d = augment.to_4d(image)
self.assertEqual(4, tf.rank(image_4d))
self.assertAllEqual(image, augment.from_4d(image_4d, original_ndims))
def test_transform(self, dtype):
image = tf.constant([[1, 2], [3, 4]], dtype=dtype)
self.assertAllEqual(augment.transform(image, transforms=[1]*8),
[[4, 4], [4, 4]])
def disable_test_translate(self, dtype):
image = tf.constant(
[[1, 0, 1, 0], [0, 1, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1]],
dtype=dtype)
translations = [-1, -1]
translated = augment.translate(image=image,
translations=translations)
expected = [[1, 0, 1, 0], [0, 1, 0, 0], [1, 0, 1, 0], [0, 0, 0, 0]]
self.assertAllEqual(translated, expected)
def test_translate_shapes(self, dtype):
translation = [0, 0]
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.translate(image, translation))
def test_translate_invalid_translation(self, dtype):
image = tf.zeros((1, 1), dtype=dtype)
invalid_translation = [[[1, 1]]]
with self.assertRaisesRegex(TypeError, 'rank 1 or 2'):
_ = augment.translate(image, invalid_translation)
def test_rotate(self, dtype):
image = tf.reshape(tf.cast(tf.range(9), dtype), (3, 3))
rotation = 90.
transformed = augment.rotate(image=image, degrees=rotation)
expected = [[2, 5, 8],
[1, 4, 7],
[0, 3, 6]]
self.assertAllEqual(transformed, expected)
def test_rotate_shapes(self, dtype):
degrees = 0.
for shape in [(3, 3), (5, 5), (224, 224, 3)]:
image = tf.zeros(shape, dtype=dtype)
self.assertAllEqual(image, augment.rotate(image, degrees))
class AutoaugmentTest(tf.test.TestCase):
def test_autoaugment(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.AutoAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_randaug(self):
"""Smoke test to be sure there are no syntax errors."""
image = tf.zeros((224, 224, 3), dtype=tf.uint8)
augmenter = augment.RandAugment()
aug_image = augmenter.distort(image)
self.assertEqual((224, 224, 3), aug_image.shape)
def test_all_policy_ops(self):
"""Smoke test to be sure all augmentation functions can execute."""
prob = 1
magnitude = 10
replace_value = [128] * 3
cutout_const = 100
translate_const = 250
image = tf.ones((224, 224, 3), dtype=tf.uint8)
for op_name in augment.NAME_TO_FUNC:
func, _, args = augment._parse_policy_info(op_name,
prob,
magnitude,
replace_value,
cutout_const,
translate_const)
image = func(image, *args)
self.assertEqual((224, 224, 3), image.shape)
if __name__ == '__main__':
assert tf.version.VERSION.startswith('2.')
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common modules for callbacks."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
from typing import Any, List, MutableMapping, Text
def get_callbacks(model_checkpoint: bool = True,
include_tensorboard: bool = True,
track_lr: bool = True,
write_model_weights: bool = True,
initial_step: int = 0,
model_dir: Text = None) -> List[tf.keras.callbacks.Callback]:
"""Get all callbacks."""
model_dir = model_dir or ''
callbacks = []
if model_checkpoint:
ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(tf.keras.callbacks.ModelCheckpoint(
ckpt_full_path, save_weights_only=True, verbose=1))
if include_tensorboard:
callbacks.append(CustomTensorBoard(
log_dir=model_dir,
track_lr=track_lr,
initial_step=initial_step,
write_images=write_model_weights))
return callbacks
def get_scalar_from_tensor(t: tf.Tensor) -> int:
"""Utility function to convert a Tensor to a scalar."""
t = tf.keras.backend.get_value(t)
if callable(t):
return t()
else:
return t
class CustomTensorBoard(tf.keras.callbacks.TensorBoard):
"""A customized TensorBoard callback that tracks additional datapoints.
Metrics tracked:
- Global learning rate
Attributes:
log_dir: the path of the directory where to save the log files to be
parsed by TensorBoard.
track_lr: `bool`, whether or not to track the global learning rate.
initial_step: the initial step, used for preemption recovery.
**kwargs: Additional arguments for backwards compatibility. Possible key
is `period`.
"""
# TODO(b/146499062): track params, flops, log lr, l2 loss,
# classification loss
def __init__(self,
log_dir: Text,
track_lr: bool = False,
initial_step: int = 0,
**kwargs):
super(CustomTensorBoard, self).__init__(log_dir=log_dir, **kwargs)
self.step = initial_step
self._track_lr = track_lr
def on_batch_begin(self,
epoch: int,
logs: MutableMapping[Text, Any] = None) -> None:
self.step += 1
if logs is None:
logs = {}
logs.update(self._calculate_metrics())
super(CustomTensorBoard, self).on_batch_begin(epoch, logs)
def on_epoch_begin(self,
epoch: int,
logs: MutableMapping[Text, Any] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
for k, v in metrics.items():
logging.info('Current %s: %f', k, v)
super(CustomTensorBoard, self).on_epoch_begin(epoch, logs)
def on_epoch_end(self,
epoch: int,
logs: MutableMapping[Text, Any] = None) -> None:
if logs is None:
logs = {}
metrics = self._calculate_metrics()
logs.update(metrics)
super(CustomTensorBoard, self).on_epoch_end(epoch, logs)
def _calculate_metrics(self) -> MutableMapping[Text, Any]:
logs = {}
if self._track_lr:
logs['learning_rate'] = self._calculate_lr()
return logs
def _calculate_lr(self) -> int:
"""Calculates the learning rate given the current step."""
lr = self._get_base_optimizer().lr
if callable(lr):
lr = lr(self.step)
return get_scalar_from_tensor(lr)
def _get_base_optimizer(self) -> tf.keras.optimizers.Optimizer:
"""Get the base optimizer used by the current model."""
optimizer = self.model.optimizer
# The optimizer might be wrapped by another class, so unwrap it
while hasattr(optimizer, '_optimizer'):
optimizer = optimizer._optimizer # pylint:disable=protected-access
return optimizer
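A minimal usage sketch for the callbacks module above, wiring `get_callbacks` into a Keras fit loop; the toy model and data below are placeholders, not part of this commit:

```python
import tensorflow as tf
from official.vision.image_classification import callbacks  # the module above

# Tiny stand-in model; any compiled Keras model works here.
model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(4,))])
model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy')

cbs = callbacks.get_callbacks(
    model_checkpoint=True,
    include_tensorboard=True,
    track_lr=True,              # CustomTensorBoard logs the learning rate
    write_model_weights=False,
    model_dir='/tmp/model_dir')

x = tf.zeros([8, 4])
y = tf.zeros([8], dtype=tf.int32)
model.fit(x, y, epochs=1, callbacks=cbs)
```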
# Lint as: python3
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs an Image Classification model."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pprint
from typing import Any, Tuple, Text, Optional, Mapping
from absl import app
from absl import flags
from absl import logging
import tensorflow.compat.v2 as tf
from official.modeling import performance
from official.modeling.hyperparams import params_dict
from official.utils import hyperparams_flags
from official.utils.logs import logger
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
from official.vision.image_classification import callbacks as custom_callbacks
from official.vision.image_classification import dataset_factory
from official.vision.image_classification import optimizer_factory
from official.vision.image_classification.configs import base_configs
from official.vision.image_classification.configs import configs
from official.vision.image_classification.efficientnet import efficientnet_model
from official.vision.image_classification.resnet import common
from official.vision.image_classification.resnet import resnet_model
MODELS = {
'efficientnet': efficientnet_model.EfficientNet.from_name,
'resnet': resnet_model.resnet50,
}
def _get_metrics(one_hot: bool) -> Mapping[Text, Any]:
"""Get a dict of available metrics to track."""
if one_hot:
return {
# (name, metric_fn)
'acc': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'accuracy': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_1': tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
'top_5': tf.keras.metrics.TopKCategoricalAccuracy(
k=5,
name='top_5_accuracy'),
}
else:
return {
# (name, metric_fn)
'acc': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'accuracy': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_1': tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
'top_5': tf.keras.metrics.SparseTopKCategoricalAccuracy(
k=5,
name='top_5_accuracy'),
}
def get_image_size_from_model(
params: base_configs.ExperimentConfig) -> Optional[int]:
"""If the given model has a preferred image size, return it."""
if params.model_name == 'efficientnet':
efficientnet_name = params.model.model_params.model_name
if efficientnet_name in efficientnet_model.MODEL_CONFIGS:
return efficientnet_model.MODEL_CONFIGS[efficientnet_name].resolution
return None
def _get_dataset_builders(params: base_configs.ExperimentConfig,
strategy: tf.distribute.Strategy,
one_hot: bool
) -> Tuple[Any, Any, Any]:
"""Create and return train, validation, and test dataset builders."""
if one_hot:
logging.warning('label_smoothing > 0, so datasets will be one hot encoded.')
else:
logging.warning('label_smoothing not applied, so datasets will not be one '
'hot encoded.')
num_devices = strategy.num_replicas_in_sync
image_size = get_image_size_from_model(params)
dataset_configs = [
params.train_dataset, params.validation_dataset, params.test_dataset
]
builders = []
for config in dataset_configs:
if config is not None and config.has_data:
builder = dataset_factory.DatasetBuilder(
config,
image_size=image_size or config.image_size,
num_devices=num_devices,
one_hot=one_hot)
else:
builder = None
builders.append(builder)
return builders
def get_loss_scale(params: base_configs.ExperimentConfig,
fp16_default: float = 128.) -> float:
"""Returns the loss scale for initializations."""
loss_scale = params.model.loss.loss_scale
if loss_scale == 'dynamic':
return loss_scale
elif loss_scale is not None:
return float(loss_scale)
elif params.train_dataset.dtype == 'float32':
return 1.
else:
assert params.train_dataset.dtype == 'float16'
return fp16_default
def _get_params_from_flags(flags_obj: flags.FlagValues):
"""Get ParamsDict from flags."""
model = flags_obj.model_type.lower()
dataset = flags_obj.dataset.lower()
params = configs.get_config(model=model, dataset=dataset)
flags_overrides = {
'model_dir': flags_obj.model_dir,
'mode': flags_obj.mode,
'model': {
'name': model,
},
'runtime': {
'enable_eager': flags_obj.enable_eager,
'tpu': flags_obj.tpu,
},
'train_dataset': {
'data_dir': flags_obj.data_dir,
},
'validation_dataset': {
'data_dir': flags_obj.data_dir,
},
'test_dataset': {
'data_dir': flags_obj.data_dir,
},
}
overriding_configs = (flags_obj.config_file,
flags_obj.params_override,
flags_overrides)
pp = pprint.PrettyPrinter()
logging.info('Base params: %s', pp.pformat(params.as_dict()))
for param in overriding_configs:
logging.info('Overriding params: %s', param)
# Set is_strict to false because we can have dynamic dict parameters.
params = params_dict.override_params_dict(params, param, is_strict=False)
params.validate()
params.lock()
logging.info('Final model parameters: %s', pp.pformat(params.as_dict()))
return params
def resume_from_checkpoint(model: tf.keras.Model,
model_dir: str,
train_steps: int) -> int:
"""Resumes from the latest checkpoint, if possible.
Loads the model weights and optimizer settings from a checkpoint.
This function should be used in case of preemption recovery.
Args:
model: The model whose weights should be restored.
model_dir: The directory where model weights were saved.
train_steps: The number of steps to train.
Returns:
The epoch of the latest checkpoint, or 0 if not restoring.
"""
logging.info('Load from checkpoint is enabled.')
latest_checkpoint = tf.train.latest_checkpoint(model_dir)
logging.info('latest_checkpoint: %s', latest_checkpoint)
if not latest_checkpoint:
logging.info('No checkpoint detected.')
return 0
logging.info('Checkpoint file %s found and restoring from '
'checkpoint', latest_checkpoint)
model.load_weights(latest_checkpoint)
initial_epoch = model.optimizer.iterations // train_steps
logging.info('Completed loading from checkpoint.')
logging.info('Resuming from epoch %d', initial_epoch)
return int(initial_epoch)
def initialize(params: base_configs.ExperimentConfig):
"""Initializes backend related initializations."""
keras_utils.set_session_config(
enable_eager=params.runtime.enable_eager,
enable_xla=params.runtime.enable_xla)
if params.runtime.gpu_threads_enabled:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=params.runtime.per_gpu_thread_count,
gpu_thread_mode=params.runtime.gpu_thread_mode,
num_gpus=params.runtime.num_gpus,
datasets_num_private_threads=params.runtime.dataset_num_private_threads)
dataset = params.train_dataset or params.validation_dataset
performance.set_mixed_precision_policy(dataset.dtype)
if dataset.data_format:
data_format = dataset.data_format
elif tf.config.list_physical_devices('GPU'):
data_format = 'channels_first'
else:
data_format = 'channels_last'
tf.keras.backend.set_image_data_format(data_format)
distribution_utils.configure_cluster(
params.runtime.worker_hosts,
params.runtime.task_index)
if params.runtime.enable_eager:
# Enable eager execution to allow step-by-step debugging
tf.config.experimental_run_functions_eagerly(True)
def define_classifier_flags():
"""Defines common flags for image classification."""
hyperparams_flags.initialize_common_flags()
flags.DEFINE_string(
'data_dir',
default=None,
help='The location of the input data.')
flags.DEFINE_string(
'mode',
default=None,
help='Mode to run: `train`, `eval`, `train_and_eval` or `export`.')
flags.DEFINE_bool(
'enable_eager',
default=None,
help='Use eager execution and disable autograph for debugging.')
flags.DEFINE_string(
'model_type',
default=None,
help='The type of the model, e.g. EfficientNet, etc.')
flags.DEFINE_string(
'dataset',
default=None,
help='The name of the dataset, e.g. ImageNet, etc.')
def serialize_config(params: base_configs.ExperimentConfig,
model_dir: str):
"""Serializes and saves the experiment config."""
params_save_path = os.path.join(model_dir, 'params.yaml')
logging.info('Saving experiment configuration to %s', params_save_path)
tf.io.gfile.makedirs(model_dir)
params_dict.save_params_dict_to_yaml(params, params_save_path)
def train_and_eval(
params: base_configs.ExperimentConfig,
strategy_override: tf.distribute.Strategy) -> Mapping[str, Any]:
"""Runs the train and eval path using compile/fit."""
logging.info('Running train and eval.')
# Note: for TPUs, strategy and scope should be created before the dataset
strategy = strategy_override or distribution_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
strategy_scope = distribution_utils.get_strategy_scope(strategy)
logging.info('Detected %d devices.', strategy.num_replicas_in_sync)
label_smoothing = params.model.loss.label_smoothing
one_hot = label_smoothing and label_smoothing > 0
builders = _get_dataset_builders(params, strategy, one_hot)
datasets = [builder.build() if builder else None for builder in builders]
# Unpack datasets and builders based on train/val/test splits
train_builder, validation_builder, test_builder = builders # pylint: disable=unbalanced-tuple-unpacking
train_dataset, validation_dataset, test_dataset = datasets
train_epochs = params.train.epochs
train_steps = params.train.steps or train_builder.num_steps
validation_steps = params.evaluation.steps or validation_builder.num_steps
logging.info('Global batch size: %d', train_builder.global_batch_size)
with strategy_scope:
model_params = params.model.model_params.as_dict()
model = MODELS[params.model.name](**model_params)
learning_rate = optimizer_factory.build_learning_rate(
params=params.model.learning_rate,
batch_size=train_builder.global_batch_size,
train_steps=train_steps)
optimizer = optimizer_factory.build_optimizer(
optimizer_name=params.model.optimizer.name,
base_learning_rate=learning_rate,
params=params.model.optimizer.as_dict())
metrics_map = _get_metrics(one_hot)
metrics = [metrics_map[metric] for metric in params.train.metrics]
if one_hot:
loss_obj = tf.keras.losses.CategoricalCrossentropy(
label_smoothing=params.model.loss.label_smoothing)
else:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer,
loss=loss_obj,
metrics=metrics,
run_eagerly=params.runtime.enable_eager)
initial_epoch = 0
if params.train.resume_checkpoint:
initial_epoch = resume_from_checkpoint(model=model,
model_dir=params.model_dir,
train_steps=train_steps)
serialize_config(params=params, model_dir=params.model_dir)
# TODO(dankondratyuk): callbacks significantly slow down training
callbacks = custom_callbacks.get_callbacks(
model_checkpoint=params.train.callbacks.enable_checkpoint_and_export,
include_tensorboard=params.train.callbacks.enable_tensorboard,
track_lr=params.train.tensorboard.track_lr,
write_model_weights=params.train.tensorboard.write_model_weights,
initial_step=initial_epoch * train_steps,
model_dir=params.model_dir)
history = model.fit(
train_dataset,
epochs=train_epochs,
steps_per_epoch=train_steps,
initial_epoch=initial_epoch,
callbacks=callbacks,
validation_data=validation_dataset,
validation_steps=validation_steps,
validation_freq=params.evaluation.epochs_between_evals)
validation_output = model.evaluate(
validation_dataset, steps=validation_steps, verbose=2)
# TODO(dankondratyuk): eval and save final test accuracy
stats = common.build_stats(history,
validation_output,
callbacks)
return stats
def export(params: base_configs.ExperimentConfig):
"""Runs the model export functionality."""
logging.info('Exporting model.')
model_params = params.model.model_params.as_dict()
model = MODELS[params.model.name](**model_params)
checkpoint = params.export.checkpoint
if checkpoint is None:
logging.info('No export checkpoint was provided. Using the latest '
'checkpoint from model_dir.')
checkpoint = tf.train.latest_checkpoint(params.model_dir)
model.load_weights(checkpoint)
model.save(params.export.destination)
def run(flags_obj: flags.FlagValues,
strategy_override: tf.distribute.Strategy = None) -> Mapping[str, Any]:
"""Runs Image Classification model using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
strategy_override: A `tf.distribute.Strategy` object to use for model.
Returns:
Dictionary of training/eval stats
"""
params = _get_params_from_flags(flags_obj)
initialize(params)
if params.mode == 'train_and_eval':
return train_and_eval(params, strategy_override)
elif params.mode == 'export_only':
export(params)
else:
raise ValueError('{} is not a valid mode.'.format(params.mode))
def main(_):
with logger.benchmark_context(flags.FLAGS):
stats = run(flags.FLAGS)
if stats:
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_classifier_flags()
flags.mark_flag_as_required('data_dir')
flags.mark_flag_as_required('mode')
flags.mark_flag_as_required('model_type')
flags.mark_flag_as_required('dataset')
assert tf.version.VERSION.startswith('2.')
app.run(main)
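One detail worth calling out from `resume_from_checkpoint` above: the optimizer's `iterations` counter is global across the whole run, so integer division by steps-per-epoch recovers the epoch to resume from. A standalone sketch of that arithmetic:

```python
# optimizer.iterations counts every completed train step, so dividing by the
# number of steps per epoch yields the number of fully completed epochs.
def resume_epoch(optimizer_iterations, train_steps_per_epoch):
    return int(optimizer_iterations // train_steps_per_epoch)

assert resume_epoch(0, 100) == 0    # fresh run: start at epoch 0
assert resume_epoch(250, 100) == 2  # checkpoint mid-epoch: redo epoch 2
```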
# Lint as: python3
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Unit tests for the classifier trainer models."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import copy
import functools
import json
import os
import sys

from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional, Tuple

from absl import flags
from absl.testing import parameterized

import tensorflow.compat.v2 as tf

from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations

from official.utils.flags import core as flags_core
from official.vision.image_classification import classifier_trainer
from official.vision.image_classification import dataset_factory
from official.vision.image_classification import test_utils
from official.vision.image_classification.configs import base_configs

classifier_trainer.define_classifier_flags()

def distribution_strategy_combinations() -> Iterable[Tuple[Any, ...]]:
"""Returns the combinations of end-to-end tests to run."""
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
model=[
'efficientnet',
'resnet',
],
mode='eager',
dataset=[
'imagenet',
],
)
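

# `combinations.combine` expands its keyword options into a cross-product, so
# the block above yields 3 strategies x 2 models x 1 dataset = 6 eager-mode
# test parameterizations.
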
def get_params_override(params_override: Mapping[str, Any]) -> str:
"""Converts params_override dict to string command."""
return '--params_override=' + json.dumps(params_override)
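

# For example (illustrative only):
#
#   get_params_override({'train': {'epochs': 1}})
#   # -> '--params_override={"train": {"epochs": 1}}'
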
def basic_params_override() -> MutableMapping[str, Any]:
"""Returns a basic parameter configuration for testing."""
return {
'train_dataset': {
'builder': 'synthetic',
'use_per_replica_batch_size': True,
'batch_size': 1,
'image_size': 224,
},
'validation_dataset': {
'builder': 'synthetic',
'batch_size': 1,
'use_per_replica_batch_size': True,
'image_size': 224,
},
'test_dataset': {
'builder': 'synthetic',
'batch_size': 1,
'use_per_replica_batch_size': True,
'image_size': 224,
},
'train': {
'steps': 1,
'epochs': 1,
'callbacks': {
'enable_checkpoint_and_export': True,
'enable_tensorboard': False,
},
},
'evaluation': {
'steps': 1,
},
}
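

# Tests typically copy and tweak this mapping before serializing it, e.g.:
#
#   params = basic_params_override()
#   params['train']['steps'] = 2  # hypothetical tweak
#   flag = get_params_override(params)
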
def get_trivial_model(num_classes: int) -> tf.keras.Model:
"""Creates and compiles trivial model for ImageNet dataset."""
model = test_utils.trivial_model(num_classes=num_classes)
lr = 0.01
optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer,
loss=loss_obj,
run_eagerly=True)
return model
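

# A quick smoke test of the compiled model (input shape assumed to match the
# ImageNet-sized synthetic data below):
#
#   model = get_trivial_model(num_classes=10)
#   logits = model(tf.zeros([1, 224, 224, 3]))  # expected shape: (1, 10)
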
def get_trivial_data() -> tf.data.Dataset:
"""Gets trivial data in the ImageNet size."""
def generate_data(_) -> tf.data.Dataset:
image = tf.zeros(shape=(224, 224, 3), dtype=tf.float32)
label = tf.zeros([1], dtype=tf.int32)
return image, label
dataset = tf.data.Dataset.range(1)
dataset = dataset.repeat()
dataset = dataset.map(generate_data,
num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(1).prefetch(buffer_size=1)
return dataset
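

# Peeking at one element of the pipeline (shapes follow from the zeros
# generated above after batching by 1):
#
#   images, labels = next(iter(get_trivial_data()))
#   # images: float32, shape (1, 224, 224, 3); labels: int32, shape (1, 1)
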
def run_end_to_end(main: Callable[[Any], None],
extra_flags: Optional[Iterable[str]] = None,
model_dir: Optional[str] = None):
"""Runs the classifier trainer end-to-end."""
extra_flags = [] if extra_flags is None else extra_flags
args = [sys.argv[0], '--model_dir', model_dir] + extra_flags
flags_core.parse_flags(argv=args)
main(flags.FLAGS)
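

# For instance, the end-to-end tests below invoke it roughly as:
#
#   run = functools.partial(classifier_trainer.run, strategy_override=dist)
#   run_end_to_end(main=run,
#                  extra_flags=['--mode=train_and_eval'],  # plus model/data flags
#                  model_dir='/tmp/model_dir')  # hypothetical directory
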
class ClassifierTest(tf.test.TestCase, parameterized.TestCase):
"""Unit tests for Keras models."""
_tempdir = None

  @classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(ClassifierTest, cls).setUpClass()

  def tearDown(self):
super(ClassifierTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())

  @combinations.generate(distribution_strategy_combinations())
def test_end_to_end_train_and_eval_export(self, distribution, model, dataset):
"""Test train_and_eval and export for Keras classifier models."""
    # Some parameters are not defined as flags (e.g. one cannot run
    # `classifier_trainer.py --batch_size=...`) by design, so use
    # `--params_override=...` instead.
model_dir = self.get_temp_dir()
base_flags = [
'--data_dir=not_used',
'--model_type=' + model,
'--dataset=' + dataset,
]
train_and_eval_flags = base_flags + [
get_params_override(basic_params_override()),
'--mode=train_and_eval',
]
export_params = basic_params_override()
export_path = os.path.join(model_dir, 'export')
export_params['export'] = {}
export_params['export']['destination'] = export_path
export_flags = base_flags + [
'--mode=export_only',
get_params_override(export_params)
]
run = functools.partial(classifier_trainer.run,
strategy_override=distribution)
run_end_to_end(main=run,
extra_flags=train_and_eval_flags,
model_dir=model_dir)
run_end_to_end(main=run,
extra_flags=export_flags,
model_dir=model_dir)
self.assertTrue(os.path.exists(export_path))

  @combinations.generate(distribution_strategy_combinations())
  def test_end_to_end_invalid_mode(self, distribution, model, dataset):
    """Tests that an invalid mode raises a ValueError."""
model_dir = self.get_temp_dir()
extra_flags = [
'--data_dir=not_used',
'--mode=invalid_mode',
'--model_type=' + model,
'--dataset=' + dataset,
get_params_override(basic_params_override()),
]
run = functools.partial(classifier_trainer.run,
strategy_override=distribution)
with self.assertRaises(ValueError):
run_end_to_end(main=run, extra_flags=extra_flags, model_dir=model_dir)


class UtilTests(parameterized.TestCase, tf.test.TestCase):
"""Tests for individual utility functions within classifier_trainer.py."""

  @parameterized.named_parameters(
('efficientnet-b0', 'efficientnet', 'efficientnet-b0', 224),
('efficientnet-b1', 'efficientnet', 'efficientnet-b1', 240),
('efficientnet-b2', 'efficientnet', 'efficientnet-b2', 260),
('efficientnet-b3', 'efficientnet', 'efficientnet-b3', 300),
('efficientnet-b4', 'efficientnet', 'efficientnet-b4', 380),
('efficientnet-b5', 'efficientnet', 'efficientnet-b5', 456),
('efficientnet-b6', 'efficientnet', 'efficientnet-b6', 528),
('efficientnet-b7', 'efficientnet', 'efficientnet-b7', 600),
('resnet', 'resnet', '', None),
)
  def test_get_model_size(self, model, model_name, expected):
    """Tests that the expected image size is derived from the model config."""
config = base_configs.ExperimentConfig(
model_name=model,
model=base_configs.ModelConfig(
model_params={
'model_name': model_name,
},
)
)
size = classifier_trainer.get_image_size_from_model(config)
self.assertEqual(size, expected)

  @parameterized.named_parameters(
('dynamic', 'dynamic', None, 'dynamic'),
('scalar', 128., None, 128.),
('float32', None, 'float32', 1),
('float16', None, 'float16', 128),
)
  def test_get_loss_scale(self, loss_scale, dtype, expected):
    """Tests get_loss_scale over loss_scale/dtype combinations."""
config = base_configs.ExperimentConfig(
model=base_configs.ModelConfig(
loss=base_configs.LossConfig(loss_scale=loss_scale)),
train_dataset=dataset_factory.DatasetConfig(dtype=dtype))
ls = classifier_trainer.get_loss_scale(config, fp16_default=128)
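    # Expected mapping exercised above: an explicit 'dynamic' or scalar
    # loss_scale passes through unchanged; otherwise float16 data falls back
    # to fp16_default, and float32 disables scaling (scale of 1).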
self.assertEqual(ls, expected)

  @parameterized.named_parameters(
('float16', 'float16'),
('bfloat16', 'bfloat16')
)
  def test_initialize(self, dtype):
    """Smoke-tests initialize() with a mixed-precision runtime config."""
config = base_configs.ExperimentConfig(
runtime=base_configs.RuntimeConfig(
enable_eager=False,
enable_xla=False,
gpu_threads_enabled=True,
per_gpu_thread_count=1,
gpu_thread_mode='gpu_private',
num_gpus=1,
dataset_num_private_threads=1,
),
train_dataset=dataset_factory.DatasetConfig(dtype=dtype),
model=base_configs.ModelConfig(
loss=base_configs.LossConfig(loss_scale='dynamic')),
)
classifier_trainer.initialize(config)

  def test_resume_from_checkpoint(self):
    """Tests resuming training from the latest checkpoint in model_dir."""
# Set the keras policy
policy = tf.keras.mixed_precision.experimental.Policy('mixed_bfloat16')
tf.keras.mixed_precision.experimental.set_policy(policy)
    # Build and compile the trivial model.
model = get_trivial_model(10)
# Create the checkpoint
model_dir = self.get_temp_dir()
train_epochs = 1
train_steps = 10
ds = get_trivial_data()
callbacks = [
tf.keras.callbacks.ModelCheckpoint(
os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
save_weights_only=True)
]
model.fit(
ds,
callbacks=callbacks,
epochs=train_epochs,
steps_per_epoch=train_steps)
# Test load from checkpoint
clean_model = get_trivial_model(10)
weights_before_load = copy.deepcopy(clean_model.get_weights())
initial_epoch = classifier_trainer.resume_from_checkpoint(
model=clean_model,
model_dir=model_dir,
train_steps=train_steps)
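    # One epoch of 10 steps leaves the optimizer at iteration 10, so the
    # resumed epoch is expected to be 10 // train_steps == 1 (assuming
    # resume_from_checkpoint derives it from the optimizer's iteration count).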
self.assertEqual(initial_epoch, 1)
self.assertNotAllClose(weights_before_load, clean_model.get_weights())
tf.io.gfile.rmtree(model_dir)

  def test_serialize_config(self):
    """Tests that the experiment config is serialized to params.yaml."""
config = base_configs.ExperimentConfig()
model_dir = self.get_temp_dir()
classifier_trainer.serialize_config(params=config, model_dir=model_dir)
saved_params_path = os.path.join(model_dir, 'params.yaml')
self.assertTrue(os.path.exists(saved_params_path))
tf.io.gfile.rmtree(model_dir)


if __name__ == '__main__':
assert tf.version.VERSION.startswith('2.')
tf.test.main()