"vscode:/vscode.git/clone" did not exist on "5511c258cf00f2f247b6b346ba3c82321f10cf5c"
Commit 9a88e415 authored by Hongkun Yu, committed by A. Unique TensorFlower

Internal change

PiperOrigin-RevId: 267007907
parent bd211e3e
......@@ -26,7 +26,6 @@ import tensorflow as tf
from official.resnet.ctl import ctl_common
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import common
from official.vision.image_classification import resnet_imagenet_main
from official.vision.image_classification import resnet_model
from official.utils.flags import core as flags_core
from official.utils.logs import logger
......@@ -246,7 +245,7 @@ def run(flags_obj):
training_accuracy.reset_states()
for step in range(train_steps):
optimizer.lr = resnet_imagenet_main.learning_rate_schedule(
optimizer.lr = common.learning_rate_schedule(
epoch, step, train_steps, flags_obj.batch_size)
time_callback.on_batch_begin(step+epoch*train_steps)
......
......@@ -31,6 +31,41 @@ from official.utils.misc import keras_utils
FLAGS = flags.FLAGS
BASE_LEARNING_RATE = 0.1 # This matches Jing's version.
TRAIN_TOP_1 = 'training_accuracy_top_1'
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]
def learning_rate_schedule(current_epoch,
current_batch,
batches_per_epoch,
batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
provided scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
current_batch: integer, current batch in the current epoch, indexed from 0.
batches_per_epoch: integer, number of steps in an epoch.
batch_size: integer, total batch size.
Returns:
Adjusted learning rate.
"""
initial_lr = BASE_LEARNING_RATE * batch_size / 256
epoch = current_epoch + float(current_batch) / batches_per_epoch
warmup_lr_multiplier, warmup_end_epoch = LR_SCHEDULE[0]
if epoch < warmup_end_epoch:
# Learning rate increases linearly per step.
return initial_lr * warmup_lr_multiplier * epoch / warmup_end_epoch
for mult, start_epoch in LR_SCHEDULE:
if epoch >= start_epoch:
learning_rate = initial_lr * mult
else:
break
return learning_rate
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
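For reference, a minimal sketch of what the schedule above produces, assuming batch_size=256 (so initial_lr works out to BASE_LEARNING_RATE = 0.1) and roughly 5000 batches per epoch; both values are illustrative and not part of this change:

# Illustrative evaluation of learning_rate_schedule at the start of a few epochs.
for epoch in (2, 10, 35, 65, 85):
    print(epoch, learning_rate_schedule(epoch, 0, 5000, 256))
# Expected: 0.04 (linear warmup), 0.1, 0.01, 0.001, 0.0001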
......@@ -172,12 +207,13 @@ def get_optimizer(learning_rate=0.1):
return gradient_descent_v2.SGD(learning_rate=learning_rate, momentum=0.9)
def get_callbacks(learning_rate_schedule_fn, num_images):
# TODO(hongkuny,haoyuzhang): make cifar model use_tensor_lr to clean up code.
def get_callbacks(learning_rate_schedule_fn=None, num_images=None):
"""Returns common callbacks."""
time_callback = keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)
callbacks = [time_callback]
if not FLAGS.use_tensor_lr:
if not FLAGS.use_tensor_lr and learning_rate_schedule_fn:
lr_callback = LearningRateBatchScheduler(
learning_rate_schedule_fn,
batch_size=FLAGS.batch_size,
......@@ -312,6 +348,9 @@ def define_keras_flags(dynamic_loss_scale=True):
flags.DEFINE_boolean(
name='enable_get_next_as_optional', default=False,
help='Enable get_next_as_optional behavior in DistributedIterator.')
flags.DEFINE_boolean(
name='enable_checkpoint_and_export', default=False,
help='Whether to enable a checkpoint callback and export the savedmodel.')
def get_synth_input_fn(height, width, num_channels, num_classes,
......@@ -346,7 +385,6 @@ def get_synth_input_fn(height, width, num_channels, num_classes,
mean=127,
stddev=60,
name='synthetic_inputs')
labels = tf.random.uniform([1],
minval=0,
maxval=num_classes - 1,
......
......@@ -18,6 +18,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import app
from absl import flags
from absl import logging
......@@ -33,42 +35,6 @@ from official.vision.image_classification import common
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import resnet_model
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]
def learning_rate_schedule(current_epoch,
current_batch,
batches_per_epoch,
batch_size):
"""Handles linear scaling rule, gradual warmup, and LR decay.
Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
provided scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
current_batch: integer, current batch in the current epoch, indexed from 0.
batches_per_epoch: integer, number of steps in an epoch.
batch_size: integer, total batch size.
Returns:
Adjusted learning rate.
"""
initial_lr = common.BASE_LEARNING_RATE * batch_size / 256
epoch = current_epoch + float(current_batch) / batches_per_epoch
warmup_lr_multiplier, warmup_end_epoch = LR_SCHEDULE[0]
if epoch < warmup_end_epoch:
# Learning rate increases linearly per step.
return initial_lr * warmup_lr_multiplier * epoch / warmup_end_epoch
for mult, start_epoch in LR_SCHEDULE:
if epoch >= start_epoch:
learning_rate = initial_lr * mult
else:
break
return learning_rate
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using native Keras APIs.
......@@ -94,7 +60,7 @@ def run(flags_obj):
common.set_cudnn_batchnorm_mode()
dtype = flags_core.get_tf_dtype(flags_obj)
if dtype == 'float16':
if dtype == tf.float16:
loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
'mixed_float16', loss_scale=loss_scale)
......@@ -175,9 +141,9 @@ def run(flags_obj):
lr_schedule = common.PiecewiseConstantDecayWithWarmup(
batch_size=flags_obj.batch_size,
epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
warmup_epochs=LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in LR_SCHEDULE),
warmup_epochs=common.LR_SCHEDULE[0][1],
boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in common.LR_SCHEDULE),
compute_lr_on_cpu=True)
with strategy_scope:
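For clarity only: with common.LR_SCHEDULE as added above, the keyword arguments in this call resolve to the values below (a restatement of the code, not new behavior):

# common.LR_SCHEDULE = [(1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)], so:
#   warmup_epochs -> 5
#   boundaries    -> [30, 60, 80]
#   multipliers   -> [1.0, 0.1, 0.01, 0.001]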
......@@ -218,8 +184,11 @@ def run(flags_obj):
run_eagerly=flags_obj.run_eagerly)
callbacks = common.get_callbacks(
learning_rate_schedule, imagenet_preprocessing.NUM_IMAGES['train'])
common.learning_rate_schedule, imagenet_preprocessing.NUM_IMAGES['train'])
if flags_obj.enable_checkpoint_and_export:
ckpt_full_path = os.path.join(flags_obj.model_dir, 'model.ckpt-{epoch:04d}')
callbacks.append(tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
save_weights_only=True))
train_steps = (
imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
train_epochs = flags_obj.train_epochs
......@@ -257,6 +226,10 @@ def run(flags_obj):
validation_data=validation_data,
validation_freq=flags_obj.epochs_between_evals,
verbose=2)
if flags_obj.enable_checkpoint_and_export:
# Keras model.save assumes a float32 input signature.
export_path = os.path.join(flags_obj.model_dir, 'saved_model')
model.save(export_path, include_optimizer=False)
eval_output = None
if not flags_obj.skip_eval:
......
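A minimal sketch of reloading the artifacts written by the new enable_checkpoint_and_export path; the model_dir value and the epoch number are placeholders, and the reload itself is illustrative rather than part of this change:

import os
import tensorflow as tf

model_dir = '/tmp/keras_imagenet'  # placeholder for whatever --model_dir was used

# Full float32 SavedModel written by model.save(..., include_optimizer=False).
restored = tf.keras.models.load_model(os.path.join(model_dir, 'saved_model'))

# Per-epoch weights written by the ModelCheckpoint callback; the
# 'model.ckpt-{epoch:04d}' pattern expands to e.g. 'model.ckpt-0001'.
restored.load_weights(os.path.join(model_dir, 'model.ckpt-0001'))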
......@@ -18,19 +18,16 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tempfile
import tensorflow as tf
from tensorflow.python.eager import context
from tensorflow.python.platform import googletest
from official.utils.misc import keras_utils
from official.utils.testing import integration
from official.vision.image_classification import imagenet_preprocessing
from official.vision.image_classification import resnet_imagenet_main
class KerasImagenetTest(googletest.TestCase):
class KerasImagenetTest(tf.test.TestCase):
"""Unit tests for Keras ResNet with ImageNet."""
_extra_flags = [
......@@ -40,11 +37,6 @@ class KerasImagenetTest(googletest.TestCase):
]
_tempdir = None
def get_temp_dir(self):
if not self._tempdir:
self._tempdir = tempfile.mkdtemp(dir=googletest.GetTempDir())
return self._tempdir
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasImagenetTest, cls).setUpClass()
......@@ -65,7 +57,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-distribution_strategy", "off",
"-model_dir", "keras_imagenet_no_dist_strat",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
......@@ -81,7 +72,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-enable_eager", "false",
"-distribution_strategy", "off",
"-model_dir", "keras_imagenet_graph_no_dist_strat",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
......@@ -105,8 +95,8 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-num_gpus", "1",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_1_gpu",
"-data_format", "channels_last",
"-enable_checkpoint_and_export", "1",
]
extra_flags = extra_flags + self._extra_flags
......@@ -130,7 +120,6 @@ class KerasImagenetTest(googletest.TestCase):
"-num_gpus", "1",
"-dtype", "fp16",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_1_gpu",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
......@@ -141,27 +130,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags=extra_flags
)
def test_end_to_end_graph_1_gpu(self):
"""Test Keras model in legacy graph mode with 1 GPU."""
if context.num_gpus() < 1:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(1, context.num_gpus()))
extra_flags = [
"-num_gpus", "1",
"-enable_eager", "false",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_graph_1_gpu",
"-data_format", "channels_last",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
def test_end_to_end_2_gpu(self):
"""Test Keras model with 2 GPUs."""
......@@ -176,7 +144,6 @@ class KerasImagenetTest(googletest.TestCase):
extra_flags = [
"-num_gpus", "2",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
......@@ -200,7 +167,6 @@ class KerasImagenetTest(googletest.TestCase):
"-num_gpus", "2",
"-enable_xla", "true",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_xla_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
......@@ -224,7 +190,6 @@ class KerasImagenetTest(googletest.TestCase):
"-num_gpus", "2",
"-dtype", "fp16",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_2_gpu_fp16",
]
extra_flags = extra_flags + self._extra_flags
......@@ -249,50 +214,6 @@ class KerasImagenetTest(googletest.TestCase):
"-dtype", "fp16",
"-enable_xla", "true",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_xla_2_gpu_fp16",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
def test_end_to_end_graph_2_gpu(self):
"""Test Keras model in legacy graph mode with 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(2, context.num_gpus()))
extra_flags = [
"-num_gpus", "2",
"-enable_eager", "false",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_graph_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
def test_end_to_end_graph_xla_2_gpu(self):
"""Test Keras model in legacy graph mode with XLA and 2 GPUs."""
if context.num_gpus() < 2:
self.skipTest(
"{} GPUs are not available for this test. {} GPUs are available".
format(2, context.num_gpus()))
extra_flags = [
"-num_gpus", "2",
"-enable_eager", "false",
"-enable_xla", "true",
"-distribution_strategy", "default",
"-model_dir", "keras_imagenet_graph_xla_2_gpu",
]
extra_flags = extra_flags + self._extra_flags
......@@ -305,4 +226,4 @@ class KerasImagenetTest(googletest.TestCase):
if __name__ == "__main__":
tf.compat.v1.enable_v2_behavior()
googletest.main()
tf.test.main()