Commit 05631eec authored by liangjing's avatar liangjing
Browse files

version 1

parent 7e0391d9
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common util functions and classes used by both keras cifar and imagenet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import os
import sys
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.mlp_log import mlp_log
import imagenet_preprocessing
import lars_optimizer
import lars_util
from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
# Shared absl flag container used by the helpers in this module.
FLAGS = flags.FLAGS
# BASE_LEARNING_RATE = 0.1 # This matches Jing's version.
# Key under which build_stats() reports the final top-1 training accuracy.
TRAIN_TOP_1 = 'training_accuracy_top_1'
# Learning-rate schedule. The first entry is treated specially by
# learning_rate_schedule() and get_optimizer(): its epoch marks the end of the
# warmup phase. Every later entry scales the batch-size-adjusted base learning
# rate by `multiplier` from `epoch to start` onwards.
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
]
def learning_rate_schedule(current_epoch,
                           current_batch,
                           steps_per_epoch,
                           batch_size):
  """Computes the learning rate for one (epoch, batch) position.

  The base rate (`FLAGS.base_learning_rate`) is scaled linearly with
  `batch_size / 256`. Before the warmup end epoch given by the first entry of
  LR_SCHEDULE the rate ramps up linearly per step; afterwards it is the scaled
  base rate times the multiplier of the last LR_SCHEDULE entry whose start
  epoch has been reached.

  Args:
    current_epoch: integer, current epoch indexed from 0.
    current_batch: integer, current batch in the current epoch, indexed from 0.
    steps_per_epoch: integer, number of steps in an epoch.
    batch_size: integer, total batch size.

  Returns:
    Adjusted learning rate.
  """
  scaled_base_lr = FLAGS.base_learning_rate * batch_size / 256
  # Fractional epoch, so that warmup advances on every step.
  position = current_epoch + float(current_batch) / steps_per_epoch

  warmup_multiplier, warmup_end = LR_SCHEDULE[0]
  if position < warmup_end:
    return scaled_base_lr * warmup_multiplier * position / warmup_end

  # Entries are ordered by start epoch: stop at the first one not yet reached.
  for multiplier, start_epoch in LR_SCHEDULE:
    if position < start_epoch:
      break
    learning_rate = scaled_base_lr * multiplier
  return learning_rate
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
  """Keras callback that refreshes the learning rate before every batch.

  N.B. Only support Keras optimizers, not TF optimizers.

  Attributes:
    schedule: a function called as
      `schedule(epoch, batch, steps_per_epoch, batch_size)` (epoch and batch
      are integers indexed from 0) that returns the new learning rate as a
      float.
    steps_per_epoch: value forwarded to `schedule` on every call.
    batch_size: value forwarded to `schedule` on every call.
    epochs: index of the current epoch; -1 until the first epoch begins.
    prev_lr: last learning rate written to the optimizer; -1 initially.
  """

  def __init__(self, schedule, batch_size, steps_per_epoch):
    super(LearningRateBatchScheduler, self).__init__()
    self.schedule = schedule
    self.batch_size = batch_size
    self.steps_per_epoch = steps_per_epoch
    self.prev_lr = -1
    self.epochs = -1

  def on_epoch_begin(self, epoch, logs=None):
    """Checks the optimizer and advances the internal epoch counter."""
    optimizer = self.model.optimizer
    if not hasattr(optimizer, 'learning_rate'):
      raise ValueError('Optimizer must have a "learning_rate" attribute.')
    # The callback keeps its own counter instead of using the `epoch` argument.
    self.epochs += 1

  def on_batch_begin(self, batch, logs=None):
    """Executes before step begins."""
    new_lr = self.schedule(
        self.epochs, batch, self.steps_per_epoch, self.batch_size)
    if not isinstance(new_lr, (float, np.float32, np.float64)):
      raise ValueError('The output of the "schedule" function should be float.')
    if new_lr == self.prev_lr:
      # Unchanged: skip the optimizer update.
      return
    self.model.optimizer.learning_rate = new_lr  # new_lr is a float here
    self.prev_lr = new_lr
    tf.compat.v1.logging.debug(
        'Epoch %05d Batch %05d: LearningRateBatchScheduler '
        'change learning rate to %s.', self.epochs, batch, new_lr)
class PiecewiseConstantDecayWithWarmup(
    tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning-rate schedule: linear warmup, then piecewise-constant decay."""

  def __init__(self, batch_size, steps_per_epoch, warmup_epochs, boundaries,
               multipliers, compute_lr_on_cpu=True, name=None):
    """Builds the schedule.

    Args:
      batch_size: batch size used to rescale `FLAGS.base_learning_rate`
        relative to a reference batch size of 256.
      steps_per_epoch: number of steps per epoch; converts epochs to steps.
      warmup_epochs: number of epochs of linear warmup.
      boundaries: epochs at which the multiplier changes.
      multipliers: learning-rate multipliers; must hold exactly one more
        element than `boundaries`.
      compute_lr_on_cpu: if True, graph-mode learning-rate ops are placed on
        '/device:CPU:0'.
      name: optional name for the name scope of the created ops.

    Raises:
      ValueError: if `len(boundaries) != len(multipliers) - 1`.
    """
    super(PiecewiseConstantDecayWithWarmup, self).__init__()
    if len(multipliers) - 1 != len(boundaries):
      raise ValueError('The length of boundaries must be 1 less than the '
                       'length of multipliers')
    reference_batch_size = 256
    self.steps_per_epoch = steps_per_epoch
    self.rescaled_lr = (
        FLAGS.base_learning_rate * batch_size / reference_batch_size)
    self.step_boundaries = [
        float(self.steps_per_epoch) * epoch for epoch in boundaries]
    self.lr_values = [self.rescaled_lr * scale for scale in multipliers]
    self.warmup_steps = warmup_epochs * self.steps_per_epoch
    self.compute_lr_on_cpu = compute_lr_on_cpu
    self.name = name
    # Maps a graph to the learning-rate op already built for it.
    self.learning_rate_ops_cache = {}

  def __call__(self, step):
    """Returns the learning rate for `step`, reusing cached graph ops."""
    if tf.executing_eagerly():
      return self._get_learning_rate(step)

    # In an eager function or graph, the optimizer calls the schedule
    # repeatedly and would create new ops each time. To avoid this, the op is
    # built once per graph and cached.
    cache = self.learning_rate_ops_cache
    graph = tf.compat.v1.get_default_graph()
    if graph not in cache:
      if self.compute_lr_on_cpu:
        with tf.device('/device:CPU:0'):
          cache[graph] = self._get_learning_rate(step)
      else:
        cache[graph] = self._get_learning_rate(step)
    return cache[graph]

  def _get_learning_rate(self, step):
    """Compute learning rate at given step."""
    scope_values = [self.rescaled_lr, self.step_boundaries, self.lr_values,
                    self.warmup_steps, self.compute_lr_on_cpu]
    with tf.compat.v1.name_scope(
        self.name, 'PiecewiseConstantDecayWithWarmup', scope_values):

      def warmup_lr():
        # Linear ramp from 0 towards rescaled_lr over warmup_steps.
        progress = (tf.cast(step, tf.float32) /
                    tf.cast(self.warmup_steps, tf.float32))
        return self.rescaled_lr * progress

      def piecewise_lr():
        return tf.compat.v1.train.piecewise_constant(
            step, self.step_boundaries, self.lr_values)

      return tf.cond(step < self.warmup_steps, warmup_lr, piecewise_lr)

  def get_config(self):
    """Returns the derived schedule values as a dictionary."""
    # NOTE(review): these keys are derived values, not the __init__ arguments,
    # so the result cannot be passed back to the constructor as keyword
    # arguments. Confirm whether from_config round-tripping is needed.
    config = {}
    config['rescaled_lr'] = self.rescaled_lr
    config['step_boundaries'] = self.step_boundaries
    config['lr_values'] = self.lr_values
    config['warmup_steps'] = self.warmup_steps
    config['compute_lr_on_cpu'] = self.compute_lr_on_cpu
    config['name'] = self.name
    return config
def get_optimizer(flags_obj,
                  steps_per_epoch,
                  train_steps):
  """Returns the optimizer to use together with its per-batch schedule function.

  Args:
    flags_obj: flag container. Reads `lr_schedule`, `optimizer`, `batch_size`,
      `base_learning_rate`, `momentum` and, depending on the branch taken,
      `model`, `end_learning_rate`, `warmup_epochs`, `weight_decay`,
      `lars_epsilon` and the mobilenet SGD hyper-parameter flags.
    steps_per_epoch: number of training steps per epoch.
    train_steps: total number of training steps (used by the polynomial
      schedule only).

  Returns:
    A tuple `(optimizer, learning_rate_schedule_fn)`. `optimizer` is None when
    neither the model nor the optimizer name matches a known preset.
    `learning_rate_schedule_fn` is `learning_rate_schedule` for the resnet
    path and None otherwise.

  Raises:
    ValueError: if `flags_obj.lr_schedule` is not one of 'polynomial',
      'piecewise' or 'constant' on the resnet path.
  """
  optimizer = None
  learning_rate_schedule_fn = None

  # The resnet path is also the default when no 'model' flag is defined.
  if (get_flag_module(flags_obj, 'model') is None or
      flags_obj.model == 'resnet50_v1.5'):
    schedule_name = flags_obj.lr_schedule
    if schedule_name == 'polynomial':
      lr_schedule = lars_util.PolynomialDecayWithWarmup(
          batch_size=flags_obj.batch_size,
          steps_per_epoch=steps_per_epoch,
          train_steps=train_steps,
          initial_learning_rate=flags_obj.base_learning_rate,
          end_learning_rate=flags_obj.end_learning_rate,
          warmup_epochs=flags_obj.warmup_epochs)
    elif schedule_name == 'piecewise':
      # LR_SCHEDULE[0] describes the warmup; the remaining entries give the
      # decay boundaries.
      lr_schedule = PiecewiseConstantDecayWithWarmup(
          batch_size=flags_obj.batch_size,
          steps_per_epoch=steps_per_epoch,
          warmup_epochs=LR_SCHEDULE[0][1],
          boundaries=[start for _, start in LR_SCHEDULE[1:]],
          multipliers=[mult for mult, _ in LR_SCHEDULE],
          compute_lr_on_cpu=True)
    elif schedule_name == 'constant':
      # Static rate, scaled linearly with the batch size (reference: 256).
      lr_schedule = flags_obj.base_learning_rate * flags_obj.batch_size / 256
    else:
      raise ValueError('lr_schedule "%s" is unknown.' % flags_obj.lr_schedule)

    if flags_obj.optimizer == 'SGD':
      # With the 'constant' schedule, get_callbacks() installs a callback that
      # overwrites this learning rate at the beginning of each step.
      optimizer = gradient_descent_v2.SGD(
          learning_rate=lr_schedule, momentum=flags_obj.momentum)
    elif flags_obj.optimizer == 'LARS':
      optimizer = lars_optimizer.LARSOptimizer(
          learning_rate=lr_schedule,
          momentum=flags_obj.momentum,
          weight_decay=flags_obj.weight_decay,
          skip_list=['batch_normalization', 'bias', 'bn'],
          epsilon=flags_obj.lars_epsilon)
    learning_rate_schedule_fn = learning_rate_schedule

  elif flags_obj.model == 'mobilenet':
    initial_learning_rate = (
        flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size)
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate,
            decay_steps=steps_per_epoch * flags_obj.num_epochs_per_decay,
            decay_rate=flags_obj.lr_decay_factor,
            staircase=True),
        momentum=flags_obj.momentum)

  return optimizer, learning_rate_schedule_fn
# TODO(hongkuny,haoyuzhang): make cifar model use_tensor_lr to clean up code.
def get_callbacks(
    steps_per_epoch,
    learning_rate_schedule_fn=None,
    pruning_method=None,
    enable_checkpoint_and_export=False,
    model_dir=None):
  """Assembles the list of Keras callbacks shared by the training scripts.

  Args:
    steps_per_epoch: number of training steps per epoch.
    learning_rate_schedule_fn: optional schedule function. A
      LearningRateBatchScheduler is added only when this is set and
      `FLAGS.lr_schedule` is 'constant'.
    pruning_method: pruning callbacks are added when this is not None.
    enable_checkpoint_and_export: whether to add a weights-only
      ModelCheckpoint callback (requires `model_dir`).
    model_dir: directory for pruning summaries and checkpoints.

  Returns:
    A list of callbacks, starting with a `keras_utils.TimeHistory`.
  """
  callbacks = [
      keras_utils.TimeHistory(
          FLAGS.batch_size,
          FLAGS.log_steps,
          logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None)
  ]

  if FLAGS.lr_schedule == 'constant' and learning_rate_schedule_fn:
    callbacks.append(
        LearningRateBatchScheduler(
            learning_rate_schedule_fn,
            batch_size=FLAGS.batch_size,
            steps_per_epoch=steps_per_epoch))

  if FLAGS.enable_tensorboard:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  if FLAGS.profile_steps:
    callbacks.append(
        keras_utils.get_profiler_callback(
            FLAGS.model_dir,
            FLAGS.profile_steps,
            FLAGS.enable_tensorboard,
            steps_per_epoch))

  if pruning_method is not None:
    # NOTE(review): `tfmot` is not imported among this file's visible imports;
    # as written this branch raises NameError. Presumably
    # `import tensorflow_model_optimization as tfmot` is missing -- confirm.
    callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
    if model_dir is not None:
      callbacks.append(
          tfmot.sparsity.keras.PruningSummaries(
              log_dir=model_dir, profile_batch=0))

  if enable_checkpoint_and_export and model_dir is not None:
    checkpoint_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
    callbacks.append(
        tf.keras.callbacks.ModelCheckpoint(
            checkpoint_path, save_weights_only=True))

  return callbacks
def build_stats(history, eval_output, callbacks):
  """Collects training/eval results into one flat dictionary.

  Args:
    history: Results of the training step. Supports both categorical_accuracy
      and sparse_categorical_accuracy.
    eval_output: Output of the eval step. Assumes first value is eval_loss and
      second value is accuracy_top_1.
    callbacks: a list of callbacks which might include a time history callback
      used during keras.fit.

  Returns:
    Dictionary of normalized results. Loss values and training accuracy are
    only included when `FLAGS.report_accuracy_metrics` is set.
  """
  stats = {}

  if eval_output:
    stats['accuracy_top_1'] = float(eval_output[1])
    if FLAGS.report_accuracy_metrics:
      stats['eval_loss'] = float(eval_output[0])

  if history and history.history and FLAGS.report_accuracy_metrics:
    train_hist = history.history
    # Final loss from training.
    stats['loss'] = float(train_hist['loss'][-1])
    # Top-1 training accuracy: the dense metric name takes precedence.
    for metric in ('categorical_accuracy', 'sparse_categorical_accuracy'):
      if metric in train_hist:
        stats[TRAIN_TOP_1] = float(train_hist[metric][-1])
        break

  # Pull timing information from any time history callback used in keras.fit.
  for callback in callbacks or ():
    if not isinstance(callback, keras_utils.TimeHistory):
      continue
    stats['step_timestamp_log'] = callback.timestamp_log
    stats['train_finish_time'] = callback.train_finish_time
    if callback.epoch_runtime_log:
      stats['avg_exp_per_second'] = callback.average_examples_per_second

  return stats
def define_keras_flags(
    dynamic_loss_scale=True,
    model=False,
    optimizer=False,
    pretrained_filepath=False):
  """Define flags for Keras models.

  Registers the shared base/performance/image/benchmark/distribution flags
  from `flags_core`, the Keras-specific flags below, and the LARS flags from
  `lars_util`.

  Args:
    dynamic_loss_scale: forwarded to `flags_core.define_performance`.
    model: if True, also define the 'model' preset flag.
    optimizer: if True, also define the SGD hyper-parameter flags used with
      mobilenet.
    pretrained_filepath: if True, also define the 'pretrained_filepath' flag.
  """
  flags_core.define_base(clean=True, num_gpu=True, run_eagerly=True,
                         train_epochs=True, epochs_between_evals=True,
                         distribution_strategy=True)
  flags_core.define_performance(num_parallel_calls=False,
                                synthetic_data=True,
                                dtype=True,
                                all_reduce_alg=True,
                                num_packs=True,
                                tf_gpu_thread_mode=True,
                                datasets_num_private_threads=True,
                                dynamic_loss_scale=dynamic_loss_scale,
                                loss_scale=True,
                                fp16_implementation=True,
                                tf_data_experimental_slack=True,
                                enable_xla=True,
                                force_v2_in_keras_compile=True,
                                training_dataset_cache=True,
                                training_prefetch_batchs=True,
                                eval_dataset_cache=True,
                                eval_prefetch_batchs=True)
  flags_core.define_image()
  flags_core.define_benchmark()
  flags_core.define_distribution()
  flags.adopt_module_key_flags(flags_core)
  flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
  flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
  # TODO(b/135607288): Remove this flag once we understand the root cause of
  # slowdown when setting the learning phase in Keras backend.
  flags.DEFINE_boolean(
      name='set_learning_phase_to_train', default=True,
      help='If skip eval, also set Keras learning phase to 1 (training).')
  flags.DEFINE_boolean(
      name='explicit_gpu_placement', default=False,
      help='If not using distribution strategy, explicitly set device scope '
      'for the Keras training loop.')
  flags.DEFINE_boolean(name='use_trivial_model', default=False,
                       help='Whether to use a trivial Keras model.')
  flags.DEFINE_boolean(name='report_accuracy_metrics', default=True,
                       help='Report metrics during training and evaluation.')
  flags.DEFINE_string(
      name='lr_schedule', default='piecewise',
      help='learning rate schedule. '
      '"piecewise" for PiecewiseConstantDecayWithWarmup, '
      '"polynomial" for PolynomialDecayWithWarmup, '
      'and "constant" for static learning rate.')
  flags.DEFINE_boolean(
      name='enable_tensorboard', default=False,
      help='Whether to enable Tensorboard callback.')
  flags.DEFINE_integer(
      name='train_steps', default=None,
      help='The number of steps to run for training. If it is larger than '
      '# batches per epoch, then use # batches per epoch. This flag will be '
      'ignored if train_epochs is set to be larger than 1. ')
  flags.DEFINE_string(
      name='profile_steps', default=None,
      help='Save profiling data to model dir at given range of global steps. The '
      'value must be a comma separated pair of positive integers, specifying '
      'the first and last step to profile. For example, "--profile_steps=2,4" '
      'triggers the profiler to process 3 steps, starting from the 2nd step. '
      'Note that profiler has a non-trivial performance overhead, and the '
      'output file can be gigantic if profiling many steps.')
  flags.DEFINE_boolean(
      name='batchnorm_spatial_persistent', default=True,
      help='Enable the spacial persistent mode for CuDNN batch norm kernel.')
  flags.DEFINE_boolean(
      name='enable_get_next_as_optional', default=False,
      help='Enable get_next_as_optional behavior in DistributedIterator.')
  flags.DEFINE_boolean(
      name='enable_checkpoint_and_export', default=False,
      help='Whether to enable a checkpoint callback and export the savedmodel.')
  flags.DEFINE_string(
      name='tpu', default='', help='TPU address to connect to.')
  flags.DEFINE_string(
      name='tpu_zone', default='', help='Zone in which the TPU resides.')
  flags.DEFINE_integer(
      name='steps_per_loop',
      default=500,
      help='Number of steps per training loop. Only training step happens '
      'inside the loop. Callbacks will not be called inside. Will be capped at '
      'steps per epoch.')
  flags.DEFINE_boolean(
      name='use_tf_while_loop',
      default=True,
      help='Whether to build a tf.while_loop inside the training loop on the '
      'host. Setting it to True is critical to have peak performance on '
      'TPU.')
  flags.DEFINE_boolean(
      name='use_tf_keras_layers', default=False,
      help='Whether to use tf.keras.layers instead of tf.python.keras.layers.'
      'It only changes imagenet resnet model layers for now. This flag is '
      'a temporal flag during transition to tf.keras.layers. Do not use this '
      'flag for external usage. this will be removed shortly.')
  flags.DEFINE_float(
      'base_learning_rate', 0.1,
      'Base learning rate. '
      'This is the learning rate when using batch size 256; when using other '
      'batch sizes, the learning rate will be scaled linearly.')
  flags.DEFINE_string(
      'optimizer', 'SGD',
      'Name of optimizer preset. (SGD, LARS)')
  flags.DEFINE_boolean(
      'drop_train_remainder', True,
      'Whether to drop remainder in the training dataset.')
  flags.DEFINE_boolean(
      'drop_eval_remainder', False,
      'Whether to drop remainder in the eval dataset.')
  flags.DEFINE_float(
      'label_smoothing', 0.0,
      'Apply label smoothing to the loss. This applies to '
      'categorical_cross_entropy; when label_smoothing > 0, '
      'one-hot encoding is used for the labels.')
  flags.DEFINE_integer(
      'num_classes', 1000,
      'Number of classes for labels, at least 2.')
  flags.DEFINE_integer(
      'eval_offset_epochs', 0,
      'Epoch number of the first evaluation.')
  lars_util.define_lars_flags()
  # The remaining flags are opt-in so that callers only register what their
  # model needs.
  if model:
    flags.DEFINE_string('model', 'resnet50_v1.5',
                        'Name of model preset. (mobilenet, resnet50_v1.5)')
  if optimizer:
    # TODO(kimjaehong): Replace as general hyper-params not only for mobilenet.
    flags.DEFINE_float('initial_learning_rate_per_sample', 0.00007,
                       'Initial value of learning rate per sample for '
                       'SGD optimizer when using mobilenet.')
    flags.DEFINE_float('lr_decay_factor', 0.94,
                       'Learning rate decay factor for SGD optimizer '
                       'when using mobilenet.')
    flags.DEFINE_float('num_epochs_per_decay', 2.5,
                       'Number of epochs per decay for SGD optimizer '
                       'when using mobilenet.')
  if pretrained_filepath:
    flags.DEFINE_string('pretrained_filepath', '',
                        'Pretrained file path.')
  flags.DEFINE_float('target_accuracy', 0.759,
                     'Target eval accuracy, after which training will stop.')
def get_synth_data(height, width, num_channels, num_classes, dtype):
  """Creates a set of synthetic random data.

  Args:
    height: Integer height that will be used to create a fake image tensor.
    width: Integer width that will be used to create a fake image tensor.
    num_channels: Integer depth that will be used to create a fake image tensor.
    num_classes: Number of classes that should be represented in the fake labels
      tensor
    dtype: Data type for features/images.

  Returns:
    A tuple of tensors representing the inputs and labels. `inputs` has shape
    [height, width, num_channels]; `labels` is an int32 tensor of shape [1]
    with a value in [0, num_classes).
  """
  # Synthetic input should be within [0, 255].
  inputs = tf.random.truncated_normal([height, width, num_channels],
                                      dtype=dtype,
                                      mean=127,
                                      stddev=60,
                                      name='synthetic_inputs')
  # `maxval` is exclusive for tf.random.uniform, so pass num_classes itself to
  # make every class index 0..num_classes-1 reachable.
  labels = tf.random.uniform([1],
                             minval=0,
                             maxval=num_classes,
                             dtype=tf.int32,
                             name='synthetic_labels')
  return inputs, labels
def define_pruning_flags():
  """Registers the absl flags that configure model pruning."""
  flags.DEFINE_string(
      name='pruning_method',
      default=None,
      help='Pruning method.'
      'None (no pruning) or polynomial_decay.')
  flags.DEFINE_float(
      name='pruning_initial_sparsity',
      default=0.0,
      help='Initial sparsity for pruning.')
  flags.DEFINE_float(
      name='pruning_final_sparsity',
      default=0.5,
      help='Final sparsity for pruning.')
  flags.DEFINE_integer(
      name='pruning_begin_step',
      default=0,
      help='Begin step for pruning.')
  flags.DEFINE_integer(
      name='pruning_end_step',
      default=100000,
      help='End step for pruning.')
  flags.DEFINE_integer(
      name='pruning_frequency',
      default=100,
      help='Frequency for pruning.')
def get_synth_input_fn(height, width, num_channels, num_classes,
                       dtype=tf.float32, drop_remainder=True):
  """Returns an input function that returns a dataset with random data.

  This input_fn returns a data set that iterates over a set of random data and
  bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
  copy is still included. This is used to find the upper throughput bound when
  tuning the full input pipeline.

  Args:
    height: Integer height that will be used to create a fake image tensor.
    width: Integer width that will be used to create a fake image tensor.
    num_channels: Integer depth that will be used to create a fake image tensor.
    num_classes: Number of classes that should be represented in the fake labels
      tensor
    dtype: Data type for features/images.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.

  Returns:
    An input_fn that can be used in place of a real one to return a dataset
    that can be used for iteration.
  """
  # pylint: disable=unused-argument
  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
    """Returns dataset filled with random data."""
    inputs, labels = get_synth_data(height=height,
                                    width=width,
                                    num_channels=num_channels,
                                    num_classes=num_classes,
                                    dtype=dtype)
    if FLAGS.label_smoothing and FLAGS.label_smoothing > 0:
      # Label smoothing needs dense targets: one-hot, flattened to
      # [num_classes].
      labels = tf.reshape(tf.one_hot(labels, num_classes), [num_classes])
    # Labels are always handed to Keras as float32. One cast is enough; it
    # leaves the already-float one-hot labels unchanged.
    labels = tf.cast(labels, dtype=tf.float32)
    # A single (inputs, labels) element repeated forever.
    data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
    # `drop_remainder` will make dataset produce outputs with known shapes.
    data = data.batch(batch_size, drop_remainder=drop_remainder)
    data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return data

  return input_fn
def set_cudnn_batchnorm_mode():
  """Toggles the CuDNN spatial-persistent batchnorm environment variable.

  When `FLAGS.batchnorm_spatial_persistent` is set, the variable
  TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT is set to '1'; otherwise it is
  removed from the environment if present.

  Note: Spatial Persistent mode may lead to accuracy losses for certain
  models.
  """
  env_key = 'TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'
  if not FLAGS.batchnorm_spatial_persistent:
    os.environ.pop(env_key, None)
    return
  os.environ[env_key] = '1'
def print_flags(flags_obj):
  """Logs every flag of `flags_obj`, grouped by defining module.

  Modules are logged in sorted order, except that the main module
  (`sys.argv[0]`) is moved to the front when it defines flags.

  Args:
    flags_obj: object exposing `flags_by_module_dict()`, which maps a module
      name to an iterable of flag objects with `name` and `value` attributes.
  """
  flags_by_module = flags_obj.flags_by_module_dict()
  modules = sorted(flags_by_module)

  main_module = sys.argv[0]
  if main_module in modules:
    modules.remove(main_module)
    modules.insert(0, main_module)

  # All modules are logged; the former module-name filter was disabled and
  # its dead bookkeeping has been dropped.
  for module in modules:
    logging.info('Module %s:', module)
    for flag in flags_by_module[module]:
      logging.info('\t flags_obj.%s = %s', flag.name, flag.value)
def get_flag_module(flags_obj, flag):
  """Get which module a flag is defined in.

  Args:
    flags_obj: object exposing `flags_by_module_dict()`, which maps a module
      name to the flags defined by that module.
    flag: either a flag object or a flag name (string).

  Returns:
    The name of the first module (in sorted order) that defines `flag`, or
    None if no module defines it.
  """
  flags_by_module = flags_obj.flags_by_module_dict()
  for module in sorted(flags_by_module):
    for candidate in flags_by_module[module]:
      # Callers in this file pass a flag *name*, while the per-module lists
      # hold flag objects (presumably not equal to their name string), so the
      # `name` attribute is compared as well as the entry itself.
      if (candidate is flag or candidate == flag or
          getattr(candidate, 'name', None) == flag):
        return module
  return None
def get_num_train_iterations(flags_obj):
  """Returns the number of steps per epoch and the number of train epochs.

  Args:
    flags_obj: flag container. Reads `drop_train_remainder`, `batch_size`,
      `train_epochs`, `train_steps`, `eval_offset_epochs` and
      `epochs_between_evals`.

  Returns:
    A tuple `(steps_per_epoch, train_epochs)`.
  """
  num_train_images = imagenet_preprocessing.NUM_IMAGES['train']
  if flags_obj.drop_train_remainder:
    steps_per_epoch = num_train_images // flags_obj.batch_size
  else:
    # Keep the final partial batch.
    steps_per_epoch = math.ceil(
        1.0 * num_train_images / flags_obj.batch_size)

  requested_epochs = flags_obj.train_epochs
  # The train_steps flag only applies to single-epoch runs; with multiple
  # epochs it is ignored.
  if requested_epochs <= 1 and flags_obj.train_steps:
    return min(flags_obj.train_steps, steps_per_epoch), 1

  # Round the epoch count up so that training ends on an evaluation boundary.
  offset = flags_obj.eval_offset_epochs
  interval = flags_obj.epochs_between_evals
  eval_rounds = math.ceil((requested_epochs - offset) / interval)
  return steps_per_epoch, offset + eval_rounds * interval
def get_num_eval_steps(flags_obj):
  """Returns the number of eval steps for one pass over the validation set.

  Args:
    flags_obj: flag container. Reads `drop_eval_remainder` and `batch_size`.

  Returns:
    Number of validation batches: rounded down when the remainder is dropped,
    rounded up otherwise.
  """
  num_eval_images = imagenet_preprocessing.NUM_IMAGES['validation']
  if flags_obj.drop_eval_remainder:
    return num_eval_images // flags_obj.batch_size
  return math.ceil(1.0 * num_eval_images / flags_obj.batch_size)
# Print the eval_accuracy values found in the log file given as $1 (stdin is
# read when no argument is supplied). For each line containing
# "eval_accuracy", keep the text after "eval_accuracy", then the text after
# "value", print its second whitespace-separated field, and collapse adjacent
# duplicates. "$1" is quoted so that paths containing spaces work.
cat ${1:+"$1"} | grep eval_accuracy | awk -F eval_accuracy '{print $2}' | awk -F value '{print $2}' | awk '{print $2}' | uniq
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to preprocess images.
Training images are sampled using the provided bounding boxes, and subsequently
cropped to the sampled bounding box. Images are additionally flipped randomly,
then resized to the target output size (without aspect-ratio preservation).
Images used during evaluation are resized (with aspect-ratio preservation) and
centrally cropped.
All images undergo mean color subtraction.
Note that these steps are colloquially referred to as "ResNet preprocessing,"
and they differ from "VGG preprocessing," which does not use bounding boxes
and instead does an aspect-preserving resize followed by random crop during
training. (These both differ from "Inception preprocessing," which introduces
color distortion steps.)
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import os
from absl import flags
from absl import logging
import tensorflow as tf
# Output height and width passed to preprocess_image by
# preprocess_parsed_example.
DEFAULT_IMAGE_SIZE = 224
# Number of channels passed to preprocess_image.
NUM_CHANNELS = 3
# Number of examples per dataset split.
NUM_IMAGES = {
    'train': 1281167,
    'validation': 50000,
}
# Number of training shard files enumerated by get_filenames.
_NUM_TRAIN_FILES = 1024
# NOTE(review): presumably the default shuffle buffer size for training
# records; its use is not visible in this part of the file -- confirm.
_SHUFFLE_BUFFER = 10000
# Per-channel means, presumably used for the mean color subtraction described
# in the module docstring -- confirm against preprocess_image.
_R_MEAN = 123.68
_G_MEAN = 116.78
_B_MEAN = 103.94
CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
# The lower bound for the smallest side of the image for aspect-preserving
# resizing. For example, if an image is 500 x 1000, it will be resized to
# _RESIZE_MIN x (_RESIZE_MIN * 2).
_RESIZE_MIN = 256
# Shared absl flag container read by the input pipeline functions below.
FLAGS = flags.FLAGS
def process_record_dataset(dataset,
                           is_training,
                           batch_size,
                           shuffle_buffer,
                           dtype=tf.float32,
                           datasets_num_private_threads=None,
                           drop_remainder=False,
                           tf_data_experimental_slack=False,
                           prefetch_batchs=tf.data.experimental.AUTOTUNE):
  """Turns a dataset of raw records into batched (image, label) pairs.

  Args:
    dataset: A Dataset representing raw records
    is_training: A boolean denoting whether the input is for training.
    batch_size: The number of samples per batch.
    shuffle_buffer: The buffer size to use when shuffling records. A larger
      value results in better randomness, but smaller values reduce startup
      time and use less memory.
    dtype: Data type to use for images/features.
    datasets_num_private_threads: Number of threads for a private
      threadpool created for all datasets computation.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.
    prefetch_batchs: The number of batches to prefetch.

  Returns:
    Dataset of (image, label) pairs ready for iteration.
  """
  # Use a dedicated, fixed-size thread pool for tf.data when requested.
  if datasets_num_private_threads:
    threading_options = tf.data.Options()
    threading_options.experimental_threading.private_threadpool_size = (
        datasets_num_private_threads)
    dataset = dataset.with_options(threading_options)
    logging.info(
        'datasets_num_private_threads: %s', datasets_num_private_threads)

  if is_training:
    # Shuffle before repeating so that epoch boundaries are respected, then
    # repeat indefinitely.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer).repeat()

  num_classes = FLAGS.num_classes
  # Label smoothing requires one-hot labels.
  one_hot = bool(FLAGS.label_smoothing and FLAGS.label_smoothing > 0)
  logging.info('Num classes: %d', num_classes)
  logging.info('One hot: %s', one_hot)

  # When decoded images are cached for training, the records reaching this
  # point are already parsed, so only the preprocessing step is applied.
  use_parsed_examples = is_training and FLAGS.cache_decoded_image
  record_fn = (preprocess_parsed_example if use_parsed_examples
               else parse_and_preprocess_record)

  # Parse the raw records into images and labels, then batch.
  dataset = dataset.map(
      functools.partial(
          record_fn,
          is_training=is_training,
          dtype=dtype,
          num_classes=num_classes,
          one_hot=one_hot),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

  # Operations between the final prefetch and the get_next call to the iterator
  # happen synchronously at run time. Prefetching here moves all of the above
  # processing into the background and off the critical training path. A
  # buffer size of tf.data.experimental.AUTOTUNE lets DistributionStrategies
  # adjust how many batches to fetch based on how many devices are present.
  dataset = dataset.prefetch(buffer_size=prefetch_batchs)

  slack_options = tf.data.Options()
  slack_options.experimental_slack = tf_data_experimental_slack
  return dataset.with_options(slack_options)
def get_filenames(is_training, data_dir):
  """Lists the shard file paths of the requested split.

  Args:
    is_training: if True, return the training shards under `data_dir`/train;
      otherwise the 128 validation shards under `data_dir`/val.
    data_dir: root directory of the dataset.

  Returns:
    A list of file paths, ordered by shard index.
  """
  if not is_training:
    num_val_files = 128
    val_dir = os.path.join(data_dir, 'val')
    return [
        os.path.join(val_dir, 'val-%05d-of-00128' % shard)
        for shard in range(num_val_files)
    ]
  train_dir = os.path.join(data_dir, 'train')
  return [
      os.path.join(train_dir, 'train-%05d-of-01024' % shard)
      for shard in range(_NUM_TRAIN_FILES)
  ]
def parse_example_proto(example_serialized):
  """Parses an Example proto containing a training example of an image.

  The output of the build_image_data.py image preprocessing script is a dataset
  containing serialized Example protocol buffers. Each Example proto contains
  the following fields (values are included as examples):

    image/height: 462
    image/width: 581
    image/colorspace: 'RGB'
    image/channels: 3
    image/class/label: 615
    image/class/synset: 'n03623198'
    image/class/text: 'knee pad'
    image/object/bbox/xmin: 0.1
    image/object/bbox/xmax: 0.9
    image/object/bbox/ymin: 0.2
    image/object/bbox/ymax: 0.6
    image/object/bbox/label: 615
    image/format: 'JPEG'
    image/filename: 'ILSVRC2012_val_00041207.JPEG'
    image/encoded: <JPEG encoded string>

  Only the encoded image, the class label/text and the bounding-box
  coordinates are parsed here.

  Args:
    example_serialized: scalar Tensor tf.string containing a serialized
      Example protocol buffer.

  Returns:
    image_buffer: Tensor tf.string containing the contents of a JPEG file.
    label: Tensor tf.int32 containing the label.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
  """
  # One shared sparse spec: the number of boxes varies per image.
  sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
  feature_map = {
      # Dense features in Example proto.
      'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string,
                                             default_value=''),
      'image/class/label': tf.io.FixedLenFeature([], dtype=tf.int64,
                                                 default_value=-1),
      'image/class/text': tf.io.FixedLenFeature([], dtype=tf.string,
                                                default_value=''),
      # Sparse features in Example proto.
      'image/object/bbox/xmin': sparse_float32,
      'image/object/bbox/ymin': sparse_float32,
      'image/object/bbox/xmax': sparse_float32,
      'image/object/bbox/ymax': sparse_float32,
  }
  parsed = tf.io.parse_single_example(serialized=example_serialized,
                                      features=feature_map)
  label = tf.cast(parsed['image/class/label'], dtype=tf.int32)

  def _coord_row(key):
    # Dense values of one coordinate as a [1, num_boxes] row.
    return tf.expand_dims(parsed[key].values, 0)

  xmin = _coord_row('image/object/bbox/xmin')
  ymin = _coord_row('image/object/bbox/ymin')
  xmax = _coord_row('image/object/bbox/xmax')
  ymax = _coord_row('image/object/bbox/ymax')

  # Rows are stacked in (y, x) order: [ymin, xmin, ymax, xmax].
  stacked = tf.concat([ymin, xmin, ymax, xmax], 0)
  # Force the variable number of bounding boxes into the shape
  # [1, num_boxes, coords].
  bbox = tf.transpose(a=tf.expand_dims(stacked, 0), perm=[0, 2, 1])

  return parsed['image/encoded'], label, bbox
def parse_example_proto_and_decode(example_serialized):
  """Parses an example and decodes its JPEG so the result can be cached.

  Args:
    example_serialized: scalar Tensor tf.string containing a serialized
      Example protocol buffer.

  Returns:
    A tuple (image, label, bbox), where `image` is the JPEG decoded with 3
    channels and `label` / `bbox` are as returned by parse_example_proto.
  """
  encoded_jpeg, label, bbox = parse_example_proto(example_serialized)
  # decode_jpeg expects a scalar string tensor.
  scalar_jpeg = tf.reshape(encoded_jpeg, shape=[])
  decoded_image = tf.io.decode_jpeg(scalar_jpeg, 3)
  return decoded_image, label, bbox
def preprocess_parsed_example(
    image_buffer, label, bbox, is_training, dtype, num_classes, one_hot=False):
  """Preprocesses one parsed example into a model-ready (image, label) pair.

  Args:
    image_buffer: image data of the example, passed to preprocess_image.
    label: label tensor of the example, 1-based.
    bbox: bounding boxes of the example, passed to preprocess_image.
    is_training: A boolean denoting whether the input is for training.
    dtype: data type to cast the preprocessed image to.
    num_classes: Number of classes for one hot encoding.
    one_hot: Whether to use one_hot encoding on label.

  Returns:
    A tuple (image, label). `label` is a one-hot vector of length
    `num_classes` when `one_hot` is set, otherwise a float32 tensor of
    shape [1].
  """
  preprocessed = preprocess_image(
      image_buffer=image_buffer,
      bbox=bbox,
      output_height=DEFAULT_IMAGE_SIZE,
      output_width=DEFAULT_IMAGE_SIZE,
      num_channels=NUM_CHANNELS,
      is_training=is_training)
  image = tf.cast(preprocessed, dtype)

  # Shift labels down by one so that they start at 0 ([0, 1000) for ImageNet).
  zero_based = tf.cast(tf.reshape(label, shape=[1]), tf.int32) - 1
  if not one_hot:
    # The Keras model expects float32 labels.
    return image, tf.cast(zero_based, tf.float32)

  encoded = tf.one_hot(zero_based, num_classes)
  return image, tf.reshape(encoded, [num_classes])
def parse_and_preprocess_record(
    raw_record, is_training, dtype, num_classes, one_hot=False):
  """Parses one serialized record and runs the image/label preprocessing.

  The record is first split into encoded image, label and bounding boxes by
  `parse_example_proto`; those pieces are then passed to
  `preprocess_parsed_example` (cropping, flipping, and so on).

  Args:
    raw_record: scalar Tensor tf.string containing a serialized
      Example protocol buffer.
    is_training: A boolean denoting whether the input is for training.
    dtype: data type to use for images/features.
    num_classes: Number of classes for one hot encoding.
    one_hot: Whether to use one_hot encoding on label.

  Returns:
    Tuple with the processed image tensor and the label tensor. The label is
    one-hot encoded only when `one_hot` is True.
  """
  encoded_image, raw_label, boxes = parse_example_proto(raw_record)
  return preprocess_parsed_example(
      image_buffer=encoded_image,
      label=raw_label,
      bbox=boxes,
      is_training=is_training,
      dtype=dtype,
      num_classes=num_classes,
      one_hot=one_hot)
def input_fn(is_training,
             data_dir,
             batch_size,
             dtype=tf.float32,
             datasets_num_private_threads=None,
             input_context=None,
             drop_remainder=False,
             tf_data_experimental_slack=False,
             dataset_cache=False,
             filenames=None,
             prefetch_batchs=tf.data.experimental.AUTOTUNE):
  """Builds the batched train or eval dataset from TFRecord files.

  Pipeline stages, in order: list files -> optional per-pipeline sharding ->
  file shuffle (training only) -> interleaved TFRecord reading -> optional
  eager JPEG decode (training with `--cache_decoded_image`) -> optional
  cache -> `process_record_dataset`.

  Args:
    is_training: A boolean denoting whether the input is for training.
    data_dir: The directory containing the input data.
    batch_size: The number of samples per batch.
    dtype: Data type to use for images/features.
    datasets_num_private_threads: Number of private threads for tf.data.
    input_context: A `tf.distribute.InputContext` object passed in by
      `tf.distribute.Strategy`. When set, the file list is sharded across
      input pipelines.
    drop_remainder: A boolean indicates whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
    tf_data_experimental_slack: Whether to enable tf.data's
      `experimental_slack` option.
    dataset_cache: Whether to cache the dataset on workers.
      Typically used to improve training performance when training data is in
      remote storage and can fit into worker memory.
    filenames: Optional list of TFRecord file names; when None the list is
      obtained from `get_filenames(is_training, data_dir)`.
    prefetch_batchs: The number of batches to prefetch.

  Returns:
    A dataset that can be used for iteration.
  """
  autotune = tf.data.experimental.AUTOTUNE

  record_files = (
      get_filenames(is_training, data_dir) if filenames is None else filenames)
  dataset = tf.data.Dataset.from_tensor_slices(record_files)

  if input_context:
    logging.info(
        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
        input_context.input_pipeline_id, input_context.num_input_pipelines)
    dataset = dataset.shard(
        num_shards=input_context.num_input_pipelines,
        index=input_context.input_pipeline_id)

  if is_training:
    # Randomize the order in which files are visited.
    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)

  # Expand file names into individual records. Up to `cycle_length` files are
  # read and deserialized in parallel; raise it on machines with many CPU
  # cores.
  dataset = dataset.interleave(
      tf.data.TFRecordDataset, cycle_length=10, num_parallel_calls=autotune)

  if is_training and FLAGS.cache_decoded_image:
    # Decode now so that a subsequent cache() stores decoded images.
    dataset = dataset.map(
        parse_example_proto_and_decode, num_parallel_calls=autotune)

  if dataset_cache:
    # Helps train / eval throughput when the data lives in remote storage and
    # fits into worker memory.
    dataset = dataset.cache()

  return process_record_dataset(
      dataset=dataset,
      is_training=is_training,
      batch_size=batch_size,
      shuffle_buffer=_SHUFFLE_BUFFER,
      dtype=dtype,
      datasets_num_private_threads=datasets_num_private_threads,
      drop_remainder=drop_remainder,
      tf_data_experimental_slack=tf_data_experimental_slack,
      prefetch_batchs=prefetch_batchs,
  )
def _decode_crop_and_flip(image_buffer, bbox, num_channels):
  """Takes a random crop of the image and randomly mirrors it.

  A crop window is sampled as a randomly distorted version of the supplied
  bounding boxes, constrained to the aspect-ratio, area and overlap ranges
  set below. If no box is supplied the whole image is treated as the
  bounding box.

  Args:
    image_buffer: either a scalar string Tensor holding the raw JPEG bytes, or
      an already-decoded image tensor (any non-string dtype).
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    num_channels: Integer depth of the image buffer for decoding. Only used
      when `image_buffer` is still JPEG-encoded.

  Returns:
    3-D tensor with cropped image.
  """
  is_encoded = image_buffer.dtype == tf.string
  if is_encoded:
    # The image size can be read from the JPEG header without decoding.
    image_shape = tf.image.extract_jpeg_shape(image_buffer)
  else:
    image_shape = tf.shape(image_buffer)

  begin, size, _ = tf.image.sample_distorted_bounding_box(
      image_shape,
      bounding_boxes=bbox,
      min_object_covered=0.1,
      aspect_ratio_range=[0.75, 1.33],
      area_range=[0.05, 1.0],
      max_attempts=100,
      use_image_if_no_bounding_boxes=True)

  # Repackage the sampled box as [top, left, height, width] for the crop ops.
  top, left, _ = tf.unstack(begin)
  crop_h, crop_w, _ = tf.unstack(size)
  window = tf.stack([top, left, crop_h, crop_w])

  if is_encoded:
    # Fused decode + crop, which is faster than running the two in sequence.
    patch = tf.image.decode_and_crop_jpeg(
        image_buffer, window, channels=num_channels)
  else:
    patch = tf.image.crop_to_bounding_box(
        image_buffer,
        offset_height=top,
        offset_width=left,
        target_height=crop_h,
        target_width=crop_w)

  # A random horizontal flip adds a little more distortion.
  return tf.image.random_flip_left_right(patch)
def _central_crop(image, crop_height, crop_width):
  """Cuts a `crop_height` x `crop_width` window out of the image centre.

  Args:
    image: a 3-D image tensor
    crop_height: the height of the image following the crop.
    crop_width: the width of the image following the crop.

  Returns:
    3-D tensor with cropped image; all channels are kept.
  """
  dims = tf.shape(input=image)
  img_h, img_w = dims[0], dims[1]

  # Half of the surplus rows / columns is trimmed before the window starts.
  top = (img_h - crop_height) // 2
  left = (img_w - crop_width) // 2

  begin = [top, left, 0]
  extent = [crop_height, crop_width, -1]
  return tf.slice(image, begin, extent)
def _mean_image_subtraction(image, means, num_channels):
  """Centers an image by subtracting a per-channel mean.

  For example:
    means = [123.68, 116.779, 103.939]
    image = _mean_image_subtraction(image, means, 3)

  Note that the rank of `image` must be known.

  Args:
    image: a tensor of size [height, width, C].
    means: a C-vector of values to subtract from each channel.
    num_channels: number of color channels; must equal `len(means)`.

  Returns:
    the centered image.

  Raises:
    ValueError: If the static rank of `image` is not three (including when it
      is unknown), or if `len(means)` differs from `num_channels`.
  """
  static_rank = image.get_shape().ndims
  if static_rank != 3:
    raise ValueError('Input must be of size [height, width, C>0]')
  if len(means) != num_channels:
    raise ValueError('len(means) must match the number of channels')

  # Note(b/130245863): the 1-D means are explicitly broadcast to the image
  # shape, rather than just expanding dimensions, for better performance.
  per_pixel_means = tf.broadcast_to(means, tf.shape(image))
  return image - per_pixel_means
def _smallest_size_at_least(height, width, resize_min):
  """Scales (height, width) so that the shorter side becomes `resize_min`.

  Both sides are multiplied by the same factor, so the aspect ratio is
  preserved; the products are cast back to int32.

  Args:
    height: an int32 scalar tensor indicating the current height.
    width: an int32 scalar tensor indicating the current width.
    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    new_height: an int32 scalar tensor indicating the new height.
    new_width: an int32 scalar tensor indicating the new width.
  """
  # Do the arithmetic in float32; integer tensors would not divide cleanly.
  target = tf.cast(resize_min, tf.float32)
  height_f = tf.cast(height, tf.float32)
  width_f = tf.cast(width, tf.float32)

  scale = target / tf.minimum(height_f, width_f)

  # Back to int32, the type the downstream resize op is given.
  scaled_h = tf.cast(height_f * scale, tf.int32)
  scaled_w = tf.cast(width_f * scale, tf.int32)
  return scaled_h, scaled_w
def _aspect_preserving_resize(image, resize_min):
  """Resizes an image without changing its aspect ratio.

  Args:
    image: A 3-D image `Tensor`.
    resize_min: A python integer or scalar `Tensor` indicating the size of
      the smallest side after resize.

  Returns:
    resized_image: A 3-D tensor containing the resized image.
  """
  dims = tf.shape(input=image)
  target_h, target_w = _smallest_size_at_least(dims[0], dims[1], resize_min)
  return _resize_image(image, target_h, target_w)
def _resize_image(image, height, width):
  """Single entry point for image resizing in this module.

  Routing every resize through here keeps the `ResizeMethod` and the other
  resize options identical everywhere.

  Args:
    image: A 3-D image `Tensor`.
    height: The target height for the resized image.
    width: The target width for the resized image.

  Returns:
    resized_image: A 3-D tensor containing the resized image. The first two
      dimensions have the shape [height, width].
  """
  target_size = [height, width]
  return tf.compat.v1.image.resize(
      image,
      target_size,
      method=tf.image.ResizeMethod.BILINEAR,
      align_corners=False)
def preprocess_image(image_buffer, bbox, output_height, output_width,
                     num_channels, is_training=False):
  """Produces a fixed-size, mean-centered image for training or eval.

  Training: random crop + random flip, then a direct resize to the output
  size. Eval: decode if needed, aspect-preserving resize to `_RESIZE_MIN`,
  then a central crop to the output size. In both cases `CHANNEL_MEANS` is
  subtracted at the end.

  Args:
    image_buffer: scalar string Tensor with the raw JPEG bytes, or an
      already-decoded image tensor.
    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
      where each coordinate is [0, 1) and the coordinates are arranged as
      [ymin, xmin, ymax, xmax].
    output_height: The height of the image after preprocessing.
    output_width: The width of the image after preprocessing.
    num_channels: Integer depth of the image buffer for decoding.
    is_training: `True` if we're preprocessing the image for training and
      `False` otherwise.

  Returns:
    A preprocessed image.
  """
  if is_training:
    # Training path: randomized crop / flip, then scale to the target size.
    distorted = _decode_crop_and_flip(image_buffer, bbox, num_channels)
    image = _resize_image(distorted, output_height, output_width)
  else:
    # Eval path: deterministic resize followed by a centre crop.
    already_decoded = image_buffer.dtype != tf.string
    if already_decoded:
      decoded = image_buffer
    else:
      decoded = tf.image.decode_jpeg(image_buffer, channels=num_channels)
    resized = _aspect_preserving_resize(decoded, _RESIZE_MIN)
    image = _central_crop(resized, output_height, output_width)

  # Pin the static shape; it is required by the mean subtraction below.
  image.set_shape([output_height, output_width, num_channels])
  return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer-wise Adaptive Rate Scaling optimizer for large-batch training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# from tf2_common.training import optimizer_v2modified
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import training_ops
from tensorflow.python.ops import state_ops
# class LARSOptimizer(optimizer_v2modified.OptimizerV2Modified):
#class LARSOptimizer(optimizer_v2.OptimizerV2):
class LARSOptimizer(tf.keras.optimizers.Optimizer):
  """Layer-wise Adaptive Rate Scaling for large batch training.

  Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
  I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)

  Implements the LARS learning rate scheme presented in the paper above. This
  optimizer is useful when scaling the batch size to up to 32K without
  significant performance degradation. It is recommended to use the optimizer
  in conjunction with:
      - Gradual learning rate warm-up
      - Linear learning rate scaling
      - Poly rule learning rate decay

  Note, LARS scaling is currently only enabled for dense tensors. Sparse tensors
  use the default momentum optimizer.
  """

  def __init__(
      self,
      learning_rate,
      momentum=0.9,
      weight_decay=0.0001,
      # The LARS coefficient is a hyperparameter
      eeta=0.001,
      epsilon=0.0,
      name="LARSOptimizer",
      # Enable skipping variables from LARS scaling.
      # TODO(sameerkm): Enable a direct mechanism to pass a
      # subset of variables to the optimizer.
      skip_list=None,
      use_nesterov=False,
      **kwargs):
    """Construct a new LARS Optimizer.

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate.
      momentum: A floating point value. Momentum hyperparameter.
      weight_decay: A floating point value. Weight decay hyperparameter.
      eeta: LARS coefficient as used in the paper. Default set to LARS
        coefficient from the paper. (eeta / weight_decay) determines the highest
        scaling factor in LARS.
      epsilon: Optional epsilon parameter to be set in models that have very
        small gradients. A falsy value (including the default 0.0) is replaced
        by the Keras backend epsilon.
      name: Optional name prefix for variables and ops created by LARSOptimizer.
      skip_list: List of strings to enable skipping variables from LARS scaling.
        If any of the strings in skip_list is a subset of var.name, variable
        'var' is skipped from LARS scaling. For a typical classification model
        with batch normalization, the skip_list is ['batch_normalization',
        'bias']
      use_nesterov: when set to True, nesterov momentum will be enabled
      **kwargs: keyword arguments forwarded to the base optimizer.

    Raises:
      ValueError: If `momentum` or `weight_decay` is negative.
    """
    if momentum < 0.0:
      raise ValueError("momentum should be positive: %s" % momentum)
    if weight_decay < 0.0:
      raise ValueError("weight_decay should be positive: %s" % weight_decay)
    super(LARSOptimizer, self).__init__(name=name, **kwargs)

    self._set_hyper("learning_rate", learning_rate)

    # When directly using class members, instead of
    # _set_hyper and _get_hyper (such as learning_rate above),
    # the values are fixed after __init__(), and not being
    # updated during the training process.
    # This provides better performance but less flexibility.
    self.momentum = momentum
    self.weight_decay = weight_decay
    self.eeta = eeta
    self.epsilon = epsilon or backend_config.epsilon()
    self._skip_list = skip_list
    self.use_nesterov = use_nesterov

  def _prepare_local(self, var_device, var_dtype, apply_state):
    """Stores the current learning rate for one (device, dtype) pair.

    Args:
      var_device: device key of the variables being updated.
      var_dtype: dtype key of the variables being updated.
      apply_state: dict keyed by `(var_device, var_dtype)`; the entry for this
        pair gains a `"learning_rate"` tensor.
    """
    lr_t = self._get_hyper("learning_rate", var_dtype)
    # A schedule comes back from `_get_hyper` as a callable object that still
    # has to be evaluated at the current step. A plain float / tensor learning
    # rate is not callable and is used as is; previously it was called
    # unconditionally, which raised a TypeError.
    if callable(lr_t):
      local_step = math_ops.cast(self.iterations, var_dtype)
      lr_t = math_ops.cast(lr_t(local_step), var_dtype)
    learning_rate_t = array_ops.identity(lr_t)

    apply_state[(var_device, var_dtype)].update(
        dict(
            learning_rate=learning_rate_t,
        ))

  def _create_slots(self, var_list):
    """Creates one "momentum" slot per variable."""
    for v in var_list:
      self.add_slot(v, "momentum")

  def compute_lr(self, grad, var, coefficients):
    """Computes the LARS-scaled learning rate and the decayed gradient.

    Args:
      grad: dense gradient for `var`.
      var: the variable being updated.
      coefficients: per-(device, dtype) state holding `"learning_rate"`.

    Returns:
      Tuple `(scaled_lr, grad)`. For variables matched by `skip_list` both are
      returned unchanged (no trust ratio, no weight decay).
    """
    scaled_lr = coefficients["learning_rate"]
    if self._skip_list is None or not any(v in var.name
                                          for v in self._skip_list):
      w_norm = linalg_ops.norm(var, ord=2)
      g_norm = linalg_ops.norm(grad, ord=2)
      # Fall back to a trust ratio of 1.0 when either norm is zero.
      trust_ratio = array_ops.where(
          math_ops.greater(w_norm, 0),
          array_ops.where(
              math_ops.greater(g_norm, 0),
              (self.eeta * w_norm /
               (g_norm + self.weight_decay * w_norm + self.epsilon)), 1.0), 1.0)

      scaled_lr = coefficients["learning_rate"] * trust_ratio
      # Add the weight regularization gradient
      grad = grad + self.weight_decay * var
    return scaled_lr, grad

  def _apply_dense(self, grad, var, apply_state=None):
    """Applies a LARS momentum update to a (non-resource) dense variable."""
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    scaled_lr, grad = self.compute_lr(grad, var, coefficients)
    mom = self.get_slot(var, "momentum")
    # The learning rate is folded into the gradient, so the op itself is
    # given a learning rate of 1.0.
    return training_ops.apply_momentum(
        var,
        mom,
        math_ops.cast(1.0, var.dtype.base_dtype),
        grad * scaled_lr,
        self.momentum,
        use_locking=False,
        use_nesterov=self.use_nesterov)

  def _resource_apply_dense(self, grad, var, apply_state=None):
    """Applies a LARS momentum update to a dense resource variable."""
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    scaled_lr, grad = self.compute_lr(grad, var, coefficients)
    mom = self.get_slot(var, "momentum")
    # Keras-style momentum written out with plain ops (in place of the fused
    # resource_apply_keras_momentum kernel).
    mom_t = mom * self.momentum - grad * scaled_lr
    mom_t = state_ops.assign(mom, mom_t, use_locking=False)
    if self.use_nesterov:
      var_t = var + mom_t * self.momentum - grad * scaled_lr
    else:
      var_t = var + mom_t
    return state_ops.assign(var, var_t, use_locking=False).op

  # Fallback to momentum optimizer for sparse tensors
  def _apply_sparse(self, grad, var, apply_state=None):
    """Plain (non-LARS) momentum update for sparse gradients."""
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    mom = self.get_slot(var, "momentum")
    return training_ops.sparse_apply_momentum(
        var,
        mom,
        coefficients["learning_rate"],
        grad.values,
        grad.indices,
        self.momentum,
        use_locking=False,
        use_nesterov=self.use_nesterov)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    """Plain (non-LARS) momentum update for sparse resource variables."""
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    mom = self.get_slot(var, "momentum")
    return training_ops.resource_sparse_apply_keras_momentum(
        var.handle,
        mom.handle,
        coefficients["learning_rate"],
        grad,
        indices,
        self.momentum,
        use_locking=False,
        use_nesterov=self.use_nesterov)

  def get_config(self):
    """Returns the constructor arguments needed to rebuild this optimizer."""
    config = super(LARSOptimizer, self).get_config()
    config.update({
        "learning_rate": self._serialize_hyperparameter("learning_rate"),
        "momentum": self.momentum,
        "weight_decay": self.weight_decay,
        "eeta": self.eeta,
        "epsilon": self.epsilon,
        # Without this entry a from_config() round trip would silently
        # re-enable LARS scaling for the skipped variables.
        "skip_list": (None if self._skip_list is None
                      else list(self._skip_list)),
        "use_nesterov": self.use_nesterov,
    })
    return config
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Enable Layer-wise Adaptive Rate Scaling optimizer in ResNet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
from tf2_common.utils.mlp_log import mlp_log
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
FLAGS = flags.FLAGS
def define_lars_flags():
  """Registers the float flags read by the LARS optimizer configuration."""
  # (flag name, default value, help text), registered in this order.
  float_flag_specs = (
      ('end_learning_rate', None, 'Polynomial decay end learning rate.'),
      ('lars_epsilon', 0.0, 'Override autoselected LARS epsilon.'),
      ('warmup_epochs', None,
       'Override autoselected polynomial decay warmup epochs.'),
      ('momentum', 0.9, 'Momentum parameter used in the MomentumOptimizer.'),
  )
  for flag_name, default_value, help_text in float_flag_specs:
    flags.DEFINE_float(flag_name, default=default_value, help=help_text)
class PolynomialDecayWithWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""A LearningRateSchedule that uses a polynomial decay with warmup."""
def __init__(
self,
batch_size,
steps_per_epoch,
train_steps,
initial_learning_rate=None,
end_learning_rate=None,
warmup_epochs=None,
compute_lr_on_cpu=False,
name=None):
"""Applies a polynomial decay to the learning rate with warmup."""
super(PolynomialDecayWithWarmup, self).__init__()
self.batch_size = batch_size
self.steps_per_epoch = steps_per_epoch
self.train_steps = train_steps
self.name = name
self.learning_rate_ops_cache = {}
self.compute_lr_on_cpu = compute_lr_on_cpu
if batch_size < 16384:
self.initial_learning_rate = 10.0
warmup_epochs_ = 5
elif batch_size < 32768:
self.initial_learning_rate = 25.0
warmup_epochs_ = 5
else:
self.initial_learning_rate = 31.2
warmup_epochs_ = 25
# Override default poly learning rate and warmup epochs
if initial_learning_rate:
self.initial_learning_rate = initial_learning_rate
if end_learning_rate:
self.end_learning_rate = end_learning_rate
else:
self.end_learning_rate = 0.0001
if warmup_epochs is not None:
warmup_epochs_ = warmup_epochs
self.warmup_epochs = warmup_epochs_
opt_name = FLAGS.optimizer.lower()
mlp_log.mlperf_print('opt_name', opt_name)
if opt_name == 'lars':
mlp_log.mlperf_print('{}_epsilon'.format(opt_name), FLAGS.lars_epsilon)
mlp_log.mlperf_print('{}_opt_weight_decay'.format(opt_name),
FLAGS.weight_decay)
mlp_log.mlperf_print('{}_opt_base_learning_rate'.format(opt_name),
self.initial_learning_rate)
mlp_log.mlperf_print('{}_opt_learning_rate_warmup_epochs'.format(opt_name),
warmup_epochs_)
mlp_log.mlperf_print('{}_opt_end_learning_rate'.format(opt_name),
self.end_learning_rate)
warmup_steps = warmup_epochs_ * steps_per_epoch
self.warmup_steps = tf.cast(warmup_steps, tf.float32)
self.decay_steps = train_steps - warmup_steps + 1
mlp_log.mlperf_print('{}_opt_learning_rate_decay_steps'.format(opt_name),
int(self.decay_steps))
mlp_log.mlperf_print(
'{}_opt_learning_rate_decay_poly_power'.format(opt_name), 2.0)
mlp_log.mlperf_print('{}_opt_momentum'.format(opt_name), FLAGS.momentum)
self.poly_rate_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=self.initial_learning_rate,
decay_steps=self.decay_steps,
end_learning_rate=self.end_learning_rate,
power=2.0)
def __call__(self, step):
if tf.executing_eagerly():
return self._get_learning_rate(step)
# In an eager function or graph, the current implementation of optimizer
# repeatedly call and thus create ops for the learning rate schedule. To
# avoid this, we cache the ops if not executing eagerly.
graph = tf.compat.v1.get_default_graph()
if graph not in self.learning_rate_ops_cache:
if self.compute_lr_on_cpu:
with tf.device('/device:CPU:0'):
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
else:
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
return self.learning_rate_ops_cache[graph]
def _get_learning_rate(self, step):
with ops.name_scope_v2(self.name or 'PolynomialDecayWithWarmup') as name:
initial_learning_rate = ops.convert_to_tensor_v2(
self.initial_learning_rate, name='initial_learning_rate')
warmup_steps = ops.convert_to_tensor_v2(
self.warmup_steps, name='warmup_steps')
warmup_rate = (
initial_learning_rate * step / warmup_steps)
poly_steps = math_ops.subtract(step, warmup_steps)
poly_rate = self.poly_rate_scheduler(poly_steps)
decay_rate = tf.where(step <= warmup_steps,
warmup_rate, poly_rate, name=name)
return decay_rate
def get_config(self):
    """Return the schedule's configuration as a plain dict.

    Returns:
      A dict mapping each of `batch_size`, `steps_per_epoch`, `train_steps`,
      `initial_learning_rate`, `end_learning_rate`, `warmup_epochs` and
      `name` to the value of the attribute of the same name on `self`.
    """
    config_keys = (
        'batch_size',
        'steps_per_epoch',
        'train_steps',
        'initial_learning_rate',
        'end_learning_rate',
        'warmup_epochs',
        'name',
    )
    # Every config entry mirrors the attribute of the same name.
    return {key: getattr(self, key) for key in config_keys}
nohup: ignoring input
:::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.145380 140547769902912 mlp_log.py:80] :::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.146378 140547769902912 mlp_log.py:80] :::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.147078 140547769902912 mlp_log.py:80] :::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.147791 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.148500 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.149215 140547769902912 mlp_log.py:80] :::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.149919 140547769902912 mlp_log.py:80] :::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:27.150071 140547769902912 common.py:617] Module ./resnet_ctl_imagenet_main.py:
I0319 12:55:27.150561 140547769902912 common.py:620] flags_obj.use_tf_function = True
I0319 12:55:27.150646 140547769902912 common.py:620] flags_obj.single_l2_loss_op = True
I0319 12:55:27.150727 140547769902912 common.py:620] flags_obj.cache_decoded_image = False
I0319 12:55:27.150808 140547769902912 common.py:620] flags_obj.enable_device_warmup = True
I0319 12:55:27.150889 140547769902912 common.py:620] flags_obj.device_warmup_steps = 1
I0319 12:55:27.150968 140547769902912 common.py:620] flags_obj.num_replicas = 32
I0319 12:55:27.151046 140547769902912 common.py:617] Module absl.app:
I0319 12:55:27.151130 140547769902912 common.py:620] flags_obj.run_with_pdb = False
I0319 12:55:27.151208 140547769902912 common.py:620] flags_obj.pdb_post_mortem = False
I0319 12:55:27.151290 140547769902912 common.py:620] flags_obj.pdb = False
I0319 12:55:27.151383 140547769902912 common.py:620] flags_obj.run_with_profiling = False
I0319 12:55:27.151461 140547769902912 common.py:620] flags_obj.profile_file = None
I0319 12:55:27.151540 140547769902912 common.py:620] flags_obj.use_cprofile_for_profiling = True
I0319 12:55:27.151618 140547769902912 common.py:620] flags_obj.only_check_args = False
I0319 12:55:27.151695 140547769902912 common.py:620] flags_obj.help = False
I0319 12:55:27.151774 140547769902912 common.py:620] flags_obj.helpshort = False
I0319 12:55:27.151850 140547769902912 common.py:620] flags_obj.helpfull = False
I0319 12:55:27.151929 140547769902912 common.py:620] flags_obj.helpxml = False
I0319 12:55:27.152006 140547769902912 common.py:617] Module absl.logging:
I0319 12:55:27.152086 140547769902912 common.py:620] flags_obj.logtostderr = False
I0319 12:55:27.152163 140547769902912 common.py:620] flags_obj.alsologtostderr = False
I0319 12:55:27.152240 140547769902912 common.py:620] flags_obj.log_dir =
I0319 12:55:27.152339 140547769902912 common.py:620] flags_obj.verbosity = 0
I0319 12:55:27.152423 140547769902912 common.py:620] flags_obj.logger_levels = {}
I0319 12:55:27.152507 140547769902912 common.py:620] flags_obj.stderrthreshold = fatal
I0319 12:55:27.152584 140547769902912 common.py:620] flags_obj.showprefixforinfo = True
I0319 12:55:27.152662 140547769902912 common.py:617] Module absl.testing.absltest:
I0319 12:55:27.152743 140547769902912 common.py:620] flags_obj.test_srcdir =
I0319 12:55:27.152820 140547769902912 common.py:620] flags_obj.test_tmpdir = /tmp/absl_testing
I0319 12:55:27.152901 140547769902912 common.py:620] flags_obj.test_random_seed = 301
I0319 12:55:27.152981 140547769902912 common.py:620] flags_obj.test_randomize_ordering_seed = 1
I0319 12:55:27.153058 140547769902912 common.py:620] flags_obj.xml_output_file =
I0319 12:55:27.153135 140547769902912 common.py:617] Module common:
I0319 12:55:27.153217 140547769902912 common.py:620] flags_obj.enable_eager = True
I0319 12:55:27.153294 140547769902912 common.py:620] flags_obj.skip_eval = False
I0319 12:55:27.153382 140547769902912 common.py:620] flags_obj.set_learning_phase_to_train = True
I0319 12:55:27.153460 140547769902912 common.py:620] flags_obj.explicit_gpu_placement = False
I0319 12:55:27.153537 140547769902912 common.py:620] flags_obj.use_trivial_model = False
I0319 12:55:27.153614 140547769902912 common.py:620] flags_obj.report_accuracy_metrics = True
I0319 12:55:27.153692 140547769902912 common.py:620] flags_obj.lr_schedule = polynomial
I0319 12:55:27.153769 140547769902912 common.py:620] flags_obj.enable_tensorboard = False
I0319 12:55:27.153845 140547769902912 common.py:620] flags_obj.train_steps = None
I0319 12:55:27.153923 140547769902912 common.py:620] flags_obj.profile_steps = None
I0319 12:55:27.154000 140547769902912 common.py:620] flags_obj.batchnorm_spatial_persistent = True
I0319 12:55:27.154076 140547769902912 common.py:620] flags_obj.enable_get_next_as_optional = False
I0319 12:55:27.154153 140547769902912 common.py:620] flags_obj.enable_checkpoint_and_export = False
I0319 12:55:27.154229 140547769902912 common.py:620] flags_obj.tpu =
I0319 12:55:27.154305 140547769902912 common.py:620] flags_obj.tpu_zone =
I0319 12:55:27.154394 140547769902912 common.py:620] flags_obj.steps_per_loop = 514
I0319 12:55:27.154473 140547769902912 common.py:620] flags_obj.use_tf_while_loop = True
I0319 12:55:27.154549 140547769902912 common.py:620] flags_obj.use_tf_keras_layers = False
I0319 12:55:27.154627 140547769902912 common.py:620] flags_obj.base_learning_rate = 4.9
I0319 12:55:27.154710 140547769902912 common.py:620] flags_obj.optimizer = LARS
I0319 12:55:27.154787 140547769902912 common.py:620] flags_obj.drop_train_remainder = True
I0319 12:55:27.154863 140547769902912 common.py:620] flags_obj.drop_eval_remainder = False
I0319 12:55:27.154940 140547769902912 common.py:620] flags_obj.label_smoothing = 0.1
I0319 12:55:27.155020 140547769902912 common.py:620] flags_obj.num_classes = 1000
I0319 12:55:27.155099 140547769902912 common.py:620] flags_obj.eval_offset_epochs = 3
I0319 12:55:27.155177 140547769902912 common.py:620] flags_obj.target_accuracy = 0.759
I0319 12:55:27.155256 140547769902912 common.py:617] Module lars_util:
I0319 12:55:27.155346 140547769902912 common.py:620] flags_obj.end_learning_rate = None
I0319 12:55:27.155426 140547769902912 common.py:620] flags_obj.lars_epsilon = 0.0
I0319 12:55:27.155504 140547769902912 common.py:620] flags_obj.warmup_epochs = 5.0
I0319 12:55:27.155582 140547769902912 common.py:620] flags_obj.momentum = 0.9
I0319 12:55:27.155662 140547769902912 common.py:617] Module resnet_model:
I0319 12:55:27.155743 140547769902912 common.py:620] flags_obj.weight_decay = 0.0002
I0319 12:55:27.155822 140547769902912 common.py:620] flags_obj.num_accumulation_steps = 1
I0319 12:55:27.155900 140547769902912 common.py:617] Module resnet_runnable:
I0319 12:55:27.155981 140547769902912 common.py:620] flags_obj.trace_warmup = False
I0319 12:55:27.156070 140547769902912 common.py:617] Module tensorflow.python.ops.parallel_for.pfor:
I0319 12:55:27.156152 140547769902912 common.py:620] flags_obj.op_conversion_fallback_to_while_loop = True
I0319 12:55:27.156228 140547769902912 common.py:617] Module tensorflow.python.tpu.client.client:
I0319 12:55:27.156317 140547769902912 common.py:620] flags_obj.runtime_oom_exit = True
I0319 12:55:27.156397 140547769902912 common.py:620] flags_obj.hbm_oom_exit = True
I0319 12:55:27.156476 140547769902912 common.py:617] Module tf2_common.utils.flags._base:
I0319 12:55:27.156557 140547769902912 common.py:620] flags_obj.data_dir = /data/tf-imagenet/imagenet
I0319 12:55:27.156634 140547769902912 common.py:620] flags_obj.model_dir = /tmp
I0319 12:55:27.156712 140547769902912 common.py:620] flags_obj.clean = False
I0319 12:55:27.156790 140547769902912 common.py:620] flags_obj.train_epochs = 70
I0319 12:55:27.156867 140547769902912 common.py:620] flags_obj.epochs_between_evals = 4
I0319 12:55:27.156945 140547769902912 common.py:620] flags_obj.batch_size = 2496
I0319 12:55:27.157022 140547769902912 common.py:620] flags_obj.num_gpus = 8
I0319 12:55:27.157100 140547769902912 common.py:620] flags_obj.run_eagerly = False
I0319 12:55:27.157177 140547769902912 common.py:620] flags_obj.distribution_strategy = mirrored
I0319 12:55:27.157255 140547769902912 common.py:617] Module tf2_common.utils.flags._benchmark:
I0319 12:55:27.157347 140547769902912 common.py:620] flags_obj.benchmark_logger_type = BaseBenchmarkLogger
I0319 12:55:27.157434 140547769902912 common.py:620] flags_obj.benchmark_test_id = None
I0319 12:55:27.157512 140547769902912 common.py:620] flags_obj.log_steps = 125
I0319 12:55:27.157588 140547769902912 common.py:620] flags_obj.benchmark_log_dir = None
I0319 12:55:27.157666 140547769902912 common.py:620] flags_obj.gcp_project = None
I0319 12:55:27.157744 140547769902912 common.py:620] flags_obj.bigquery_data_set = test_benchmark
I0319 12:55:27.157821 140547769902912 common.py:620] flags_obj.bigquery_run_table = benchmark_run
I0319 12:55:27.157899 140547769902912 common.py:620] flags_obj.bigquery_run_status_table = benchmark_run_status
I0319 12:55:27.157977 140547769902912 common.py:620] flags_obj.bigquery_metric_table = benchmark_metric
I0319 12:55:27.158053 140547769902912 common.py:617] Module tf2_common.utils.flags._distribution:
I0319 12:55:27.158134 140547769902912 common.py:620] flags_obj.worker_hosts = None
I0319 12:55:27.158211 140547769902912 common.py:620] flags_obj.task_index = -1
I0319 12:55:27.158288 140547769902912 common.py:617] Module tf2_common.utils.flags._misc:
I0319 12:55:27.158379 140547769902912 common.py:620] flags_obj.data_format = None
I0319 12:55:27.158457 140547769902912 common.py:617] Module tf2_common.utils.flags._performance:
I0319 12:55:27.158539 140547769902912 common.py:620] flags_obj.use_synthetic_data = False
I0319 12:55:27.158615 140547769902912 common.py:620] flags_obj.dtype = fp16
I0319 12:55:27.158691 140547769902912 common.py:620] flags_obj.loss_scale = None
I0319 12:55:27.158768 140547769902912 common.py:620] flags_obj.fp16_implementation = keras
I0319 12:55:27.158844 140547769902912 common.py:620] flags_obj.all_reduce_alg = nccl
I0319 12:55:27.158921 140547769902912 common.py:620] flags_obj.num_packs = 1
I0319 12:55:27.158999 140547769902912 common.py:620] flags_obj.tf_gpu_thread_mode = gpu_private
I0319 12:55:27.159075 140547769902912 common.py:620] flags_obj.per_gpu_thread_count = 0
I0319 12:55:27.159153 140547769902912 common.py:620] flags_obj.datasets_num_private_threads = 32
I0319 12:55:27.159230 140547769902912 common.py:620] flags_obj.training_dataset_cache = True
I0319 12:55:27.159306 140547769902912 common.py:620] flags_obj.training_prefetch_batchs = 128
I0319 12:55:27.159394 140547769902912 common.py:620] flags_obj.eval_dataset_cache = True
I0319 12:55:27.159471 140547769902912 common.py:620] flags_obj.eval_prefetch_batchs = 192
I0319 12:55:27.159548 140547769902912 common.py:620] flags_obj.tf_data_experimental_slack = False
I0319 12:55:27.159631 140547769902912 common.py:620] flags_obj.enable_xla = False
I0319 12:55:27.159710 140547769902912 common.py:620] flags_obj.force_v2_in_keras_compile = None
WARNING:tensorflow:Mixed precision compatibility check (mixed_float16): WARNING
Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
Z100L, no compute capability (probably not an Nvidia GPU) (x8)
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
W0319 12:55:27.160811 140547769902912 device_compatibility_check.py:107] Mixed precision compatibility check (mixed_float16): WARNING
Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
Z100L, no compute capability (probably not an Nvidia GPU) (x8)
See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
I0319 12:55:27.161139 140547769902912 keras_utils.py:243] Logical CPU cores: 128
I0319 12:55:27.161378 140547769902912 keras_utils.py:249] TF_GPU_THREAD_COUNT: 2
I0319 12:55:27.161468 140547769902912 keras_utils.py:251] TF_GPU_THREAD_MODE: gpu_private
I0319 12:55:27.161551 140547769902912 keras_utils.py:261] Recommended datasets_num_private_threads: 64
2023-03-19 12:55:27.162998: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-19 12:55:27.181835: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.181964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 32252 MB memory: -> device: 0, name: Z100L, pci bus id: 0000:07:00.0
2023-03-19 12:55:27.582374: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.582493: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 32252 MB memory: -> device: 1, name: Z100L, pci bus id: 0000:0a:00.0
2023-03-19 12:55:27.961772: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:27.961893: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 32252 MB memory: -> device: 2, name: Z100L, pci bus id: 0000:15:00.0
2023-03-19 12:55:28.339247: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:28.339376: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 32252 MB memory: -> device: 3, name: Z100L, pci bus id: 0000:0f:00.0
2023-03-19 12:55:28.719486: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:28.719627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:4 with 32252 MB memory: -> device: 4, name: Z100L, pci bus id: 0000:85:00.0
2023-03-19 12:55:29.097492: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.097606: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:5 with 32252 MB memory: -> device: 5, name: Z100L, pci bus id: 0000:7f:00.0
2023-03-19 12:55:29.475299: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.475428: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:6 with 32252 MB memory: -> device: 6, name: Z100L, pci bus id: 0000:77:00.0
2023-03-19 12:55:29.855076: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2023-03-19 12:55:29.855191: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:7 with 32252 MB memory: -> device: 7, name: Z100L, pci bus id: 0000:7a:00.0
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
I0319 12:55:30.261204 140547769902912 mirrored_strategy.py:376] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
num_index -1
enter the tf.float16 set policy
Compute dtype: float16
Variable dtype: float32
:::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.263783 140547769902912 mlp_log.py:80] :::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.264862 140547769902912 mlp_log.py:80] :::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.265909 140547769902912 mlp_log.py:80] :::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.266957 140547769902912 mlp_log.py:80] :::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 12:55:30.267157 140547769902912 resnet_ctl_imagenet_main.py:204] Training 71 epochs, each epoch has 513 steps, total steps: 36423; Eval 21 steps
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.377633 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.390385 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.400095 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.402572 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.414422 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.426609 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.486386 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.488949 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.497610 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
I0319 12:55:30.500023 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
:::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.377869 140547769902912 mlp_log.py:80] :::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.378870 140547769902912 mlp_log.py:80] :::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.379752 140547769902912 mlp_log.py:80] :::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.380624 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.381502 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.382365 140547769902912 mlp_log.py:80] :::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.383680 140547769902912 mlp_log.py:80] :::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.384541 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
:::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.385398 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
I0319 12:55:35.494630 140547769902912 resnet_ctl_imagenet_main.py:238] Warmup for 1 steps.
I0319 12:55:35.496956 140547769902912 controller.py:340] Warmup at step 0 of 1
I0319 12:55:35.497112 140547769902912 controller.py:345] Entering warmup loop with 1 steps, at step 0 of 1
WARNING:tensorflow:From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
W0319 12:55:35.497444 140547769902912 deprecation.py:341] From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
Instructions for updating:
rename to distribute_datasets_from_function
I0319 12:55:35.897564 140547769902912 resnet_runnable.py:484] Entering the warmup loop.
WARNING:tensorflow:From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
Instructions for updating:
experimental_compile is deprecated, use jit_compile instead
W0319 12:55:37.124004 140547769902912 deprecation.py:545] From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
Instructions for updating:
experimental_compile is deprecated, use jit_compile instead
INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 12:55:55.412617 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 12:56:48.352646 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
I0319 13:00:32.592645 140547769902912 resnet_runnable.py:497] Exiting the warmup loop.
I0319 13:00:32.595108 140547769902912 controller.py:220] step: 1 steps_per_second: 0.00
enter fp16 computing
step: 1 steps_per_second: 0.00
:::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.596201 140547769902912 mlp_log.py:80] :::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.596997 140547769902912 mlp_log.py:80] :::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.597745 140547769902912 mlp_log.py:80] :::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 13:00:32.599620 140547769902912 controller.py:247] Train at step 0 of 36423
I0319 13:00:32.599745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 0 of 36423
I0319 13:00:32.612586 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
W0319 13:00:32.634842 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
I0319 13:00:32.636068 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
I0319 13:00:32.637336 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
I0319 13:00:32.637444 140547769902912 imagenet_preprocessing.py:119] One hot: True
I0319 13:08:32.765698 140547769902912 keras_utils.py:120] TimeHistory: 2676.05 examples/second between steps 0 and 513
I0319 13:08:32.769956 140547769902912 controller.py:220] step: 513 steps_per_second: 1.07 {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
I0319 13:08:32.770123 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 513 of 36423
I0319 13:16:30.476807 140547769902912 keras_utils.py:120] TimeHistory: 2680.53 examples/second between steps 513 and 1026
I0319 13:16:30.481098 140547769902912 controller.py:220] step: 1026 steps_per_second: 1.07 {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
I0319 13:16:30.481256 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1026 of 36423
I0319 13:24:28.062501 140547769902912 keras_utils.py:120] TimeHistory: 2681.24 examples/second between steps 1026 and 1539
I0319 13:24:28.066748 140547769902912 controller.py:220] step: 1539 steps_per_second: 1.07 {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
I0319 13:24:28.066913 140547769902912 controller.py:185] Start evaluation at step: 1539
I0319 13:24:28.070569 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
W0319 13:24:28.088642 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
I0319 13:24:28.089705 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
I0319 13:24:28.089835 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
I0319 13:24:28.089923 140547769902912 imagenet_preprocessing.py:119] One hot: True
step: 513 steps_per_second: 1.07 {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
step: 1026 steps_per_second: 1.07 {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
step: 1539 steps_per_second: 1.07 {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
:::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:24:28.927603 140547769902912 mlp_log.py:80] :::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.308466 140547769902912 mlp_log.py:80] :::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.317326 140547769902912 mlp_log.py:80] :::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.318364 140547769902912 mlp_log.py:80] :::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.319331 140547769902912 mlp_log.py:80] :::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:25:01.329561 140547769902912 controller.py:220] step: 1539 evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
I0319 13:25:01.329745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1539 of 36423
I0319 13:32:58.584241 140547769902912 keras_utils.py:120] TimeHistory: 2683.07 examples/second between steps 1539 and 2052
I0319 13:32:58.588519 140547769902912 controller.py:220] step: 2052 steps_per_second: 1.00 {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
I0319 13:32:58.588680 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2052 of 36423
I0319 13:40:56.833560 140547769902912 keras_utils.py:120] TimeHistory: 2677.52 examples/second between steps 2052 and 2565
I0319 13:40:56.837803 140547769902912 controller.py:220] step: 2565 steps_per_second: 1.07 {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
I0319 13:40:56.837963 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2565 of 36423
I0319 13:48:55.233101 140547769902912 keras_utils.py:120] TimeHistory: 2676.68 examples/second between steps 2565 and 3078
I0319 13:48:55.237374 140547769902912 controller.py:220] step: 3078 steps_per_second: 1.07 {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
I0319 13:48:55.237531 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3078 of 36423
I0319 13:56:53.574455 140547769902912 keras_utils.py:120] TimeHistory: 2677.00 examples/second between steps 3078 and 3591
I0319 13:56:53.578727 140547769902912 controller.py:220] step: 3591 steps_per_second: 1.07 {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
I0319 13:56:53.578876 140547769902912 controller.py:185] Start evaluation at step: 3591
step: 1539 evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
step: 2052 steps_per_second: 1.00 {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
step: 2565 steps_per_second: 1.07 {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
step: 3078 steps_per_second: 1.07 {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
step: 3591 steps_per_second: 1.07 {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
:::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:56:54.080654 140547769902912 mlp_log.py:80] :::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.254401 140547769902912 mlp_log.py:80] :::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.261220 140547769902912 mlp_log.py:80] :::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.262227 140547769902912 mlp_log.py:80] :::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.263200 140547769902912 mlp_log.py:80] :::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 13:57:05.272903 140547769902912 controller.py:220] step: 3591 evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
I0319 13:57:05.273066 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3591 of 36423
I0319 14:05:03.201216 140547769902912 keras_utils.py:120] TimeHistory: 2679.28 examples/second between steps 3591 and 4104
I0319 14:05:03.205459 140547769902912 controller.py:220] step: 4104 steps_per_second: 1.05 {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
I0319 14:05:03.205613 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4104 of 36423
I0319 14:13:01.703775 140547769902912 keras_utils.py:120] TimeHistory: 2676.10 examples/second between steps 4104 and 4617
I0319 14:13:01.707995 140547769902912 controller.py:220] step: 4617 steps_per_second: 1.07 {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
I0319 14:13:01.708152 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4617 of 36423
I0319 14:20:58.757003 140547769902912 keras_utils.py:120] TimeHistory: 2684.23 examples/second between steps 4617 and 5130
I0319 14:20:58.761198 140547769902912 controller.py:220] step: 5130 steps_per_second: 1.08 {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
I0319 14:20:58.761370 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5130 of 36423
I0319 14:28:56.838135 140547769902912 keras_utils.py:120] TimeHistory: 2678.46 examples/second between steps 5130 and 5643
I0319 14:28:56.842247 140547769902912 controller.py:220] step: 5643 steps_per_second: 1.07 {'train_loss': 47.524445, 'train_accuracy': 0.517012}
I0319 14:28:56.842405 140547769902912 controller.py:185] Start evaluation at step: 5643
step: 3591 evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
step: 4104 steps_per_second: 1.05 {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
step: 4617 steps_per_second: 1.07 {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
step: 5130 steps_per_second: 1.08 {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
step: 5643 steps_per_second: 1.07 {'train_loss': 47.524445, 'train_accuracy': 0.517012}
:::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:28:57.346966 140547769902912 mlp_log.py:80] :::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.307533 140547769902912 mlp_log.py:80] :::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.314471 140547769902912 mlp_log.py:80] :::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.315475 140547769902912 mlp_log.py:80] :::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.316439 140547769902912 mlp_log.py:80] :::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 14:29:08.326488 140547769902912 controller.py:220] step: 5643 evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
I0319 14:29:08.326648 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5643 of 36423
I0319 14:37:05.725753 140547769902912 keras_utils.py:120] TimeHistory: 2682.26 examples/second between steps 5643 and 6156
I0319 14:37:05.729918 140547769902912 controller.py:220] step: 6156 steps_per_second: 1.05 {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
I0319 14:37:05.730074 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6156 of 36423
I0319 14:45:03.411590 140547769902912 keras_utils.py:120] TimeHistory: 2680.68 examples/second between steps 6156 and 6669
I0319 14:45:03.415779 140547769902912 controller.py:220] step: 6669 steps_per_second: 1.07 {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
I0319 14:45:03.415935 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6669 of 36423
I0319 14:53:02.156559 140547769902912 keras_utils.py:120] TimeHistory: 2674.74 examples/second between steps 6669 and 7182
I0319 14:53:02.160710 140547769902912 controller.py:220] step: 7182 steps_per_second: 1.07 {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
I0319 14:53:02.160865 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7182 of 36423
I0319 15:01:00.511001 140547769902912 keras_utils.py:120] TimeHistory: 2676.93 examples/second between steps 7182 and 7695
I0319 15:01:00.515219 140547769902912 controller.py:220] step: 7695 steps_per_second: 1.07 {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
I0319 15:01:00.517019 140547769902912 controller.py:185] Start evaluation at step: 7695
step: 5643 evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
step: 6156 steps_per_second: 1.05 {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
step: 6669 steps_per_second: 1.07 {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
step: 7182 steps_per_second: 1.07 {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
step: 7695 steps_per_second: 1.07 {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
:::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:01.002238 140547769902912 mlp_log.py:80] :::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.832513 140547769902912 mlp_log.py:80] :::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.839387 140547769902912 mlp_log.py:80] :::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.840405 140547769902912 mlp_log.py:80] :::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.841379 140547769902912 mlp_log.py:80] :::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:01:11.851153 140547769902912 controller.py:220] step: 7695 evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
I0319 15:01:11.851322 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7695 of 36423
I0319 15:09:09.903125 140547769902912 keras_utils.py:120] TimeHistory: 2678.59 examples/second between steps 7695 and 8208
I0319 15:09:09.907292 140547769902912 controller.py:220] step: 8208 steps_per_second: 1.05 {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
I0319 15:09:09.907462 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8208 of 36423
I0319 15:17:08.328512 140547769902912 keras_utils.py:120] TimeHistory: 2676.53 examples/second between steps 8208 and 8721
I0319 15:17:08.332779 140547769902912 controller.py:220] step: 8721 steps_per_second: 1.07 {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
I0319 15:17:08.332940 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8721 of 36423
I0319 15:25:06.558547 140547769902912 keras_utils.py:120] TimeHistory: 2677.62 examples/second between steps 8721 and 9234
I0319 15:25:06.562764 140547769902912 controller.py:220] step: 9234 steps_per_second: 1.07 {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
I0319 15:25:06.562925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9234 of 36423
I0319 15:33:04.438484 140547769902912 keras_utils.py:120] TimeHistory: 2679.59 examples/second between steps 9234 and 9747
I0319 15:33:04.442654 140547769902912 controller.py:220] step: 9747 steps_per_second: 1.07 {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
I0319 15:33:04.442804 140547769902912 controller.py:185] Start evaluation at step: 9747
step: 7695 evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
step: 8208 steps_per_second: 1.05 {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
step: 8721 steps_per_second: 1.07 {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
step: 9234 steps_per_second: 1.07 {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
step: 9747 steps_per_second: 1.07 {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
:::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:04.930735 140547769902912 mlp_log.py:80] :::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.094051 140547769902912 mlp_log.py:80] :::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.100932 140547769902912 mlp_log.py:80] :::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.101949 140547769902912 mlp_log.py:80] :::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.102918 140547769902912 mlp_log.py:80] :::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 15:33:16.112729 140547769902912 controller.py:220] step: 9747 evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
I0319 15:33:16.112884 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9747 of 36423
I0319 15:41:14.392338 140547769902912 keras_utils.py:120] TimeHistory: 2677.32 examples/second between steps 9747 and 10260
I0319 15:41:14.396505 140547769902912 controller.py:220] step: 10260 steps_per_second: 1.05 {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
I0319 15:41:14.396659 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10260 of 36423
I0319 15:49:11.961558 140547769902912 keras_utils.py:120] TimeHistory: 2681.33 examples/second between steps 10260 and 10773
I0319 15:49:11.965767 140547769902912 controller.py:220] step: 10773 steps_per_second: 1.07 {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
I0319 15:49:11.965925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10773 of 36423
I0319 15:57:09.164847 140547769902912 keras_utils.py:120] TimeHistory: 2683.39 examples/second between steps 10773 and 11286
I0319 15:57:09.168977 140547769902912 controller.py:220] step: 11286 steps_per_second: 1.08 {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
I0319 15:57:09.169133 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11286 of 36423
I0319 16:05:06.888276 140547769902912 keras_utils.py:120] TimeHistory: 2680.46 examples/second between steps 11286 and 11799
I0319 16:05:06.892483 140547769902912 controller.py:220] step: 11799 steps_per_second: 1.07 {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
I0319 16:05:06.892634 140547769902912 controller.py:185] Start evaluation at step: 11799
step: 9747 evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
step: 10260 steps_per_second: 1.05 {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
step: 10773 steps_per_second: 1.07 {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
step: 11286 steps_per_second: 1.08 {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
step: 11799 steps_per_second: 1.07 {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
:::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:07.376655 140547769902912 mlp_log.py:80] :::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.161060 140547769902912 mlp_log.py:80] :::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.167979 140547769902912 mlp_log.py:80] :::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.168991 140547769902912 mlp_log.py:80] :::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.169961 140547769902912 mlp_log.py:80] :::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:05:18.179913 140547769902912 controller.py:220] step: 11799 evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
I0319 16:05:18.180072 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11799 of 36423
I0319 16:13:15.017472 140547769902912 keras_utils.py:120] TimeHistory: 2685.42 examples/second between steps 11799 and 12312
I0319 16:13:15.021653 140547769902912 controller.py:220] step: 12312 steps_per_second: 1.05 {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
I0319 16:13:15.021814 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12312 of 36423
I0319 16:21:11.873815 140547769902912 keras_utils.py:120] TimeHistory: 2685.34 examples/second between steps 12312 and 12825
I0319 16:21:11.877966 140547769902912 controller.py:220] step: 12825 steps_per_second: 1.08 {'train_loss': 39.75526, 'train_accuracy': 0.627093}
I0319 16:21:11.878120 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12825 of 36423
I0319 16:29:08.757629 140547769902912 keras_utils.py:120] TimeHistory: 2685.19 examples/second between steps 12825 and 13338
I0319 16:29:08.761925 140547769902912 controller.py:220] step: 13338 steps_per_second: 1.08 {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
I0319 16:29:08.762086 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13338 of 36423
I0319 16:37:05.957099 140547769902912 keras_utils.py:120] TimeHistory: 2683.41 examples/second between steps 13338 and 13851
I0319 16:37:05.961228 140547769902912 controller.py:220] step: 13851 steps_per_second: 1.08 {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
I0319 16:37:05.961388 140547769902912 controller.py:185] Start evaluation at step: 13851
step: 11799 evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
step: 12312 steps_per_second: 1.05 {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
step: 12825 steps_per_second: 1.08 {'train_loss': 39.75526, 'train_accuracy': 0.627093}
step: 13338 steps_per_second: 1.08 {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
step: 13851 steps_per_second: 1.08 {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
:::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:06.441277 140547769902912 mlp_log.py:80] :::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.448269 140547769902912 mlp_log.py:80] :::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.455250 140547769902912 mlp_log.py:80] :::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.456276 140547769902912 mlp_log.py:80] :::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.457254 140547769902912 mlp_log.py:80] :::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 16:37:17.467283 140547769902912 controller.py:220] step: 13851 evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
I0319 16:37:17.467454 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13851 of 36423
I0319 16:45:14.272286 140547769902912 keras_utils.py:120] TimeHistory: 2685.60 examples/second between steps 13851 and 14364
I0319 16:45:14.276514 140547769902912 controller.py:220] step: 14364 steps_per_second: 1.05 {'train_loss': 38.50588, 'train_accuracy': 0.645977}
I0319 16:45:14.276674 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14364 of 36423
I0319 16:53:12.242927 140547769902912 keras_utils.py:120] TimeHistory: 2679.08 examples/second between steps 14364 and 14877
I0319 16:53:12.247173 140547769902912 controller.py:220] step: 14877 steps_per_second: 1.07 {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
I0319 16:53:12.247342 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14877 of 36423
I0319 17:01:08.925324 140547769902912 keras_utils.py:120] TimeHistory: 2686.32 examples/second between steps 14877 and 15390
I0319 17:01:08.929522 140547769902912 controller.py:220] step: 15390 steps_per_second: 1.08 {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
I0319 17:01:08.929681 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15390 of 36423
I0319 17:09:05.460300 140547769902912 keras_utils.py:120] TimeHistory: 2687.14 examples/second between steps 15390 and 15903
I0319 17:09:05.464558 140547769902912 controller.py:220] step: 15903 steps_per_second: 1.08 {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
I0319 17:09:05.464712 140547769902912 controller.py:185] Start evaluation at step: 15903
step: 13851 evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
step: 14364 steps_per_second: 1.05 {'train_loss': 38.50588, 'train_accuracy': 0.645977}
step: 14877 steps_per_second: 1.07 {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
step: 15390 steps_per_second: 1.08 {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
step: 15903 steps_per_second: 1.08 {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
:::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:05.958450 140547769902912 mlp_log.py:80] :::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.709334 140547769902912 mlp_log.py:80] :::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.716322 140547769902912 mlp_log.py:80] :::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.717343 140547769902912 mlp_log.py:80] :::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.718302 140547769902912 mlp_log.py:80] :::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:09:16.728244 140547769902912 controller.py:220] step: 15903 evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
I0319 17:09:16.728415 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15903 of 36423
I0319 17:17:12.753624 140547769902912 keras_utils.py:120] TimeHistory: 2690.00 examples/second between steps 15903 and 16416
I0319 17:17:12.757766 140547769902912 controller.py:220] step: 16416 steps_per_second: 1.05 {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
I0319 17:17:12.757923 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16416 of 36423
I0319 17:25:08.728839 140547769902912 keras_utils.py:120] TimeHistory: 2690.31 examples/second between steps 16416 and 16929
I0319 17:25:08.733042 140547769902912 controller.py:220] step: 16929 steps_per_second: 1.08 {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
I0319 17:25:08.733199 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16929 of 36423
I0319 17:33:05.759370 140547769902912 keras_utils.py:120] TimeHistory: 2684.36 examples/second between steps 16929 and 17442
I0319 17:33:05.763500 140547769902912 controller.py:220] step: 17442 steps_per_second: 1.08 {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
I0319 17:33:05.763653 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17442 of 36423
I0319 17:41:02.449225 140547769902912 keras_utils.py:120] TimeHistory: 2686.27 examples/second between steps 17442 and 17955
I0319 17:41:02.453442 140547769902912 controller.py:220] step: 17955 steps_per_second: 1.08 {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
I0319 17:41:02.453614 140547769902912 controller.py:185] Start evaluation at step: 17955
step: 15903 evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
step: 16416 steps_per_second: 1.05 {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
step: 16929 steps_per_second: 1.08 {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
step: 17442 steps_per_second: 1.08 {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
step: 17955 steps_per_second: 1.08 {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
:::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:02.938170 140547769902912 mlp_log.py:80] :::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.817046 140547769902912 mlp_log.py:80] :::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.824302 140547769902912 mlp_log.py:80] :::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.825345 140547769902912 mlp_log.py:80] :::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.826306 140547769902912 mlp_log.py:80] :::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 17:41:13.836492 140547769902912 controller.py:220] step: 17955 evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
I0319 17:41:13.836662 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17955 of 36423
I0319 17:49:10.447629 140547769902912 keras_utils.py:120] TimeHistory: 2686.69 examples/second between steps 17955 and 18468
I0319 17:49:10.451847 140547769902912 controller.py:220] step: 18468 steps_per_second: 1.05 {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
I0319 17:49:10.452003 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18468 of 36423
I0319 17:57:07.115317 140547769902912 keras_utils.py:120] TimeHistory: 2686.40 examples/second between steps 18468 and 18981
I0319 17:57:07.119469 140547769902912 controller.py:220] step: 18981 steps_per_second: 1.08 {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
I0319 17:57:07.119627 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18981 of 36423
I0319 18:05:03.520790 140547769902912 keras_utils.py:120] TimeHistory: 2687.88 examples/second between steps 18981 and 19494
I0319 18:05:03.524950 140547769902912 controller.py:220] step: 19494 steps_per_second: 1.08 {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
I0319 18:05:03.525108 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 19494 of 36423
I0319 18:13:00.146009 140547769902912 keras_utils.py:120] TimeHistory: 2686.66 examples/second between steps 19494 and 20007
I0319 18:13:00.150213 140547769902912 controller.py:220] step: 20007 steps_per_second: 1.08 {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
I0319 18:13:00.150395 140547769902912 controller.py:185] Start evaluation at step: 20007
step: 17955 evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
step: 18468 steps_per_second: 1.05 {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
step: 18981 steps_per_second: 1.08 {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
step: 19494 steps_per_second: 1.08 {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
step: 20007 steps_per_second: 1.08 {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
:::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:00.638623 140547769902912 mlp_log.py:80] :::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.473854 140547769902912 mlp_log.py:80] :::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.482290 140547769902912 mlp_log.py:80] :::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.483335 140547769902912 mlp_log.py:80] :::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.484290 140547769902912 mlp_log.py:80] :::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:13:11.494453 140547769902912 controller.py:220] step: 20007 evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
I0319 18:13:11.494655 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20007 of 36423
I0319 18:21:07.807034 140547769902912 keras_utils.py:120] TimeHistory: 2688.38 examples/second between steps 20007 and 20520
I0319 18:21:07.811231 140547769902912 controller.py:220] step: 20520 steps_per_second: 1.05 {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
I0319 18:21:07.811421 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20520 of 36423
I0319 18:29:04.059954 140547769902912 keras_utils.py:120] TimeHistory: 2688.74 examples/second between steps 20520 and 21033
I0319 18:29:04.064098 140547769902912 controller.py:220] step: 21033 steps_per_second: 1.08 {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
I0319 18:29:04.064261 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21033 of 36423
I0319 18:37:00.501375 140547769902912 keras_utils.py:120] TimeHistory: 2687.68 examples/second between steps 21033 and 21546
I0319 18:37:00.505643 140547769902912 controller.py:220] step: 21546 steps_per_second: 1.08 {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
I0319 18:37:00.505825 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21546 of 36423
I0319 18:44:57.448547 140547769902912 keras_utils.py:120] TimeHistory: 2684.84 examples/second between steps 21546 and 22059
I0319 18:44:57.453764 140547769902912 controller.py:220] step: 22059 steps_per_second: 1.08 {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
I0319 18:44:57.453983 140547769902912 controller.py:185] Start evaluation at step: 22059
step: 20007 evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
step: 20520 steps_per_second: 1.05 {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
step: 21033 steps_per_second: 1.08 {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
step: 21546 steps_per_second: 1.08 {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
step: 22059 steps_per_second: 1.08 {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
:::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:44:57.962047 140547769902912 mlp_log.py:80] :::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.834538 140547769902912 mlp_log.py:80] :::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.841803 140547769902912 mlp_log.py:80] :::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.842825 140547769902912 mlp_log.py:80] :::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.843794 140547769902912 mlp_log.py:80] :::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 18:45:08.853801 140547769902912 controller.py:220] step: 22059 evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
I0319 18:45:08.853981 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22059 of 36423
I0319 18:53:05.613945 140547769902912 keras_utils.py:120] TimeHistory: 2685.85 examples/second between steps 22059 and 22572
I0319 18:53:05.618196 140547769902912 controller.py:220] step: 22572 steps_per_second: 1.05 {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
I0319 18:53:05.618384 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22572 of 36423
I0319 19:01:03.341272 140547769902912 keras_utils.py:120] TimeHistory: 2680.44 examples/second between steps 22572 and 23085
I0319 19:01:03.345571 140547769902912 controller.py:220] step: 23085 steps_per_second: 1.07 {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
I0319 19:01:03.345741 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23085 of 36423
I0319 19:09:00.970286 140547769902912 keras_utils.py:120] TimeHistory: 2680.99 examples/second between steps 23085 and 23598
I0319 19:09:00.974560 140547769902912 controller.py:220] step: 23598 steps_per_second: 1.07 {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
I0319 19:09:00.974729 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23598 of 36423
I0319 19:16:57.904174 140547769902912 keras_utils.py:120] TimeHistory: 2684.90 examples/second between steps 23598 and 24111
I0319 19:16:57.908411 140547769902912 controller.py:220] step: 24111 steps_per_second: 1.08 {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
I0319 19:16:57.908590 140547769902912 controller.py:185] Start evaluation at step: 24111
step: 22059 evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
step: 22572 steps_per_second: 1.05 {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
step: 23085 steps_per_second: 1.07 {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
step: 23598 steps_per_second: 1.07 {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
step: 24111 steps_per_second: 1.08 {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
:::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:16:58.442478 140547769902912 mlp_log.py:80] :::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.277749 140547769902912 mlp_log.py:80] :::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.285157 140547769902912 mlp_log.py:80] :::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.286182 140547769902912 mlp_log.py:80] :::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.287138 140547769902912 mlp_log.py:80] :::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:17:09.297158 140547769902912 controller.py:220] step: 24111 evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
I0319 19:17:09.297350 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24111 of 36423
I0319 19:25:05.843054 140547769902912 keras_utils.py:120] TimeHistory: 2687.06 examples/second between steps 24111 and 24624
I0319 19:25:05.847337 140547769902912 controller.py:220] step: 24624 steps_per_second: 1.05 {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
I0319 19:25:05.847517 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24624 of 36423
I0319 19:33:01.919262 140547769902912 keras_utils.py:120] TimeHistory: 2689.74 examples/second between steps 24624 and 25137
I0319 19:33:01.923496 140547769902912 controller.py:220] step: 25137 steps_per_second: 1.08 {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
I0319 19:33:01.923671 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25137 of 36423
I0319 19:40:58.928619 140547769902912 keras_utils.py:120] TimeHistory: 2684.48 examples/second between steps 25137 and 25650
I0319 19:40:58.932954 140547769902912 controller.py:220] step: 25650 steps_per_second: 1.08 {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
I0319 19:40:58.933148 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25650 of 36423
I0319 19:48:55.050780 140547769902912 keras_utils.py:120] TimeHistory: 2689.48 examples/second between steps 25650 and 26163
I0319 19:48:55.054964 140547769902912 controller.py:220] step: 26163 steps_per_second: 1.08 {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
I0319 19:48:55.055132 140547769902912 controller.py:185] Start evaluation at step: 26163
step: 24111 evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
step: 24624 steps_per_second: 1.05 {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
step: 25137 steps_per_second: 1.08 {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
step: 25650 steps_per_second: 1.08 {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
step: 26163 steps_per_second: 1.08 {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
:::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:48:55.547407 140547769902912 mlp_log.py:80] :::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.552730 140547769902912 mlp_log.py:80] :::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.559940 140547769902912 mlp_log.py:80] :::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.560959 140547769902912 mlp_log.py:80] :::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.561913 140547769902912 mlp_log.py:80] :::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 19:49:06.571880 140547769902912 controller.py:220] step: 26163 evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
I0319 19:49:06.572060 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26163 of 36423
I0319 19:57:04.827503 140547769902912 keras_utils.py:120] TimeHistory: 2677.45 examples/second between steps 26163 and 26676
I0319 19:57:04.831851 140547769902912 controller.py:220] step: 26676 steps_per_second: 1.05 {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
I0319 19:57:04.832029 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26676 of 36423
I0319 20:05:01.335356 140547769902912 keras_utils.py:120] TimeHistory: 2687.31 examples/second between steps 26676 and 27189
I0319 20:05:01.339524 140547769902912 controller.py:220] step: 27189 steps_per_second: 1.08 {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
I0319 20:05:01.339692 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27189 of 36423
I0319 20:12:58.183544 140547769902912 keras_utils.py:120] TimeHistory: 2685.39 examples/second between steps 27189 and 27702
I0319 20:12:58.187861 140547769902912 controller.py:220] step: 27702 steps_per_second: 1.08 {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
I0319 20:12:58.188049 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27702 of 36423
I0319 20:20:54.531436 140547769902912 keras_utils.py:120] TimeHistory: 2688.23 examples/second between steps 27702 and 28215
I0319 20:20:54.535721 140547769902912 controller.py:220] step: 28215 steps_per_second: 1.08 {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
I0319 20:20:54.535894 140547769902912 controller.py:185] Start evaluation at step: 28215
step: 26163 evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
step: 26676 steps_per_second: 1.05 {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
step: 27189 steps_per_second: 1.08 {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
step: 27702 steps_per_second: 1.08 {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
step: 28215 steps_per_second: 1.08 {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
:::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:20:55.034361 140547769902912 mlp_log.py:80] :::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.511004 140547769902912 mlp_log.py:80] :::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.518299 140547769902912 mlp_log.py:80] :::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.519329 140547769902912 mlp_log.py:80] :::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.520274 140547769902912 mlp_log.py:80] :::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:21:05.530137 140547769902912 controller.py:220] step: 28215 evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
I0319 20:21:05.530332 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28215 of 36423
I0319 20:29:02.549445 140547769902912 keras_utils.py:120] TimeHistory: 2684.41 examples/second between steps 28215 and 28728
I0319 20:29:02.553695 140547769902912 controller.py:220] step: 28728 steps_per_second: 1.05 {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
I0319 20:29:02.553875 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28728 of 36423
I0319 20:36:59.701035 140547769902912 keras_utils.py:120] TimeHistory: 2683.68 examples/second between steps 28728 and 29241
I0319 20:36:59.705335 140547769902912 controller.py:220] step: 29241 steps_per_second: 1.08 {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
I0319 20:36:59.705515 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29241 of 36423
I0319 20:44:56.506052 140547769902912 keras_utils.py:120] TimeHistory: 2685.63 examples/second between steps 29241 and 29754
I0319 20:44:56.510352 140547769902912 controller.py:220] step: 29754 steps_per_second: 1.08 {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
I0319 20:44:56.510533 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29754 of 36423
I0319 20:52:52.982735 140547769902912 keras_utils.py:120] TimeHistory: 2687.48 examples/second between steps 29754 and 30267
I0319 20:52:52.987001 140547769902912 controller.py:220] step: 30267 steps_per_second: 1.08 {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
I0319 20:52:52.987169 140547769902912 controller.py:185] Start evaluation at step: 30267
step: 28215 evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
step: 28728 steps_per_second: 1.05 {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
step: 29241 steps_per_second: 1.08 {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
step: 29754 steps_per_second: 1.08 {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
step: 30267 steps_per_second: 1.08 {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
:::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:52:53.483794 140547769902912 mlp_log.py:80] :::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.441937 140547769902912 mlp_log.py:80] :::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.452280 140547769902912 mlp_log.py:80] :::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
:::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.453428 140547769902912 mlp_log.py:80] :::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
I0319 20:53:04.463417 140547769902912 controller.py:220] step: 30267 evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
step: 30267 evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
:::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.464306 140547769902912 mlp_log.py:80] :::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
:::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.465044 140547769902912 mlp_log.py:80] :::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
I0319 20:53:04.484413 140547769902912 resnet_ctl_imagenet_main.py:298] Run stats:
{'eval_loss': 0.22827734, 'eval_acc': 0.75942, 'train_loss': 28.79324, 'train_acc': 0.80263704, 'step_timestamp_log': ['BatchTimestamp<batch_index: 0, timestamp: 1679230834.281045>', 'BatchTimestamp<batch_index: 513, timestamp: 1679231312.765514>', 'BatchTimestamp<batch_index: 1026, timestamp: 1679231790.4766295>', 'BatchTimestamp<batch_index: 1539, timestamp: 1679232268.0622654>', 'BatchTimestamp<batch_index: 2052, timestamp: 1679232778.5840638>', 'BatchTimestamp<batch_index: 2565, timestamp: 1679233256.8333852>', 'BatchTimestamp<batch_index: 3078, timestamp: 1679233735.232925>', 'BatchTimestamp<batch_index: 3591, timestamp: 1679234213.5742714>', 'BatchTimestamp<batch_index: 4104, timestamp: 1679234703.2010417>', 'BatchTimestamp<batch_index: 4617, timestamp: 1679235181.7035873>', 'BatchTimestamp<batch_index: 5130, timestamp: 1679235658.7568266>', 'BatchTimestamp<batch_index: 5643, timestamp: 1679236136.837958>', 'BatchTimestamp<batch_index: 6156, timestamp: 1679236625.7255764>', 'BatchTimestamp<batch_index: 6669, timestamp: 1679237103.411414>', 'BatchTimestamp<batch_index: 7182, timestamp: 1679237582.1563838>', 'BatchTimestamp<batch_index: 7695, timestamp: 1679238060.5108216>', 'BatchTimestamp<batch_index: 8208, timestamp: 1679238549.9029462>', 'BatchTimestamp<batch_index: 8721, timestamp: 1679239028.3283317>', 'BatchTimestamp<batch_index: 9234, timestamp: 1679239506.558369>', 'BatchTimestamp<batch_index: 9747, timestamp: 1679239984.4382937>', 'BatchTimestamp<batch_index: 10260, timestamp: 1679240474.3921533>', 'BatchTimestamp<batch_index: 10773, timestamp: 1679240951.96138>', 'BatchTimestamp<batch_index: 11286, timestamp: 1679241429.1646736>', 'BatchTimestamp<batch_index: 11799, timestamp: 1679241906.888098>', 'BatchTimestamp<batch_index: 12312, timestamp: 1679242395.0172863>', 'BatchTimestamp<batch_index: 12825, timestamp: 1679242871.8736327>', 'BatchTimestamp<batch_index: 13338, timestamp: 1679243348.7574499>', 'BatchTimestamp<batch_index: 13851, timestamp: 
1679243825.9569237>', 'BatchTimestamp<batch_index: 14364, timestamp: 1679244314.2721043>', 'BatchTimestamp<batch_index: 14877, timestamp: 1679244792.2427475>', 'BatchTimestamp<batch_index: 15390, timestamp: 1679245268.9251325>', 'BatchTimestamp<batch_index: 15903, timestamp: 1679245745.4601164>', 'BatchTimestamp<batch_index: 16416, timestamp: 1679246232.7534444>', 'BatchTimestamp<batch_index: 16929, timestamp: 1679246708.728656>', 'BatchTimestamp<batch_index: 17442, timestamp: 1679247185.7591805>', 'BatchTimestamp<batch_index: 17955, timestamp: 1679247662.4490402>', 'BatchTimestamp<batch_index: 18468, timestamp: 1679248150.4474506>', 'BatchTimestamp<batch_index: 18981, timestamp: 1679248627.1151292>', 'BatchTimestamp<batch_index: 19494, timestamp: 1679249103.5206127>', 'BatchTimestamp<batch_index: 20007, timestamp: 1679249580.1458325>', 'BatchTimestamp<batch_index: 20520, timestamp: 1679250067.8068252>', 'BatchTimestamp<batch_index: 21033, timestamp: 1679250544.0597591>', 'BatchTimestamp<batch_index: 21546, timestamp: 1679251020.501157>', 'BatchTimestamp<batch_index: 22059, timestamp: 1679251497.4479887>', 'BatchTimestamp<batch_index: 22572, timestamp: 1679251985.6137266>', 'BatchTimestamp<batch_index: 23085, timestamp: 1679252463.3410485>', 'BatchTimestamp<batch_index: 23598, timestamp: 1679252940.9701052>', 'BatchTimestamp<batch_index: 24111, timestamp: 1679253417.9039862>', 'BatchTimestamp<batch_index: 24624, timestamp: 1679253905.8428304>', 'BatchTimestamp<batch_index: 25137, timestamp: 1679254381.919039>', 'BatchTimestamp<batch_index: 25650, timestamp: 1679254858.9284008>', 'BatchTimestamp<batch_index: 26163, timestamp: 1679255335.0505683>', 'BatchTimestamp<batch_index: 26676, timestamp: 1679255824.8272724>', 'BatchTimestamp<batch_index: 27189, timestamp: 1679256301.335169>', 'BatchTimestamp<batch_index: 27702, timestamp: 1679256778.1833446>', 'BatchTimestamp<batch_index: 28215, timestamp: 1679257254.531205>', 'BatchTimestamp<batch_index: 28728, timestamp: 
1679257742.5492072>', 'BatchTimestamp<batch_index: 29241, timestamp: 1679258219.7008102>', 'BatchTimestamp<batch_index: 29754, timestamp: 1679258696.5058227>', 'BatchTimestamp<batch_index: 30267, timestamp: 1679259172.9825177>'], 'train_finish_time': 1679259184.4644349, 'avg_exp_per_second': 2683.1639506599217}
absl-py
pandas
numpy
tqdm
git+https://github.com/mlcommons/logging.git@0.7.0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import hostlist
import os
import re
import json
from tf2_common.modeling import performance
from tf2_common.training import controller
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.logs import logger
from tf2_common.utils.misc import distribution_utils
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.misc import model_helpers
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_runnable
# Flags registered at import time by this custom-training-loop runner. The
# help strings are shown to users and are left untouched.

# Toggle for wrapping the train/test step in a tf.function (default: on).
flags.DEFINE_boolean(name='use_tf_function', default=True,
                     help='Wrap the train and test step inside a '
                     'tf.function.')
# Selects a single L2 loss over concatenated weights instead of per-layer
# Keras L2 losses (default: off).
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
                     help='Calculate L2_loss on concatenated weights, '
                     'instead of using Keras per-layer L2 loss.')
# Input-pipeline option; per the help text it only matters together with the
# `cache` flag, which is defined elsewhere.
flags.DEFINE_boolean(name='cache_decoded_image', default=False,
                     help='Whether or not to cache decoded images in the '
                     'input pipeline. If this flag and `cache` is enabled, '
                     'then TFExample protos will be parsed and then cached '
                     'which reduces the load on hosts.')
# When set, run() asks the controller to warm up before logging init_stop.
flags.DEFINE_boolean(name='enable_device_warmup', default=False,
                     help='Whether or not to enable device warmup. This '
                     'includes training on dummy data and enabling graph/XLA '
                     'compilation before run_start.')
# Number of warmup steps; run() only uses it when enable_device_warmup is set.
flags.DEFINE_integer(name='device_warmup_steps', default=1,
                     help='The number of steps to apply for device warmup.')
# Used by run() for the 'submission_platform' label and the 'model_bn_span'
# value when --tpu is set.
flags.DEFINE_integer(name='num_replicas', default=32,
                     help='The number of TPU cores to use, '
                     'for log printout only.')
def build_stats(runnable, time_callback):
  """Collects final metric values and timing information into one dict.

  Args:
    runnable: Object exposing `flags_obj.skip_eval` and the metric attributes
      `train_loss`, `train_accuracy` and (read only when evaluation is not
      skipped) `test_loss`, `test_accuracy`. Each truthy metric must provide
      `.result().numpy()`.
    time_callback: Time tracking callback instance, or a falsy value to omit
      timing entries.

  Returns:
    Dictionary that may contain 'eval_loss', 'eval_acc', 'train_loss',
    'train_acc', 'step_timestamp_log', 'train_finish_time' and
    'avg_exp_per_second', inserted in that order.
  """
  results = {}

  # (stats key, runnable attribute) pairs, in the order they are reported.
  metric_attrs = [('train_loss', 'train_loss'),
                  ('train_acc', 'train_accuracy')]
  if not runnable.flags_obj.skip_eval:
    # Eval metrics are only read when evaluation actually ran.
    metric_attrs = [('eval_loss', 'test_loss'),
                    ('eval_acc', 'test_accuracy')] + metric_attrs

  for stats_key, attr_name in metric_attrs:
    metric = getattr(runnable, attr_name)
    if metric:
      results[stats_key] = metric.result().numpy()

  if not time_callback:
    return results

  results['step_timestamp_log'] = time_callback.timestamp_log
  results['train_finish_time'] = time_callback.train_finish_time
  # Throughput is only reported once at least one epoch runtime was recorded.
  if time_callback.epoch_runtime_log:
    results['avg_exp_per_second'] = time_callback.average_examples_per_second
  return results
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  Emits the MLPerf log events (from 'cache_clear'/'init_start' through
  'run_final') around session configuration, distribution-strategy creation,
  construction of the ResnetRunnable and Controller, optional device warmup,
  and the training call itself.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.
      NOTE(review): nothing in this function raises ValueError directly;
      presumably it originates in one of the helpers -- confirm.

  Returns:
    Dictionary of training and eval stats.
  """
  # MLPerf submission header; these events precede any real setup work.
  mlp_log.mlperf_print('cache_clear', True)
  mlp_log.mlperf_print('init_start', None)
  mlp_log.mlperf_print('submission_benchmark', 'resnet')
  mlp_log.mlperf_print('submission_division', 'closed')
  mlp_log.mlperf_print('submission_org', 'google')
  # The platform label depends on whether a TPU address was supplied.
  mlp_log.mlperf_print(
      'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
      if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
  mlp_log.mlperf_print('submission_status', 'cloud')
  common.print_flags(flags_obj)
  # In this variant the task index is only printed; the code that consumed it
  # is the disabled block below.
  num_index = flags_obj.task_index
  print('num_index',num_index)
  # Disabled SLURM-based TF_CONFIG construction, kept for reference:
  # worker = []
  # nodelist = os.environ["SLURM_JOB_NODELIST"]
  # nodename = os.environ["SLURMD_NODENAME"]
  # nodelist = hostlist.expand_hostlist(nodelist)
  # print('print nodelist2',nodelist)
  # num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
  # port_number =40000
  # worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
  ## print('print worker_nodes',worker_nodes)
  # for node in worker_nodes:
  # for index in range(4):
  # print('node',node)
  # worker_sockets = ":".join([node, str(port_number + index )])
  # worker.append(worker_sockets)
  # os.environ['TF_CONFIG'] = json.dumps({
  # 'cluster': {
  # 'worker': worker
  # },
  # 'task': {'type': 'worker', 'index': num_index}
  # })
  #
  #
  # print({
  # 'cluster': {
  # 'worker': worker
  # },
  # 'task': {'type': 'worker', 'index': num_index}
  # })
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla)
  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
  if tf.config.list_physical_devices('GPU'):
    if flags_obj.tf_gpu_thread_mode:
      datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
          num_gpus=flags_obj.num_gpus)
      # Only adopt the helper's value when the user did not set the flag.
      if not flags_obj.datasets_num_private_threads:
        flags_obj.datasets_num_private_threads = datasets_num_private_threads
    common.set_cudnn_batchnorm_mode()
  # TODO(anj-s): Set data_format without using Keras.
  data_format = flags_obj.data_format
  if data_format is None:
    # Default layout follows the build: channels_first for CUDA builds.
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)
  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu,
      tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
  # strategy = tf.distribute.get_strategy()
  # print('after distribution number of replicas : {}'.format(
  # strategy.num_replicas_in_sync))
  mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
  mlp_log.mlperf_print('train_samples',
                       imagenet_preprocessing.NUM_IMAGES['train'])
  mlp_log.mlperf_print('eval_samples',
                       imagenet_preprocessing.NUM_IMAGES['validation'])
  # NOTE(review): when --tpu is unset this divides by flags_obj.num_gpus, so
  # num_gpus == 0 would raise ZeroDivisionError here.
  mlp_log.mlperf_print(
      'model_bn_span',
      int(flags_obj.batch_size /
          (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
  per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
  eval_steps = common.get_num_eval_steps(flags_obj)
  # A single loop never spans more than one epoch.
  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
  logging.info(
      'Training %d epochs, each epoch has %d steps, '
      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
      train_epochs * per_epoch_steps, eval_steps)
  time_callback = keras_utils.TimeHistory(
      flags_obj.batch_size,
      flags_obj.log_steps,
      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
  # The runnable is constructed inside the strategy scope.
  with distribution_utils.get_strategy_scope(strategy):
    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
  # Intervals below are expressed in steps; None disables the feature.
  eval_interval = (
      flags_obj.epochs_between_evals *
      per_epoch_steps if not flags_obj.skip_eval else None)
  eval_offset = (
      flags_obj.eval_offset_epochs *
      per_epoch_steps if not flags_obj.skip_eval else 0)
  # A non-zero offset is reduced by one eval interval before it is handed to
  # the controller. With skip_eval the offset is 0, so the None interval is
  # never used in arithmetic.
  if eval_offset != 0:
    eval_offset -= eval_interval
  checkpoint_interval = (
      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
  checkpoint_manager = tf.train.CheckpointManager(
      runnable.checkpoint,
      directory=flags_obj.model_dir,
      max_to_keep=10,
      step_counter=runnable.global_step,
      checkpoint_interval=checkpoint_interval)
  device_warmup_steps = (flags_obj.device_warmup_steps
                         if flags_obj.enable_device_warmup else 0)
  if flags_obj.enable_device_warmup:
    logging.info('Warmup for %d steps.', device_warmup_steps)
  resnet_controller = controller.Controller(
      strategy,
      runnable.train,
      runnable.evaluate,
      runnable.warmup,
      global_step=runnable.global_step,
      steps_per_loop=steps_per_loop,
      train_steps=per_epoch_steps * train_epochs,
      device_warmup_steps=device_warmup_steps,
      checkpoint_manager=checkpoint_manager,
      summary_interval=summary_interval,
      eval_steps=eval_steps,
      eval_interval=eval_interval,
      eval_offset=eval_offset)
  # Warmup happens before 'init_stop' is logged.
  if flags_obj.enable_device_warmup:
    resnet_controller.warmup()
  mlp_log.mlperf_print('init_stop', None)
  # profile_steps is parsed as a comma-separated list of ints; a negative
  # first entry triggers runnable.trace_start(-1) before training begins.
  profile_steps = flags_obj.profile_steps
  if profile_steps:
    profile_steps = [int(i) for i in profile_steps.split(',')]
    if profile_steps[0] < 0:
      runnable.trace_start(-1)
  time_callback.on_train_begin()
  mlp_log.mlperf_print('run_start', None)
  # First block covers eval_offset_epochs when set, otherwise one eval period.
  mlp_log.mlperf_print(
      'block_start',
      None,
      metadata={
          'first_epoch_num':
              1,
          'epoch_count':
              (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
               else flags_obj.epochs_between_evals)
      })
  resnet_controller.train(evaluate=not flags_obj.skip_eval)
  mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
  time_callback.on_train_end()
  mlp_log.mlperf_print('run_final', None)
  stats = build_stats(runnable, time_callback)
  return stats
def define_imagenet_keras_flags():
  """Registers the flags shared with the Keras ImageNet scripts.

  Calls `common.define_keras_flags()`, then `flags_core.set_defaults()` with
  no overrides, and finally adopts the key flags of the `common` module via
  `flags.adopt_module_key_flags`.

  NOTE(review): the `__main__` block of this script calls
  `common.define_keras_flags()` directly rather than this helper -- confirm
  whether any other caller relies on it.
  """
  common.define_keras_flags()
  flags_core.set_defaults()
  flags.adopt_module_key_flags(common)
def main(_):
  """absl entry point: runs training and logs the resulting stats.

  Passes the parsed flags to `model_helpers.apply_clean`, executes `run`
  inside `logger.benchmark_context`, and logs the returned dictionary.

  Args:
    _: Positional arguments supplied by `app.run`; unused.
  """
  parsed_flags = flags.FLAGS
  # Disabled experiment: tf.keras.backend.set_floatx('float16')
  model_helpers.apply_clean(parsed_flags)
  with logger.benchmark_context(parsed_flags):
    run_stats = run(parsed_flags)
  logging.info('Run stats:\n%s', run_stats)
if __name__ == '__main__':
  # Script entry point: set absl log verbosity to INFO, register the flags
  # defined by `common`, then hand control to absl, which invokes main().
  logging.set_verbosity(logging.INFO)
  common.define_keras_flags()
  app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import hostlist
import os
import re
import json
from tf2_common.modeling import performance
from tf2_common.training import controller
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.logs import logger
from tf2_common.utils.misc import distribution_utils
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.misc import model_helpers
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_runnable
# Flags registered at import time by this custom-training-loop runner. The
# help strings are shown to users and are left untouched.

# Toggle for wrapping the train/test step in a tf.function (default: on).
flags.DEFINE_boolean(name='use_tf_function', default=True,
                     help='Wrap the train and test step inside a '
                     'tf.function.')
# Selects a single L2 loss over concatenated weights instead of per-layer
# Keras L2 losses (default: off).
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
                     help='Calculate L2_loss on concatenated weights, '
                     'instead of using Keras per-layer L2 loss.')
# Input-pipeline option; per the help text it only matters together with the
# `cache` flag, which is defined elsewhere.
flags.DEFINE_boolean(name='cache_decoded_image', default=False,
                     help='Whether or not to cache decoded images in the '
                     'input pipeline. If this flag and `cache` is enabled, '
                     'then TFExample protos will be parsed and then cached '
                     'which reduces the load on hosts.')
# When set, run() asks the controller to warm up before logging init_stop.
flags.DEFINE_boolean(name='enable_device_warmup', default=False,
                     help='Whether or not to enable device warmup. This '
                     'includes training on dummy data and enabling graph/XLA '
                     'compilation before run_start.')
# Number of warmup steps; run() only uses it when enable_device_warmup is set.
flags.DEFINE_integer(name='device_warmup_steps', default=1,
                     help='The number of steps to apply for device warmup.')
# Used by run() for the 'submission_platform' label and the 'model_bn_span'
# value when --tpu is set.
flags.DEFINE_integer(name='num_replicas', default=32,
                     help='The number of TPU cores to use, '
                     'for log printout only.')
def build_stats(runnable, time_callback):
  """Collects final metric values and timing information into one dict.

  Args:
    runnable: Object exposing `flags_obj.skip_eval` and the metric attributes
      `train_loss`, `train_accuracy` and (read only when evaluation is not
      skipped) `test_loss`, `test_accuracy`. Each truthy metric must provide
      `.result().numpy()`.
    time_callback: Time tracking callback instance, or a falsy value to omit
      timing entries.

  Returns:
    Dictionary that may contain 'eval_loss', 'eval_acc', 'train_loss',
    'train_acc', 'step_timestamp_log', 'train_finish_time' and
    'avg_exp_per_second', inserted in that order.
  """
  results = {}

  # (stats key, runnable attribute) pairs, in the order they are reported.
  metric_attrs = [('train_loss', 'train_loss'),
                  ('train_acc', 'train_accuracy')]
  if not runnable.flags_obj.skip_eval:
    # Eval metrics are only read when evaluation actually ran.
    metric_attrs = [('eval_loss', 'test_loss'),
                    ('eval_acc', 'test_accuracy')] + metric_attrs

  for stats_key, attr_name in metric_attrs:
    metric = getattr(runnable, attr_name)
    if metric:
      results[stats_key] = metric.result().numpy()

  if not time_callback:
    return results

  results['step_timestamp_log'] = time_callback.timestamp_log
  results['train_finish_time'] = time_callback.train_finish_time
  # Throughput is only reported once at least one epoch runtime was recorded.
  if time_callback.epoch_runtime_log:
    results['avg_exp_per_second'] = time_callback.average_examples_per_second
  return results
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  This variant first builds a multi-worker `TF_CONFIG` from SLURM environment
  variables, then emits the MLPerf log events around session configuration,
  distribution-strategy creation, construction of the ResnetRunnable and
  Controller, optional device warmup, and the training call itself.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    KeyError: If SLURM_JOB_NODELIST or SLURMD_NODENAME is not set in the
      environment (both are read with `os.environ[...]`).
    ValueError: If fp16 is passed as it is not currently supported.
      NOTE(review): nothing in this function raises ValueError directly;
      presumably it originates in one of the helpers -- confirm.

  Returns:
    Dictionary of training and eval stats.
  """
  # MLPerf submission header; these events precede any real setup work.
  mlp_log.mlperf_print('cache_clear', True)
  mlp_log.mlperf_print('init_start', None)
  mlp_log.mlperf_print('submission_benchmark', 'resnet')
  mlp_log.mlperf_print('submission_division', 'closed')
  mlp_log.mlperf_print('submission_org', 'google')
  # The platform label depends on whether a TPU address was supplied.
  mlp_log.mlperf_print(
      'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
      if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
  mlp_log.mlperf_print('submission_status', 'cloud')
  common.print_flags(flags_obj)
  # Index of this process in the worker list written to TF_CONFIG below.
  num_index = flags_obj.task_index
  print('num_index',num_index)
  # Build the cluster spec from the SLURM allocation.
  worker = []
  nodelist = os.environ["SLURM_JOB_NODELIST"]
  # NOTE(review): nodename is assigned but not used later in this function.
  nodename = os.environ["SLURMD_NODENAME"]
  nodelist = hostlist.expand_hostlist(nodelist)
  print('print nodelist2',nodelist)
  # NOTE(review): num_nodes is not used later in this function, and int()
  # raises TypeError if SLURM_JOB_NUM_NODES is unset (os.getenv returns None).
  num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
  port_number =40000
  # The `i >= 0` filter is always true, so every expanded node is kept.
  worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
  # print('print worker_nodes',worker_nodes)
  # Four "host:port" entries per node on ports 40000-40003.
  # NOTE(review): the 4 is hard-coded; presumably one worker per GPU on a
  # 4-GPU node -- confirm against the launch script.
  for node in worker_nodes:
    for index in range(4):
      print('node',node)
      worker_sockets = ":".join([node, str(port_number + index )])
      worker.append(worker_sockets)
  # Written before the distribution strategy is created further down.
  os.environ['TF_CONFIG'] = json.dumps({
      'cluster': {
          'worker': worker
      },
      'task': {'type': 'worker', 'index': num_index}
  })
  print({
      'cluster': {
          'worker': worker
      },
      'task': {'type': 'worker', 'index': num_index}
  })
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla)
  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
  if tf.config.list_physical_devices('GPU'):
    if flags_obj.tf_gpu_thread_mode:
      datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
          num_gpus=flags_obj.num_gpus)
      # Only adopt the helper's value when the user did not set the flag.
      if not flags_obj.datasets_num_private_threads:
        flags_obj.datasets_num_private_threads = datasets_num_private_threads
    common.set_cudnn_batchnorm_mode()
  # TODO(anj-s): Set data_format without using Keras.
  data_format = flags_obj.data_format
  if data_format is None:
    # Default layout follows the build: channels_first for CUDA builds.
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)
  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu,
      tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
  # strategy = tf.distribute.get_strategy()
  # print('after distribution number of replicas : {}'.format(
  # strategy.num_replicas_in_sync))
  mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
  mlp_log.mlperf_print('train_samples',
                       imagenet_preprocessing.NUM_IMAGES['train'])
  mlp_log.mlperf_print('eval_samples',
                       imagenet_preprocessing.NUM_IMAGES['validation'])
  # NOTE(review): when --tpu is unset this divides by flags_obj.num_gpus, so
  # num_gpus == 0 would raise ZeroDivisionError here.
  mlp_log.mlperf_print(
      'model_bn_span',
      int(flags_obj.batch_size /
          (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
  per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
  eval_steps = common.get_num_eval_steps(flags_obj)
  # A single loop never spans more than one epoch.
  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
  logging.info(
      'Training %d epochs, each epoch has %d steps, '
      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
      train_epochs * per_epoch_steps, eval_steps)
  time_callback = keras_utils.TimeHistory(
      flags_obj.batch_size,
      flags_obj.log_steps,
      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
  # The runnable is constructed inside the strategy scope.
  with distribution_utils.get_strategy_scope(strategy):
    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
  # Intervals below are expressed in steps; None disables the feature.
  eval_interval = (
      flags_obj.epochs_between_evals *
      per_epoch_steps if not flags_obj.skip_eval else None)
  eval_offset = (
      flags_obj.eval_offset_epochs *
      per_epoch_steps if not flags_obj.skip_eval else 0)
  # A non-zero offset is reduced by one eval interval before it is handed to
  # the controller. With skip_eval the offset is 0, so the None interval is
  # never used in arithmetic.
  if eval_offset != 0:
    eval_offset -= eval_interval
  checkpoint_interval = (
      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
  checkpoint_manager = tf.train.CheckpointManager(
      runnable.checkpoint,
      directory=flags_obj.model_dir,
      max_to_keep=10,
      step_counter=runnable.global_step,
      checkpoint_interval=checkpoint_interval)
  device_warmup_steps = (flags_obj.device_warmup_steps
                         if flags_obj.enable_device_warmup else 0)
  if flags_obj.enable_device_warmup:
    logging.info('Warmup for %d steps.', device_warmup_steps)
  resnet_controller = controller.Controller(
      strategy,
      runnable.train,
      runnable.evaluate,
      runnable.warmup,
      global_step=runnable.global_step,
      steps_per_loop=steps_per_loop,
      train_steps=per_epoch_steps * train_epochs,
      device_warmup_steps=device_warmup_steps,
      checkpoint_manager=checkpoint_manager,
      summary_interval=summary_interval,
      eval_steps=eval_steps,
      eval_interval=eval_interval,
      eval_offset=eval_offset)
  # Warmup happens before 'init_stop' is logged.
  if flags_obj.enable_device_warmup:
    resnet_controller.warmup()
  mlp_log.mlperf_print('init_stop', None)
  # profile_steps is parsed as a comma-separated list of ints; a negative
  # first entry triggers runnable.trace_start(-1) before training begins.
  profile_steps = flags_obj.profile_steps
  if profile_steps:
    profile_steps = [int(i) for i in profile_steps.split(',')]
    if profile_steps[0] < 0:
      runnable.trace_start(-1)
  time_callback.on_train_begin()
  mlp_log.mlperf_print('run_start', None)
  # First block covers eval_offset_epochs when set, otherwise one eval period.
  mlp_log.mlperf_print(
      'block_start',
      None,
      metadata={
          'first_epoch_num':
              1,
          'epoch_count':
              (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
               else flags_obj.epochs_between_evals)
      })
  resnet_controller.train(evaluate=not flags_obj.skip_eval)
  mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
  time_callback.on_train_end()
  mlp_log.mlperf_print('run_final', None)
  stats = build_stats(runnable, time_callback)
  return stats
def define_imagenet_keras_flags():
  """Registers the flags shared with the Keras ImageNet scripts.

  Calls `common.define_keras_flags()`, then `flags_core.set_defaults()` with
  no overrides, and finally adopts the key flags of the `common` module via
  `flags.adopt_module_key_flags`.

  NOTE(review): the `__main__` block of this script calls
  `common.define_keras_flags()` directly rather than this helper -- confirm
  whether any other caller relies on it.
  """
  common.define_keras_flags()
  flags_core.set_defaults()
  flags.adopt_module_key_flags(common)
def main(_):
  """absl entry point: runs training and logs the resulting stats.

  Passes the parsed flags to `model_helpers.apply_clean`, executes `run`
  inside `logger.benchmark_context`, and logs the returned dictionary.

  Args:
    _: Positional arguments supplied by `app.run`; unused.
  """
  parsed_flags = flags.FLAGS
  # Disabled experiment: tf.keras.backend.set_floatx('float16')
  model_helpers.apply_clean(parsed_flags)
  with logger.benchmark_context(parsed_flags):
    run_stats = run(parsed_flags)
  logging.info('Run stats:\n%s', run_stats)
if __name__ == '__main__':
  # Script entry point: set absl log verbosity to INFO, register the flags
  # defined by `common`, then hand control to absl, which invokes main().
  logging.set_verbosity(logging.INFO)
  common.define_keras_flags()
  app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import hostlist
import os
import re
import json
from tf2_common.modeling import performance
from tf2_common.training import controller
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.logs import logger
from tf2_common.utils.misc import distribution_utils
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.misc import model_helpers
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_runnable
# Flags registered at import time by this custom-training-loop runner. The
# help strings are shown to users and are left untouched.

# Toggle for wrapping the train/test step in a tf.function (default: on).
flags.DEFINE_boolean(name='use_tf_function', default=True,
                     help='Wrap the train and test step inside a '
                     'tf.function.')
# Selects a single L2 loss over concatenated weights instead of per-layer
# Keras L2 losses (default: off).
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
                     help='Calculate L2_loss on concatenated weights, '
                     'instead of using Keras per-layer L2 loss.')
# Input-pipeline option; per the help text it only matters together with the
# `cache` flag, which is defined elsewhere.
flags.DEFINE_boolean(name='cache_decoded_image', default=False,
                     help='Whether or not to cache decoded images in the '
                     'input pipeline. If this flag and `cache` is enabled, '
                     'then TFExample protos will be parsed and then cached '
                     'which reduces the load on hosts.')
# When set, run() asks the controller to warm up before logging init_stop.
flags.DEFINE_boolean(name='enable_device_warmup', default=False,
                     help='Whether or not to enable device warmup. This '
                     'includes training on dummy data and enabling graph/XLA '
                     'compilation before run_start.')
# Number of warmup steps; run() only uses it when enable_device_warmup is set.
flags.DEFINE_integer(name='device_warmup_steps', default=1,
                     help='The number of steps to apply for device warmup.')
# Used by run() for the 'submission_platform' label and the 'model_bn_span'
# value when --tpu is set.
flags.DEFINE_integer(name='num_replicas', default=32,
                     help='The number of TPU cores to use, '
                     'for log printout only.')
def build_stats(runnable, time_callback):
  """Collects final metric values and timing information into one dict.

  Args:
    runnable: Object exposing `flags_obj.skip_eval` and the metric attributes
      `train_loss`, `train_accuracy` and (read only when evaluation is not
      skipped) `test_loss`, `test_accuracy`. Each truthy metric must provide
      `.result().numpy()`.
    time_callback: Time tracking callback instance, or a falsy value to omit
      timing entries.

  Returns:
    Dictionary that may contain 'eval_loss', 'eval_acc', 'train_loss',
    'train_acc', 'step_timestamp_log', 'train_finish_time' and
    'avg_exp_per_second', inserted in that order.
  """
  results = {}

  # (stats key, runnable attribute) pairs, in the order they are reported.
  metric_attrs = [('train_loss', 'train_loss'),
                  ('train_acc', 'train_accuracy')]
  if not runnable.flags_obj.skip_eval:
    # Eval metrics are only read when evaluation actually ran.
    metric_attrs = [('eval_loss', 'test_loss'),
                    ('eval_acc', 'test_accuracy')] + metric_attrs

  for stats_key, attr_name in metric_attrs:
    metric = getattr(runnable, attr_name)
    if metric:
      results[stats_key] = metric.result().numpy()

  if not time_callback:
    return results

  results['step_timestamp_log'] = time_callback.timestamp_log
  results['train_finish_time'] = time_callback.train_finish_time
  # Throughput is only reported once at least one epoch runtime was recorded.
  if time_callback.epoch_runtime_log:
    results['avg_exp_per_second'] = time_callback.average_examples_per_second
  return results
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  Emits the MLPerf log events (from 'cache_clear'/'init_start' through
  'run_final') around session configuration, distribution-strategy creation,
  construction of the ResnetRunnable and Controller, optional device warmup,
  and the training call itself.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.
      NOTE(review): nothing in this function raises ValueError directly;
      presumably it originates in one of the helpers -- confirm.

  Returns:
    Dictionary of training and eval stats.
  """
  # MLPerf submission header; these events precede any real setup work.
  mlp_log.mlperf_print('cache_clear', True)
  mlp_log.mlperf_print('init_start', None)
  mlp_log.mlperf_print('submission_benchmark', 'resnet')
  mlp_log.mlperf_print('submission_division', 'closed')
  mlp_log.mlperf_print('submission_org', 'google')
  # The platform label depends on whether a TPU address was supplied.
  mlp_log.mlperf_print(
      'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
      if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
  mlp_log.mlperf_print('submission_status', 'cloud')
  common.print_flags(flags_obj)
  # In this variant the task index is only printed; the code that consumed it
  # is the disabled block below.
  num_index = flags_obj.task_index
  print('num_index',num_index)
  # Disabled SLURM-based TF_CONFIG construction, kept for reference:
  # worker = []
  # nodelist = os.environ["SLURM_JOB_NODELIST"]
  # nodename = os.environ["SLURMD_NODENAME"]
  # nodelist = hostlist.expand_hostlist(nodelist)
  # print('print nodelist2',nodelist)
  # num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
  # port_number =40000
  # worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
  ## print('print worker_nodes',worker_nodes)
  # for node in worker_nodes:
  # for index in range(4):
  # print('node',node)
  # worker_sockets = ":".join([node, str(port_number + index )])
  # worker.append(worker_sockets)
  # os.environ['TF_CONFIG'] = json.dumps({
  # 'cluster': {
  # 'worker': worker
  # },
  # 'task': {'type': 'worker', 'index': num_index}
  # })
  #
  #
  # print({
  # 'cluster': {
  # 'worker': worker
  # },
  # 'task': {'type': 'worker', 'index': num_index}
  # })
  keras_utils.set_session_config(
      enable_eager=flags_obj.enable_eager,
      enable_xla=flags_obj.enable_xla)
  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
  if tf.config.list_physical_devices('GPU'):
    if flags_obj.tf_gpu_thread_mode:
      datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
          num_gpus=flags_obj.num_gpus)
      # Only adopt the helper's value when the user did not set the flag.
      if not flags_obj.datasets_num_private_threads:
        flags_obj.datasets_num_private_threads = datasets_num_private_threads
    common.set_cudnn_batchnorm_mode()
  # TODO(anj-s): Set data_format without using Keras.
  data_format = flags_obj.data_format
  if data_format is None:
    # Default layout follows the build: channels_first for CUDA builds.
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)
  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu,
      tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
  # strategy = tf.distribute.get_strategy()
  # print('after distribution number of replicas : {}'.format(
  # strategy.num_replicas_in_sync))
  mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
  mlp_log.mlperf_print('train_samples',
                       imagenet_preprocessing.NUM_IMAGES['train'])
  mlp_log.mlperf_print('eval_samples',
                       imagenet_preprocessing.NUM_IMAGES['validation'])
  # NOTE(review): when --tpu is unset this divides by flags_obj.num_gpus, so
  # num_gpus == 0 would raise ZeroDivisionError here.
  mlp_log.mlperf_print(
      'model_bn_span',
      int(flags_obj.batch_size /
          (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
  per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
  eval_steps = common.get_num_eval_steps(flags_obj)
  # A single loop never spans more than one epoch.
  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
  logging.info(
      'Training %d epochs, each epoch has %d steps, '
      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
      train_epochs * per_epoch_steps, eval_steps)
  time_callback = keras_utils.TimeHistory(
      flags_obj.batch_size,
      flags_obj.log_steps,
      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
  # The runnable is constructed inside the strategy scope.
  with distribution_utils.get_strategy_scope(strategy):
    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
  # Intervals below are expressed in steps; None disables the feature.
  eval_interval = (
      flags_obj.epochs_between_evals *
      per_epoch_steps if not flags_obj.skip_eval else None)
  eval_offset = (
      flags_obj.eval_offset_epochs *
      per_epoch_steps if not flags_obj.skip_eval else 0)
  # A non-zero offset is reduced by one eval interval before it is handed to
  # the controller. With skip_eval the offset is 0, so the None interval is
  # never used in arithmetic.
  if eval_offset != 0:
    eval_offset -= eval_interval
  checkpoint_interval = (
      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
  checkpoint_manager = tf.train.CheckpointManager(
      runnable.checkpoint,
      directory=flags_obj.model_dir,
      max_to_keep=10,
      step_counter=runnable.global_step,
      checkpoint_interval=checkpoint_interval)
  device_warmup_steps = (flags_obj.device_warmup_steps
                         if flags_obj.enable_device_warmup else 0)
  if flags_obj.enable_device_warmup:
    logging.info('Warmup for %d steps.', device_warmup_steps)
  resnet_controller = controller.Controller(
      strategy,
      runnable.train,
      runnable.evaluate,
      runnable.warmup,
      global_step=runnable.global_step,
      steps_per_loop=steps_per_loop,
      train_steps=per_epoch_steps * train_epochs,
      device_warmup_steps=device_warmup_steps,
      checkpoint_manager=checkpoint_manager,
      summary_interval=summary_interval,
      eval_steps=eval_steps,
      eval_interval=eval_interval,
      eval_offset=eval_offset)
  # Warmup happens before 'init_stop' is logged.
  if flags_obj.enable_device_warmup:
    resnet_controller.warmup()
  mlp_log.mlperf_print('init_stop', None)
  # profile_steps is parsed as a comma-separated list of ints; a negative
  # first entry triggers runnable.trace_start(-1) before training begins.
  profile_steps = flags_obj.profile_steps
  if profile_steps:
    profile_steps = [int(i) for i in profile_steps.split(',')]
    if profile_steps[0] < 0:
      runnable.trace_start(-1)
  time_callback.on_train_begin()
  mlp_log.mlperf_print('run_start', None)
  # First block covers eval_offset_epochs when set, otherwise one eval period.
  mlp_log.mlperf_print(
      'block_start',
      None,
      metadata={
          'first_epoch_num':
              1,
          'epoch_count':
              (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
               else flags_obj.epochs_between_evals)
      })
  resnet_controller.train(evaluate=not flags_obj.skip_eval)
  mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
  time_callback.on_train_end()
  mlp_log.mlperf_print('run_final', None)
  stats = build_stats(runnable, time_callback)
  return stats
def define_imagenet_keras_flags():
  """Registers the flags shared with the Keras ImageNet scripts.

  Calls `common.define_keras_flags()`, then `flags_core.set_defaults()` with
  no overrides, and finally adopts the key flags of the `common` module via
  `flags.adopt_module_key_flags`.

  NOTE(review): the `__main__` block of this script calls
  `common.define_keras_flags()` directly rather than this helper -- confirm
  whether any other caller relies on it.
  """
  common.define_keras_flags()
  flags_core.set_defaults()
  flags.adopt_module_key_flags(common)
def main(_):
  """absl entry point: runs training and logs the resulting stats.

  Passes the parsed flags to `model_helpers.apply_clean`, executes `run`
  inside `logger.benchmark_context`, and logs the returned dictionary.

  Args:
    _: Positional arguments supplied by `app.run`; unused.
  """
  parsed_flags = flags.FLAGS
  # Disabled experiment: tf.keras.backend.set_floatx('float16')
  model_helpers.apply_clean(parsed_flags)
  with logger.benchmark_context(parsed_flags):
    run_stats = run(parsed_flags)
  logging.info('Run stats:\n%s', run_stats)
if __name__ == '__main__':
  # Script entry point: set absl log verbosity to INFO, register the flags
  # defined by `common`, then hand control to absl, which invokes main().
  logging.set_verbosity(logging.INFO)
  common.define_keras_flags()
  app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet50 model for Keras.
Adapted from tf.keras.applications.resnet50.ResNet50().
This is ResNet model version 1.5.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
- https://arxiv.org/pdf/1603.05027v2.pdf
- http://torch.ch/blog/2016/02/04/resnets.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
import imagenet_preprocessing
from tensorflow.keras import backend
from tensorflow.keras import initializers
from tensorflow.keras import layers as tf_python_keras_layers
from tensorflow.keras import models
from tensorflow.keras import regularizers
# Momentum and epsilon passed to the BatchNormalization layers built below.
BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5
FLAGS = flags.FLAGS
# L2 coefficient read by _gen_l2_regularizer() each time it builds a
# regularizer.
flags.DEFINE_float(
    'weight_decay',
    default=1e-4,
    help=('Weight decay coefficiant for l2 regularization.'))
# NOTE(review): this flag is not read in the visible part of this module;
# presumably it is consumed by the training code -- confirm.
flags.DEFINE_integer(
    'num_accumulation_steps',
    default=8,
    help=('number of steps to accumulate with large batch size.'))
# Module-level handle to the Keras layers namespace used by the block
# builders; change_keras_layer() rebinds it.
layers = tf_python_keras_layers
def change_keras_layer(use_tf_keras_layers=False):
  """Rebinds the module-level `layers` alias used to build the model.

  The behavior of `tf.keras.layers` depends on the TensorFlow version, whereas
  `tf.python.keras.layers` consults the TF2_BEHAVIOR environment variable.
  This is a temporary switch that exists because the TF v2 batch-norm layer
  has been slower than the TF v1 one; it makes it possible to benchmark each
  variant and will be removed once `tf.keras.layers` is the default.

  NOTE(review): in this file `tf_python_keras_layers` is imported from
  `tensorflow.keras`, so both choices presumably resolve to the same layers
  module -- confirm whether the switch still has any effect.

  TODO(b/146939027): Remove this function when tf v2 batchnorm reaches
  training speed parity with tf v1 batchnorm.

  Args:
    use_tf_keras_layers: Whether to use `tf.keras.layers`; otherwise the
      `tf_python_keras_layers` import is used.
  """
  global layers
  layers = tf.keras.layers if use_tf_keras_layers else tf_python_keras_layers
def _gen_l2_regularizer(use_l2_regularizer=True):
return regularizers.l2(FLAGS.weight_decay) if use_l2_regularizer else None
def identity_block(input_tensor,
                   kernel_size,
                   filters,
                   stage,
                   block,
                   use_l2_regularizer=True):
  """Residual block whose shortcut is the unmodified input tensor.

  The main path is three bias-free convolutions (1x1, `kernel_size`, 1x1),
  each followed by batch normalization, with ReLU after the first two. The
  input is then added to the main-path output and a final ReLU is applied.

  Args:
    input_tensor: input tensor
    kernel_size: the kernel size of middle conv layer at main path
    filters: list of 3 integers, the filters of 3 conv layer at main path
    stage: integer, current stage label, used for generating layer names
    block: 'a','b'..., current block label, used for generating layer names
    use_l2_regularizer: whether to use L2 regularizer on Conv layer.

  Returns:
    Output tensor for the block.
  """
  filters1, filters2, filters3 = filters
  bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1
  branch_tag = str(stage) + block + '_branch'
  conv_name_base = 'res' + branch_tag
  bn_name_base = 'bn' + branch_tag

  def _conv_bn(tensor, num_filters, size, tag, **conv_kwargs):
    """Apply a bias-free Conv2D followed by batch normalization."""
    tensor = layers.Conv2D(
        num_filters,
        size,
        use_bias=False,
        kernel_initializer='he_normal',
        kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
        name=conv_name_base + tag,
        **conv_kwargs)(tensor)
    return layers.BatchNormalization(
        axis=bn_axis,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        name=bn_name_base + tag)(tensor)

  # Main path: 1x1 -> kernel_size -> 1x1.
  x = _conv_bn(input_tensor, filters1, (1, 1), '2a')
  x = layers.Activation('relu')(x)
  x = _conv_bn(x, filters2, kernel_size, '2b', padding='same')
  x = layers.Activation('relu')(x)
  x = _conv_bn(x, filters3, (1, 1), '2c')

  # Identity shortcut, then the block's output activation.
  x = layers.add([x, input_tensor])
  return layers.Activation('relu')(x)
def conv_block(input_tensor,
               kernel_size,
               filters,
               stage,
               block,
               strides=(2, 2),
               use_l2_regularizer=True):
  """Residual block whose shortcut is a strided 1x1 convolution.

  The main path is three bias-free convolutions (1x1, `kernel_size`, 1x1),
  each followed by batch normalization, with ReLU after the first two.
  `strides` is applied to the middle convolution of the main path and to the
  1x1 shortcut convolution, so both branches produce matching shapes before
  they are added.

  Args:
    input_tensor: input tensor
    kernel_size: the kernel size of middle conv layer at main path
    filters: list of 3 integers, the filters of 3 conv layer at main path
    stage: integer, current stage label, used for generating layer names
    block: 'a','b'..., current block label, used for generating layer names
    strides: Strides for the second conv layer in the block and for the
      shortcut conv layer.
    use_l2_regularizer: whether to use L2 regularizer on Conv layer.

  Returns:
    Output tensor for the block.
  """
  filters1, filters2, filters3 = filters
  bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1
  branch_tag = str(stage) + block + '_branch'
  conv_name_base = 'res' + branch_tag
  bn_name_base = 'bn' + branch_tag

  def _conv_bn(tensor, num_filters, size, tag, **conv_kwargs):
    """Apply a bias-free Conv2D followed by batch normalization."""
    tensor = layers.Conv2D(
        num_filters,
        size,
        use_bias=False,
        kernel_initializer='he_normal',
        kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
        name=conv_name_base + tag,
        **conv_kwargs)(tensor)
    return layers.BatchNormalization(
        axis=bn_axis,
        momentum=BATCH_NORM_DECAY,
        epsilon=BATCH_NORM_EPSILON,
        name=bn_name_base + tag)(tensor)

  # Main path: 1x1 -> strided kernel_size -> 1x1.
  x = _conv_bn(input_tensor, filters1, (1, 1), '2a')
  x = layers.Activation('relu')(x)
  x = _conv_bn(x, filters2, kernel_size, '2b', strides=strides, padding='same')
  x = layers.Activation('relu')(x)
  x = _conv_bn(x, filters3, (1, 1), '2c')

  # Projection shortcut; built after the main path, as layer creation order
  # determines auto-generated layer names.
  shortcut = _conv_bn(input_tensor, filters3, (1, 1), '1', strides=strides)

  x = layers.add([x, shortcut])
  return layers.Activation('relu')(x)
def resnet50(num_classes,
             batch_size=None,
             use_l2_regularizer=True,
             rescale_inputs=False):
  """Instantiates the ResNet50 architecture for 224x224x3 inputs.

  Args:
    num_classes: `int` number of classes for image classification.
    batch_size: Size of the batches for each step. Accepted for interface
      compatibility; this implementation does not read it.
    use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
    rescale_inputs: whether the inputs are in the range [0, 1] and must be
      rescaled (multiplied by 255 and mean-subtracted) before the first layer.

  Returns:
    A Keras model instance.
  """
  img_input = layers.Input(shape=(224, 224, 3))

  x = img_input
  if rescale_inputs:
    # Hub image modules expect inputs in the range [0, 1]. This rescales these
    # inputs to the range expected by the trained model.
    x = layers.Lambda(
        lambda x: x * 255.0 - backend.constant(
            imagenet_preprocessing.CHANNEL_MEANS,
            shape=[1, 1, 3],
            dtype=x.dtype),
        name='rescale')(img_input)

  data_format = backend.image_data_format()
  if data_format == 'channels_first':
    # The Input layer is declared channels-last, so transpose to NCHW.
    x = layers.Lambda(
        lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
        name='transpose')(x)
    bn_axis = 1
  else:  # channels_last
    bn_axis = 3

  # Stem: pad -> 7x7/2 conv -> BN -> ReLU -> 3x3/2 max-pool.
  x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
  x = layers.Conv2D(
      64, (7, 7),
      strides=(2, 2),
      padding='valid',
      use_bias=False,
      kernel_initializer='he_normal',
      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
      name='conv1')(x)
  x = layers.BatchNormalization(
      axis=bn_axis,
      momentum=BATCH_NORM_DECAY,
      epsilon=BATCH_NORM_EPSILON,
      name='bn_conv1')(x)
  x = layers.Activation('relu')(x)
  x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)

  # (stage, filters, block labels, strides of the leading conv_block).
  # The first label of each stage is a conv_block; the rest are identity
  # blocks.
  stage_specs = (
      (2, [64, 64, 256], 'abc', (1, 1)),
      (3, [128, 128, 512], 'abcd', (2, 2)),
      (4, [256, 256, 1024], 'abcdef', (2, 2)),
      (5, [512, 512, 2048], 'abc', (2, 2)),
  )
  for stage, stage_filters, block_labels, first_strides in stage_specs:
    x = conv_block(
        x,
        3,
        stage_filters,
        stage=stage,
        block=block_labels[0],
        strides=first_strides,
        use_l2_regularizer=use_l2_regularizer)
    for label in block_labels[1:]:
      x = identity_block(
          x,
          3,
          stage_filters,
          stage=stage,
          block=label,
          use_l2_regularizer=use_l2_regularizer)

  # Global average pooling over the two spatial axes.
  rm_axes = [1, 2] if data_format == 'channels_last' else [2, 3]
  x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
  x = layers.Dense(
      num_classes,
      kernel_initializer=initializers.RandomNormal(stddev=0.01),
      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
      bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
      name='fc1000')(x)

  # A softmax that is followed by the model loss cannot be done in float16
  # due to numeric issues, so we pass dtype=float32.
  x = layers.Activation('softmax', dtype='float32')(x)

  # Create model.
  return models.Model(img_input, x, name='resnet50')
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
import tensorflow as tf
from tf2_common.training import standard_runnable
from tf2_common.training import utils
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_model
# Read by ResnetRunnable.warmup_loop_begin / warmup_loop_end.
flags.DEFINE_boolean(
    'trace_warmup',
    default=False,
    help=('Whether or not to programmatically capture an Xprof trace in the '
          'warmup loop.'))
class _UnwrapPreventer(object):
"""Wrapper that DistributionStrategy will not unwrap.
Typically, DistributionStrategy will unwrap values when going from a cross-
replica context to a replica context via `call_for_each_replica`. This class
is a wrapper that DistributionStrategy will not unwrap, so it can be used to
prevent it from unwrapping a value.
TODO(reedwm): Find/implement a better way of preventing values from being
unwrapped by DistributionStrategy
"""
__slots__ = ['value']
def __init__(self, value):
self.value = value
class ResnetRunnable(standard_runnable.StandardRunnableWithWarmup):
"""Implements the training and evaluation APIs for Resnet model."""
def __init__(self, flags_obj, time_callback):
  """Builds the model, optimizer, metrics and bookkeeping state.

  Args:
    flags_obj: parsed flags object; its attributes (batch size, optimizer,
      dtype, profiling and evaluation settings, ...) configure the run.
    time_callback: callback object notified at batch and epoch boundaries.

  Raises:
    ValueError: if the batch size is not divisible by the number of replicas,
      or if graph-rewrite mixed precision is requested without
      `--use_tf_function`.
  """
  standard_runnable.StandardRunnableWithWarmup.__init__(
      self,
      flags_obj.use_tf_while_loop,
      flags_obj.use_tf_function)

  self.strategy = tf.distribute.get_strategy()
  self.flags_obj = flags_obj
  self.dtype = flags_core.get_tf_dtype(flags_obj)
  self.time_callback = time_callback

  # Input pipeline related
  batch_size = flags_obj.batch_size
  if batch_size % self.strategy.num_replicas_in_sync != 0:
    raise ValueError(
        'Batch size must be divisible by number of replicas : {}'.format(
            self.strategy.num_replicas_in_sync))

  steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj)
  # When more than one epoch is computed, the flag value takes precedence.
  if train_epochs > 1:
    train_epochs = flags_obj.train_epochs

  # As auto rebatching is not supported in
  # `experimental_distribute_datasets_from_function()` API, which is
  # required when cloning dataset to multiple workers in eager mode,
  # we use per-replica batch size.
  self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)

  # The synthetic input fn is always built; it also backs
  # `build_synthetic_dataset`.
  self.synthetic_input_fn = common.get_synth_input_fn(
      height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
      width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
      num_channels=imagenet_preprocessing.NUM_CHANNELS,
      num_classes=self.flags_obj.num_classes,
      dtype=self.dtype,
      drop_remainder=True)

  if self.flags_obj.use_synthetic_data:
    self.input_fn = self.synthetic_input_fn
  else:
    self.input_fn = imagenet_preprocessing.input_fn

  resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
  # Per-layer L2 regularizers are disabled when a single fused L2 loss op is
  # requested (see `train_step`).
  self.model = resnet_model.resnet50(
      num_classes=self.flags_obj.num_classes,
      batch_size=flags_obj.batch_size,
      use_l2_regularizer=not flags_obj.single_l2_loss_op)

  self.use_lars_optimizer = False
  self.num_accumulation_steps = self.flags_obj.num_accumulation_steps
  if self.flags_obj.optimizer == 'LARS':
    self.use_lars_optimizer = True
  self.optimizer, _ = common.get_optimizer(
      flags_obj=flags_obj,
      steps_per_epoch=steps_per_epoch,
      train_steps=steps_per_epoch * train_epochs)
  # Make sure iterations variable is created inside scope.
  self.global_step = self.optimizer.iterations

  if self.dtype == tf.float16:
    print("enter fp16 computing")
    loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
    # Fixed (non-dynamic) loss scaling.
    self.optimizer = (
        tf.keras.mixed_precision.LossScaleOptimizer(
            self.optimizer, dynamic=False, initial_scale=loss_scale))
  elif flags_obj.fp16_implementation == 'graph_rewrite':
    # `dtype` is still float32 in this case. We built the graph in float32
    # and let the graph rewrite change parts of it float16.
    if not flags_obj.use_tf_function:
      raise ValueError('--fp16_implementation=graph_rewrite requires '
                       '--use_tf_function to be true')
    loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
    self.optimizer = (
        tf.train.experimental.enable_mixed_precision_graph_rewrite(
            self.optimizer, loss_scale))

  # Labels are treated as one-hot only when label smoothing is enabled; this
  # selects the categorical vs. sparse loss and accuracy variants.
  self.one_hot = False
  self.label_smoothing = flags_obj.label_smoothing
  if self.label_smoothing and self.label_smoothing > 0:
    self.one_hot = True

  if flags_obj.report_accuracy_metrics:
    self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
    if self.one_hot:
      self.train_accuracy = tf.keras.metrics.CategoricalAccuracy(
          'train_accuracy', dtype=tf.float32)
    else:
      self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
          'train_accuracy', dtype=tf.float32)
    self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
  else:
    self.train_loss = None
    self.train_accuracy = None
    self.test_loss = None

  # Test accuracy is always tracked: `eval_end` reads it unconditionally.
  if self.one_hot:
    self.test_accuracy = tf.keras.metrics.CategoricalAccuracy(
        'test_accuracy', dtype=tf.float32)
  else:
    self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'test_accuracy', dtype=tf.float32)
  # self.test_corrects = tf.keras.metrics.Sum(
  #     'test_corrects', dtype=tf.float32)
  self.num_eval_steps = common.get_num_eval_steps(flags_obj)

  self.checkpoint = tf.train.Checkpoint(
      model=self.model, optimizer=self.optimizer)

  # Handling epochs.
  self.epoch_steps = steps_per_epoch
  self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step)

  self.steps_per_loop = flags_obj.steps_per_loop
  # `profile_steps` is parsed as comma-separated integers: "start,end".
  # NOTE(review): a value with a single integer would raise IndexError on
  # `profile_steps[1]` -- confirm the flag is validated upstream.
  profile_steps = flags_obj.profile_steps
  if profile_steps:
    profile_steps = [int(i) for i in profile_steps.split(',')]
    # A negative start step disables the start of tracing.
    self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None
    self.trace_end_step = profile_steps[1]
  else:
    self.trace_start_step = None
    self.trace_end_step = None

  self.epochs_between_evals = flags_obj.epochs_between_evals
  self.training_vars = self.model.trainable_variables
  # One float32 accumulator per trainable variable, created only when
  # gradient accumulation is enabled; otherwise the list stays empty.
  self.accum_grads = []
  self.accum_grads_dtype = tf.float32

  if self.num_accumulation_steps > 1:
    for var in self.training_vars:
      self.accum_grads.append(self.optimizer.add_weight(
          name=var.name + '_accum',
          shape=var.shape,
          dtype=self.accum_grads_dtype,
          initializer='zeros',
          trainable=False,
          synchronization=tf.VariableSynchronization.ON_READ,
          aggregation=tf.VariableAggregation.SUM))
def build_train_dataset(self):
  """See base class.

  Returns the distributed training dataset built from `self.input_fn` with
  the per-replica batch size.
  """
  flags_obj = self.flags_obj
  dataset_kwargs = dict(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=self.batch_size,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      dtype=self.dtype,
      drop_remainder=flags_obj.drop_train_remainder,
      tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
      dataset_cache=flags_obj.training_dataset_cache,
      prefetch_batchs=flags_obj.training_prefetch_batchs)
  return utils.make_distributed_dataset(
      self.strategy, self.input_fn, **dataset_kwargs)
def build_eval_dataset(self):
  """See base class.

  Returns the distributed evaluation dataset built from `self.input_fn` with
  the per-replica batch size and the eval-specific cache/prefetch flags.
  """
  flags_obj = self.flags_obj
  dataset_kwargs = dict(
      is_training=False,
      data_dir=flags_obj.data_dir,
      batch_size=self.batch_size,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      dtype=self.dtype,
      drop_remainder=flags_obj.drop_eval_remainder,
      tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
      dataset_cache=flags_obj.eval_dataset_cache,
      prefetch_batchs=flags_obj.eval_prefetch_batchs)
  return utils.make_distributed_dataset(
      self.strategy, self.input_fn, **dataset_kwargs)
def build_synthetic_dataset(self):
  """See base class.

  Returns a distributed dataset built from `self.synthetic_input_fn`, using
  the same settings as the training dataset.
  """
  flags_obj = self.flags_obj
  dataset_kwargs = dict(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=self.batch_size,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      dtype=self.dtype,
      drop_remainder=flags_obj.drop_train_remainder,
      tf_data_experimental_slack=flags_obj.tf_data_experimental_slack,
      dataset_cache=flags_obj.training_dataset_cache,
      prefetch_batchs=flags_obj.training_prefetch_batchs)
  return utils.make_distributed_dataset(
      self.strategy, self.synthetic_input_fn, **dataset_kwargs)
def train_loop_begin(self):
  """See base class.

  Resets the training metrics (when they are tracked), runs the epoch-begin
  bookkeeping, starts a profiler trace if the configured start step falls
  inside the loop that is about to run, and notifies the time callback.
  """
  # Reset all metrics
  if self.train_loss:
    self.train_loss.reset_states()
  if self.train_accuracy:
    self.train_accuracy.reset_states()

  self._epoch_begin()

  # Compare against None instead of relying on truthiness: `__init__` accepts
  # any start step >= 0, and a start step of 0 is falsy, so a truthiness
  # check would silently skip tracing requested from the very first step.
  if self.trace_start_step is not None:
    global_step = self.global_step.numpy()
    next_global_step = global_step + self.steps_per_loop
    # Start when the requested step lies in [global_step, next_global_step).
    if global_step <= self.trace_start_step < next_global_step:
      self.trace_start(global_step)

  self.time_callback.on_batch_begin(self.epoch_helper.batch_index)
def train_step(self, iterator):
  """See base class.

  Runs one step on every replica via `self.strategy.run`. Each replica
  computes gradients for its batch; with `num_accumulation_steps > 1` the
  gradients are added to per-variable accumulators and only applied on every
  `num_accumulation_steps`-th step, otherwise they are applied immediately.

  Args:
    iterator: iterator over the distributed training dataset; one element is
      consumed per call.
  """

  @tf.function(experimental_compile=False)
  def local_step(images, labels):
    """Local computation of a step.

    Returns:
      A `(logits, loss, grads)` tuple. `loss` is the value that was
      differentiated, i.e. it is still loss-scaled when dtype is 'fp16';
      `grads` are unscaled.
    """
    with tf.GradientTape() as tape:
      logits = self.model(images, training=True)

      if self.one_hot:
        prediction_loss = tf.keras.losses.categorical_crossentropy(
            labels, logits, label_smoothing=self.label_smoothing)
      else:
        prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
            labels, logits)
      # Sum the per-example losses and divide by the batch size from the
      # flags (not the per-replica `self.batch_size`).
      loss = tf.reduce_sum(prediction_loss) * (
          1.0 / self.flags_obj.batch_size)

      # Save ~3 seconds per epoch on GPU when skipping
      # L2 loss computation; can only skip when using LARS
      # Details in decription of cl/308018913
      if not self.use_lars_optimizer:
        num_replicas = self.strategy.num_replicas_in_sync

        if self.flags_obj.single_l2_loss_op:
          # One fused L2 term over all trainable variables whose name does
          # not contain 'bn'.
          l2_loss = self.flags_obj.weight_decay * 2 * tf.add_n([
              tf.nn.l2_loss(v)
              for v in self.model.trainable_variables
              if 'bn' not in v.name
          ])

          loss += (l2_loss / num_replicas)
        else:
          # Per-layer regularization losses collected by Keras.
          loss += (tf.reduce_sum(self.model.losses) / num_replicas)

      # Scale the loss
      if self.flags_obj.dtype == 'fp16':
        loss = self.optimizer.get_scaled_loss(loss)

    grads = tape.gradient(loss, self.model.trainable_variables)

    # Unscale the grads
    if self.flags_obj.dtype == 'fp16':
      grads = self.optimizer.get_unscaled_gradients(grads)

    return logits, loss, grads

  def _maybe_apply_grads_and_clear(distribution):
    """Applies and clears accumulated gradients on the last accumulation step.

    Invoked through `merge_call`, which supplies `distribution`.
    """

    def _apply_grads_and_clear_for_each_replica():
      """Applies this replica's accumulated gradients, then zeroes them."""
      local_replica_id = tf.get_static_value(
          self.strategy.extended._get_local_replica_id(
              tf.distribute.get_replica_context().replica_id_in_sync_group))
      replica_accum_grads = []
      for accum_grad, var in zip(self.accum_grads, self.training_vars):
        # Pick this replica's component of the accumulator and cast it to
        # the dtype of the variable it will update.
        local_accum_grad = self.strategy.experimental_local_results(
            accum_grad)
        replica_accum_grad = local_accum_grad[local_replica_id]
        replica_accum_grad = tf.cast(replica_accum_grad, var.dtype)
        replica_accum_grads.append(replica_accum_grad)

      self.optimizer.apply_gradients(
          zip(replica_accum_grads, self.training_vars))
      for accum_grad in self.accum_grads:
        accum_grad.assign(tf.zeros_like(accum_grad,
                                        dtype=self.accum_grads_dtype),
                          read_value=False)

    def _apply_grads_and_clear():
      distribution.extended.call_for_each_replica(
          _apply_grads_and_clear_for_each_replica,
          args=())
      # Adds 0 to the iteration counter in this branch.
      # NOTE(review): presumably `apply_gradients` above already advances
      # `optimizer.iterations`, and this only gives both `tf.cond` branches
      # the same kind of return value -- confirm.
      return self.optimizer.iterations.assign_add(0, read_value=False)

    def _advance_iteration():
      # No update is applied on this step; only the counter moves forward.
      return self.optimizer.iterations.assign_add(1, read_value=False)

    # Apply on the last step of each group of `num_accumulation_steps`.
    tf.cond(
        tf.equal(self.optimizer.iterations % self.num_accumulation_steps,
                 self.num_accumulation_steps - 1),
        _apply_grads_and_clear,
        _advance_iteration)

  def step_fn(inputs):
    """Function to run on the device."""
    images, labels = inputs
    logits, loss, grads = local_step(images, labels)

    if self.num_accumulation_steps > 1:
      # Gradients are summed into the accumulators as-is.
      # NOTE(review): no division by `num_accumulation_steps` is visible
      # here -- confirm the learning rate accounts for the summed gradients.
      for grad, accum_grad in zip(grads, self.accum_grads):
        accum_grad.assign_add(tf.cast(grad, self.accum_grads_dtype),
                              read_value=False)
      tf.distribute.get_replica_context().merge_call(
          _maybe_apply_grads_and_clear,
          args=())
    else:
      self.optimizer.apply_gradients(zip(grads, self.training_vars))

    if self.train_loss:
      self.train_loss.update_state(loss)
    if self.train_accuracy:
      self.train_accuracy.update_state(labels, logits)

  self.strategy.run(step_fn, args=(next(iterator),))
def train_loop_end(self):
  """See base class.

  Collects the tracked training metrics, notifies the time callback, stops
  the profiler trace if the configured end step falls inside the next loop
  window, and runs the epoch-end bookkeeping.

  Returns:
    Dict with 'train_loss' and/or 'train_accuracy' results for the metrics
    that are being tracked; empty when none are.
  """
  tracked = (('train_loss', self.train_loss),
             ('train_accuracy', self.train_accuracy))
  metrics = {}
  for key, metric in tracked:
    if metric:
      metrics[key] = metric.result()

  self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1)

  if self.trace_end_step:
    global_step = self.global_step.numpy()
    window_end = global_step + self.steps_per_loop
    end_in_window = (global_step <= self.trace_end_step and
                     self.trace_end_step < window_end)
    if end_in_window:
      self.trace_end(global_step)

  self._epoch_end()
  return metrics
def eval_begin(self):
  """See base class.

  Resets the evaluation metrics that are being tracked and logs the MLPerf
  'eval_start' event with the 1-based epoch number.
  """
  for metric in (self.test_loss, self.test_accuracy):
    if metric:
      metric.reset_states()

  current_epoch = int(self.epoch_helper.current_epoch)
  mlp_log.mlperf_print(
      'eval_start', None, metadata={'epoch_num': current_epoch + 1})
def eval_step(self, iterator):
  """See base class.

  Runs one evaluation step on every replica: a forward pass in inference
  mode followed by updates of the tracked test loss / accuracy metrics.

  Args:
    iterator: iterator over the distributed evaluation dataset; one element
      is consumed per call.
  """

  def _per_example_loss(labels, logits):
    """Cross-entropy per example; categorical when labels are one-hot."""
    if self.one_hot:
      return tf.keras.losses.categorical_crossentropy(
          labels, logits, label_smoothing=self.label_smoothing)
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits)

  def step_fn(inputs):
    """Function to run on the device."""
    images, labels = inputs
    logits = self.model(images, training=False)

    if self.test_loss:
      example_losses = _per_example_loss(labels, logits)
      # Summed loss divided by the batch size from the flags.
      batch_loss = tf.reduce_sum(example_losses) * (
          1.0 / self.flags_obj.batch_size)
      self.test_loss.update_state(batch_loss)

    if self.test_accuracy:
      self.test_accuracy.update_state(labels, logits)

  self.strategy.run(step_fn, args=(next(iterator),))
def eval_end(self):
  """See base class.

  Logs the MLPerf 'eval_stop', 'eval_accuracy' and 'block_stop' events,
  decides whether training should continue, and logs 'block_start' for the
  next block when it does.

  Returns:
    Dict with the tracked 'test_loss' / 'test_accuracy' results and a
    'continue_training' bool that is False once the evaluation accuracy
    reaches `flags_obj.target_accuracy`.
  """
  epoch_num = int(self.epoch_helper.current_epoch)
  epoch_metadata = {'epoch_num': epoch_num + 1}
  mlp_log.mlperf_print('eval_stop', None, metadata=epoch_metadata)

  eval_accuracy = float(self.test_accuracy.result())
  mlp_log.mlperf_print(
      'eval_accuracy', eval_accuracy, metadata={'epoch_num': epoch_num + 1})

  # Describe the block of epochs that just finished.
  first_epoch_num = max(epoch_num - self.epochs_between_evals + 1, 0)
  if first_epoch_num != 0:
    epoch_count = self.epochs_between_evals
  else:
    # The first block may be shorter when an eval offset is configured.
    epoch_count = self.flags_obj.eval_offset_epochs
    if epoch_count == 0:
      epoch_count = self.flags_obj.epochs_between_evals
  mlp_log.mlperf_print(
      'block_stop',
      None,
      metadata={
          'first_epoch_num': first_epoch_num + 1,
          'epoch_count': epoch_count
      })

  # Stop once the target accuracy has been reached.
  continue_training = not (eval_accuracy >= self.flags_obj.target_accuracy)
  if continue_training:
    mlp_log.mlperf_print(
        'block_start',
        None,
        metadata={
            'first_epoch_num': epoch_num + 2,
            'epoch_count': self.epochs_between_evals
        })

  results = {}
  for key, metric in (('test_loss', self.test_loss),
                      ('test_accuracy', self.test_accuracy)):
    if metric:
      results[key] = metric.result()
  results['continue_training'] = continue_training
  return results
def warmup_loop_begin(self):
  """See base class.

  Starts a profiler trace (tagged with step -3) when --trace_warmup is set,
  then logs entry into the warmup loop.
  """
  trace_requested = self.flags_obj.trace_warmup
  if trace_requested:
    self.trace_start(-3)
  logging.info('Entering the warmup loop.')
def warmup_loop_end(self):
  """See base class.

  Stops the warmup trace (tagged with step -2) when --trace_warmup is set,
  then resets the state touched during warmup: model states, the optimizer
  iteration counter and the gradient accumulators.
  """
  if self.flags_obj.trace_warmup:
    self.trace_end(-2)

  # Reset the state
  self.model.reset_states()
  tf.keras.backend.set_value(self.optimizer.iterations, 0)
  for accumulator in self.accum_grads:
    zeros = tf.zeros_like(accumulator, dtype=self.accum_grads_dtype)
    accumulator.assign(zeros, read_value=False)

  logging.info('Exiting the warmup loop.')
def _epoch_begin(self):
  """Notifies the time callback when the epoch helper reports an epoch start."""
  helper = self.epoch_helper
  if not helper.epoch_begin():
    return
  self.time_callback.on_epoch_begin(helper.current_epoch)
def _epoch_end(self):
  """Notifies the time callback when the epoch helper reports an epoch end."""
  helper = self.epoch_helper
  if not helper.epoch_end():
    return
  self.time_callback.on_epoch_end(helper.current_epoch)
def trace_start(self, global_step):
  """Logs `global_step` and starts the TF profiler, writing to the model dir."""
  logging.info('Starting tracing at step %d.', global_step)
  trace_dir = self.flags_obj.model_dir
  tf.profiler.experimental.start(trace_dir)
def trace_end(self, global_step):
  """Logs `global_step` and stops the TF profiler."""
  logging.info('Ending trace at step %d', global_step)
  profiler = tf.profiler.experimental
  profiler.stop()
# Launches resnet_ctl_imagenet_main.py under `hipprof --hip-trace`.
# XLA_FLAGS points XLA at the DTK bitcode directory and dumps HLO (as HTML,
# for every pass) into ./tmp; TF_DUMP_GRAPH_PREFIX sets the graph dump prefix
# to ./tf_graph.
# The run uses the LARS optimizer with a polynomial learning-rate schedule,
# fp32, a single training epoch and 2 gradient-accumulation steps.
# NOTE(review): --device_warmup_steps=1 is passed together with
# --noenable_device_warmup -- confirm which one is intended to take effect.
XLA_FLAGS="--xla_gpu_cuda_data_dir=/public/software/compiler/rocm/dtk-21.10.1/amdgcn/bitcode/ --xla_dump_hlo_pass_re=.* --xla_dump_hlo_as_html --xla_dump_to=./tmp" TF_DUMP_GRAPH_PREFIX="./tf_graph" hipprof --hip-trace python3 ./resnet_ctl_imagenet_main.py \
--base_learning_rate=10.0 \
--batch_size=32 \
--nocache_decoded_image \
--data_dir=/public/software/apps/DeepLearning/Data/ImageNet-tensorflow \
--device_warmup_steps=1 \
--dtype=fp32 \
--noenable_checkpoint_and_export \
--noenable_device_warmup \
--enable_eager \
--epochs_between_evals=4 \
--noeval_dataset_cache \
--eval_offset_epochs=2 \
--label_smoothing=0.1 \
--lars_epsilon=0 \
--log_steps=125 \
--lr_schedule=polynomial \
--optimizer=LARS \
--noreport_accuracy_metrics \
--single_l2_loss_op \
--steps_per_loop=25 \
--train_epochs=1 \
--notraining_dataset_cache \
--notrace_warmup \
--nouse_synthetic_data \
--use_tf_function \
--verbosity=0 \
--warmup_epochs=5 \
--weight_decay=0.0002 \
--target_accuracy=0.759 \
--momentum=0.9 \
--num_replicas=64 \
--num_accumulation_steps=2 \
--num_classes=1000 \
--noskip_eval
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to training performance."""
import tensorflow as tf
def configure_optimizer(optimizer,
                        use_float16=False,
                        use_graph_rewrite=False,
                        loss_scale="dynamic"):
  """Wraps `optimizer` according to the requested mixed-precision options.

  Args:
    optimizer: the optimizer to configure.
    use_float16: if true, wrap the optimizer in a LossScaleOptimizer.
    use_graph_rewrite: if true, enable the mixed-precision graph rewrite on
      the optimizer.
    loss_scale: loss scale handed to the LossScaleOptimizer.

  Returns:
    The wrapped optimizer, or `optimizer` itself when neither option is set.
  """
  configured = optimizer

  if use_float16:
    # Wraps optimizer with a LossScaleOptimizer. This is done automatically
    # in compile() with the "mixed_float16" policy, but since we do not call
    # compile(), we must wrap the optimizer manually.
    mixed_precision = tf.keras.mixed_precision.experimental
    configured = mixed_precision.LossScaleOptimizer(
        configured, loss_scale=loss_scale)

  if use_graph_rewrite:
    # Note: the model dtype must be 'float32', which will ensure
    # tf.keras.mixed_precision and
    # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
    # up.
    configured = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        configured)

  return configured
def set_mixed_precision_policy(dtype, loss_scale=None):
  """Sets the global Keras mixed-precision policy for `dtype`.

  Args:
    dtype: one of `tf.float16`, `tf.bfloat16` or `tf.float32`.
    loss_scale: loss scale attached to the policy; only used for
      `tf.float16`.

  Raises:
    ValueError: if `dtype` is not one of the supported dtypes.
  """
  mixed_precision = tf.keras.mixed_precision.experimental

  if dtype == tf.float16:
    print("enter the tf.float16 set policy")
    fp16_policy = mixed_precision.Policy(
        'mixed_float16', loss_scale=loss_scale)
    mixed_precision.set_policy(fp16_policy)
    print('Compute dtype: %s' % fp16_policy.compute_dtype)
    print('Variable dtype: %s' % fp16_policy.variable_dtype)
    return

  if dtype == tf.bfloat16:
    bf16_policy = mixed_precision.Policy('mixed_bfloat16')
    mixed_precision.set_policy(bf16_policy)
    return

  if dtype == tf.float32:
    mixed_precision.set_policy('float32')
    return

  raise ValueError("Unexpected dtype: %s" % dtype)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment