Commit 05631eec authored by liangjing

version 1

parent 7e0391d9
grep eval_accuracy "$1" | awk -F eval_accuracy '{print $2}' | awk -F value '{print $2}' | awk '{print $2}' | uniq
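# The pipeline above pulls the eval_accuracy entries out of the training log
# passed as $1 (the mlp_log/MLPerf-style output produced by the scripts below)
# and prints the unique accuracy values in order of appearance.
# Usage sketch (script and log file names are placeholders):
#   bash parse_eval_accuracy.sh path/to/resnet_train.log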
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer-wise Adaptive Rate Scaling optimizer for large-batch training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# from tf2_common.training import optimizer_v2modified
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import training_ops
from tensorflow.python.ops import state_ops
# class LARSOptimizer(optimizer_v2modified.OptimizerV2Modified):
#class LARSOptimizer(optimizer_v2.OptimizerV2):
class LARSOptimizer(tf.keras.optimizers.Optimizer):
"""Layer-wise Adaptive Rate Scaling for large batch training.
Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
Implements the LARS learning rate scheme presented in the paper above. This
optimizer is useful when scaling the batch size up to 32K without
significant performance degradation. It is recommended to use the optimizer
in conjunction with:
- Gradual learning rate warm-up
- Linear learning rate scaling
- Poly rule learning rate decay
Note, LARS scaling is currently only enabled for dense tensors. Sparse tensors
use the default momentum optimizer.
"""
def __init__(
self,
learning_rate,
momentum=0.9,
weight_decay=0.0001,
# The LARS coefficient is a hyperparameter
eeta=0.001,
epsilon=0.0,
name="LARSOptimizer",
# Enable skipping variables from LARS scaling.
# TODO(sameerkm): Enable a direct mechanism to pass a
# subset of variables to the optimizer.
skip_list=None,
use_nesterov=False,
**kwargs):
"""Construct a new LARS Optimizer.
Args:
learning_rate: A `Tensor`, floating point value, or a schedule that is a
`tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
that takes no arguments and returns the actual value to use. The
learning rate.
momentum: A floating point value. Momentum hyperparameter.
weight_decay: A floating point value. Weight decay hyperparameter.
eeta: LARS coefficient as used in the paper. Default set to the LARS
coefficient from the paper. (eeta / weight_decay) determines the highest
scaling factor in LARS.
epsilon: Optional epsilon parameter to be set in models that have very
small gradients. Default set to 0.0.
name: Optional name prefix for variables and ops created by LARSOptimizer.
skip_list: List of strings to enable skipping variables from LARS scaling.
If any of the strings in skip_list is a substring of var.name, the variable
'var' is skipped from LARS scaling. For a typical classification model
with batch normalization, the skip_list is ['batch_normalization',
'bias']
use_nesterov: when set to True, Nesterov momentum is enabled.
**kwargs: keyword arguments.
Raises:
ValueError: If a hyperparameter is set to a nonsensical value.
"""
if momentum < 0.0:
raise ValueError("momentum should be non-negative: %s" % momentum)
if weight_decay < 0.0:
raise ValueError("weight_decay should be non-negative: %s" % weight_decay)
super(LARSOptimizer, self).__init__(name=name, **kwargs)
self._set_hyper("learning_rate", learning_rate)
# When class members are used directly, instead of
# _set_hyper and _get_hyper (as is done for learning_rate above),
# the values are fixed after __init__() and are not
# updated during training.
# This provides better performance but less flexibility.
self.momentum = momentum
self.weight_decay = weight_decay
self.eeta = eeta
self.epsilon = epsilon or backend_config.epsilon()
self._skip_list = skip_list
self.use_nesterov = use_nesterov
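# Note: the learning rate is called on the current step in _prepare_local
# below, so `learning_rate` is expected to be a callable schedule (e.g. a
# tf.keras.optimizers.schedules.LearningRateSchedule) rather than a bare float.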
def _prepare_local(self, var_device, var_dtype, apply_state):
lr_t = self._get_hyper("learning_rate", var_dtype)
local_step = math_ops.cast(self.iterations, var_dtype)
lr_t = math_ops.cast(lr_t(local_step), var_dtype)
learning_rate_t = array_ops.identity(lr_t)
apply_state[(var_device, var_dtype)].update(
dict(
learning_rate=learning_rate_t,
))
def _create_slots(self, var_list):
for v in var_list:
self.add_slot(v, "momentum")
def compute_lr(self, grad, var, coefficients):
scaled_lr = coefficients["learning_rate"]
if self._skip_list is None or not any(v in var.name
for v in self._skip_list):
w_norm = linalg_ops.norm(var, ord=2)
g_norm = linalg_ops.norm(grad, ord=2)
trust_ratio = array_ops.where(
math_ops.greater(w_norm, 0),
array_ops.where(
math_ops.greater(g_norm, 0),
(self.eeta * w_norm /
(g_norm + self.weight_decay * w_norm + self.epsilon)), 1.0), 1.0)
scaled_lr = coefficients["learning_rate"] * trust_ratio
# Add the weight regularization gradient
grad = grad + self.weight_decay * var
return scaled_lr, grad
def _apply_dense(self, grad, var, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
scaled_lr, grad = self.compute_lr(grad, var, coefficients)
mom = self.get_slot(var, "momentum")
return training_ops.apply_momentum(
var,
mom,
math_ops.cast(1.0, var.dtype.base_dtype),
grad * scaled_lr,
self.momentum,
use_locking=False,
use_nesterov=self.use_nesterov)
def _resource_apply_dense(self, grad, var, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
scaled_lr, grad = self.compute_lr(grad, var, coefficients)
mom = self.get_slot(var, "momentum")
# Use ApplyKerasMomentum instead of ApplyMomentum
# training_ops.resource_apply_keras_momentum(
# var.handle,
# mom.handle,
# scaled_lr,
# grad,
# coefficients["momentum"],
# use_locking=False,
# use_nesterov=self.use_nesterov)
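# Manual Keras-style momentum update, in place of the commented-out
# resource_apply_keras_momentum op above:
#   m <- momentum * m - scaled_lr * g
#   v <- v + m                                  (use_nesterov=False)
#   v <- v + momentum * m - scaled_lr * g       (use_nesterov=True)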
mom_t = mom * self.momentum - grad * scaled_lr
mom_t = state_ops.assign(mom, mom_t, use_locking=False)
if self.use_nesterov:
var_t = var + mom_t * self.momentum - grad * scaled_lr
else:
var_t = var + mom_t
return state_ops.assign(var, var_t, use_locking=False).op
# Fallback to momentum optimizer for sparse tensors
def _apply_sparse(self, grad, var, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
mom = self.get_slot(var, "momentum")
return training_ops.sparse_apply_momentum(
var,
mom,
coefficients["learning_rate"],
grad.values,
grad.indices,
self.momentum,
use_locking=False,
use_nesterov=self.use_nesterov)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
mom = self.get_slot(var, "momentum")
return training_ops.resource_sparse_apply_keras_momentum(
var.handle,
mom.handle,
coefficients["learning_rate"],
grad,
indices,
self.momentum,
use_locking=False,
use_nesterov=self.use_nesterov)
def get_config(self):
config = super(LARSOptimizer, self).get_config()
config.update({
"learning_rate": self._serialize_hyperparameter("learning_rate"),
"momentum": self.momentum,
"weight_decay": self.weight_decay,
"eeta": self.eeta,
"epsilon": self.epsilon,
"use_nesterov": self.use_nesterov,
})
return config
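if __name__ == "__main__":
  # Minimal usage sketch, not part of the training pipeline. It assumes a
  # TF 2.x release where tf.keras.optimizers.Optimizer is the legacy
  # OptimizerV2 base class this file is written against, and it passes a
  # LearningRateSchedule because _prepare_local above calls the learning rate
  # on the current step.
  schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=0.1, decay_steps=100, end_learning_rate=1e-4)
  optimizer = LARSOptimizer(
      learning_rate=schedule,
      skip_list=["batch_normalization", "bias"])
  kernel = tf.Variable(tf.random.normal([8, 8]), name="dense_kernel")
  with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.square(kernel))
  grads = tape.gradient(loss, [kernel])
  optimizer.apply_gradients(zip(grads, [kernel]))
  print("iterations:", int(optimizer.iterations.numpy()))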
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Enable Layer-wise Adaptive Rate Scaling optimizer in ResNet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
from tf2_common.utils.mlp_log import mlp_log
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
FLAGS = flags.FLAGS
def define_lars_flags():
"""Defines flags needed by LARS optimizer."""
flags.DEFINE_float(
'end_learning_rate', default=None,
help=('Polynomial decay end learning rate.'))
flags.DEFINE_float(
'lars_epsilon', default=0.0,
help=('Override autoselected LARS epsilon.'))
flags.DEFINE_float(
'warmup_epochs', default=None,
help=('Override autoselected polynomial decay warmup epochs.'))
flags.DEFINE_float(
'momentum',
default=0.9,
help=('Momentum parameter used in the MomentumOptimizer.'))
class PolynomialDecayWithWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""A LearningRateSchedule that uses a polynomial decay with warmup."""
def __init__(
self,
batch_size,
steps_per_epoch,
train_steps,
initial_learning_rate=None,
end_learning_rate=None,
warmup_epochs=None,
compute_lr_on_cpu=False,
name=None):
"""Applies a polynomial decay to the learning rate with warmup."""
super(PolynomialDecayWithWarmup, self).__init__()
self.batch_size = batch_size
self.steps_per_epoch = steps_per_epoch
self.train_steps = train_steps
self.name = name
self.learning_rate_ops_cache = {}
self.compute_lr_on_cpu = compute_lr_on_cpu
if batch_size < 16384:
self.initial_learning_rate = 10.0
warmup_epochs_ = 5
elif batch_size < 32768:
self.initial_learning_rate = 25.0
warmup_epochs_ = 5
else:
self.initial_learning_rate = 31.2
warmup_epochs_ = 25
# Override default poly learning rate and warmup epochs
if initial_learning_rate:
self.initial_learning_rate = initial_learning_rate
if end_learning_rate:
self.end_learning_rate = end_learning_rate
else:
self.end_learning_rate = 0.0001
if warmup_epochs is not None:
warmup_epochs_ = warmup_epochs
self.warmup_epochs = warmup_epochs_
opt_name = FLAGS.optimizer.lower()
mlp_log.mlperf_print('opt_name', opt_name)
if opt_name == 'lars':
mlp_log.mlperf_print('{}_epsilon'.format(opt_name), FLAGS.lars_epsilon)
mlp_log.mlperf_print('{}_opt_weight_decay'.format(opt_name),
FLAGS.weight_decay)
mlp_log.mlperf_print('{}_opt_base_learning_rate'.format(opt_name),
self.initial_learning_rate)
mlp_log.mlperf_print('{}_opt_learning_rate_warmup_epochs'.format(opt_name),
warmup_epochs_)
mlp_log.mlperf_print('{}_opt_end_learning_rate'.format(opt_name),
self.end_learning_rate)
warmup_steps = warmup_epochs_ * steps_per_epoch
self.warmup_steps = tf.cast(warmup_steps, tf.float32)
self.decay_steps = train_steps - warmup_steps + 1
mlp_log.mlperf_print('{}_opt_learning_rate_decay_steps'.format(opt_name),
int(self.decay_steps))
mlp_log.mlperf_print(
'{}_opt_learning_rate_decay_poly_power'.format(opt_name), 2.0)
mlp_log.mlperf_print('{}_opt_momentum'.format(opt_name), FLAGS.momentum)
self.poly_rate_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=self.initial_learning_rate,
decay_steps=self.decay_steps,
end_learning_rate=self.end_learning_rate,
power=2.0)
def __call__(self, step):
if tf.executing_eagerly():
return self._get_learning_rate(step)
# Inside a tf.function or graph, the current optimizer implementation
# repeatedly calls the learning rate schedule and thus creates new ops for
# it each time. To avoid this, we cache the ops when not executing eagerly.
graph = tf.compat.v1.get_default_graph()
if graph not in self.learning_rate_ops_cache:
if self.compute_lr_on_cpu:
with tf.device('/device:CPU:0'):
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
else:
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
return self.learning_rate_ops_cache[graph]
def _get_learning_rate(self, step):
with ops.name_scope_v2(self.name or 'PolynomialDecayWithWarmup') as name:
initial_learning_rate = ops.convert_to_tensor_v2(
self.initial_learning_rate, name='initial_learning_rate')
warmup_steps = ops.convert_to_tensor_v2(
self.warmup_steps, name='warmup_steps')
warmup_rate = (
initial_learning_rate * step / warmup_steps)
poly_steps = math_ops.subtract(step, warmup_steps)
poly_rate = self.poly_rate_scheduler(poly_steps)
decay_rate = tf.where(step <= warmup_steps,
warmup_rate, poly_rate, name=name)
return decay_rate
def get_config(self):
return {
'batch_size': self.batch_size,
'steps_per_epoch': self.steps_per_epoch,
'train_steps': self.train_steps,
'initial_learning_rate': self.initial_learning_rate,
'end_learning_rate': self.end_learning_rate,
'warmup_epochs': self.warmup_epochs,
'name': self.name,
}
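# For reference, the schedule above in closed form (what __call__ computes,
# with warmup_steps and decay_steps as set in __init__, and power=2):
#   lr(step) = initial_learning_rate * step / warmup_steps
#              for step <= warmup_steps, and otherwise
#   lr(step) = (initial_learning_rate - end_learning_rate)
#              * (1 - min(step - warmup_steps, decay_steps) / decay_steps)**2
#              + end_learning_rate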
absl-py
pandas
numpy
tqdm
git+https://github.com/mlcommons/logging.git@0.7.0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import hostlist
import os
import re
import json
from tf2_common.modeling import performance
from tf2_common.training import controller
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.logs import logger
from tf2_common.utils.misc import distribution_utils
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.misc import model_helpers
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_runnable
flags.DEFINE_boolean(name='use_tf_function', default=True,
help='Wrap the train and test step inside a '
'tf.function.')
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
help='Calculate L2_loss on concatenated weights, '
'instead of using Keras per-layer L2 loss.')
flags.DEFINE_boolean(name='cache_decoded_image', default=False,
help='Whether or not to cache decoded images in the '
'input pipeline. If this flag and `cache` are enabled, '
'then TFExample protos will be parsed and then cached '
'which reduces the load on hosts.')
flags.DEFINE_boolean(name='enable_device_warmup', default=False,
help='Whether or not to enable device warmup. This '
'includes training on dummy data and enabling graph/XLA '
'compilation before run_start.')
flags.DEFINE_integer(name='device_warmup_steps', default=1,
help='The number of steps to apply for device warmup.')
flags.DEFINE_integer(name='num_replicas', default=32,
help='The number of TPU cores to use, '
'for log printout only.')
def build_stats(runnable, time_callback):
"""Normalizes and returns dictionary of stats.
Args:
runnable: The module containing all the training and evaluation metrics.
time_callback: Time tracking callback instance.
Returns:
Dictionary of normalized results.
"""
stats = {}
if not runnable.flags_obj.skip_eval:
if runnable.test_loss:
stats['eval_loss'] = runnable.test_loss.result().numpy()
if runnable.test_accuracy:
stats['eval_acc'] = runnable.test_accuracy.result().numpy()
if runnable.train_loss:
stats['train_loss'] = runnable.train_loss.result().numpy()
if runnable.train_accuracy:
stats['train_acc'] = runnable.train_accuracy.result().numpy()
if time_callback:
timestamp_log = time_callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = time_callback.train_finish_time
if time_callback.epoch_runtime_log:
stats['avg_exp_per_second'] = time_callback.average_examples_per_second
return stats
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
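# For example, _steps_to_run(steps_in_current_epoch=950, steps_per_epoch=1000,
# steps_per_loop=64) returns 50, so a train loop never crosses an epoch
# boundary.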
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
mlp_log.mlperf_print('cache_clear', True)
mlp_log.mlperf_print('init_start', None)
mlp_log.mlperf_print('submission_benchmark', 'resnet')
mlp_log.mlperf_print('submission_division', 'closed')
mlp_log.mlperf_print('submission_org', 'google')
mlp_log.mlperf_print(
'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
mlp_log.mlperf_print('submission_status', 'cloud')
common.print_flags(flags_obj)
num_index = flags_obj.task_index
print('num_index',num_index)
# worker = []
# nodelist = os.environ["SLURM_JOB_NODELIST"]
# nodename = os.environ["SLURMD_NODENAME"]
# nodelist = hostlist.expand_hostlist(nodelist)
# print('print nodelist2',nodelist)
# num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
# port_number =40000
# worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
## print('print worker_nodes',worker_nodes)
# for node in worker_nodes:
# for index in range(4):
# print('node',node)
# worker_sockets = ":".join([node, str(port_number + index )])
# worker.append(worker_sockets)
# os.environ['TF_CONFIG'] = json.dumps({
# 'cluster': {
# 'worker': worker
# },
# 'task': {'type': 'worker', 'index': num_index}
# })
#
#
# print({
# 'cluster': {
# 'worker': worker
# },
# 'task': {'type': 'worker', 'index': num_index}
# })
keras_utils.set_session_config(
enable_eager=flags_obj.enable_eager,
enable_xla=flags_obj.enable_xla)
performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
if tf.config.list_physical_devices('GPU'):
if flags_obj.tf_gpu_thread_mode:
datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus)
if not flags_obj.datasets_num_private_threads:
flags_obj.datasets_num_private_threads = datasets_num_private_threads
common.set_cudnn_batchnorm_mode()
# TODO(anj-s): Set data_format without using Keras.
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu,
tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
# strategy = tf.distribute.get_strategy()
# print('after distribution number of replicas : {}'.format(
# strategy.num_replicas_in_sync))
mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
mlp_log.mlperf_print('train_samples',
imagenet_preprocessing.NUM_IMAGES['train'])
mlp_log.mlperf_print('eval_samples',
imagenet_preprocessing.NUM_IMAGES['validation'])
mlp_log.mlperf_print(
'model_bn_span',
int(flags_obj.batch_size /
(flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
eval_steps = common.get_num_eval_steps(flags_obj)
steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
logging.info(
'Training %d epochs, each epoch has %d steps, '
'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
train_epochs * per_epoch_steps, eval_steps)
time_callback = keras_utils.TimeHistory(
flags_obj.batch_size,
flags_obj.log_steps,
logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
with distribution_utils.get_strategy_scope(strategy):
runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
eval_interval = (
flags_obj.epochs_between_evals *
per_epoch_steps if not flags_obj.skip_eval else None)
eval_offset = (
flags_obj.eval_offset_epochs *
per_epoch_steps if not flags_obj.skip_eval else 0)
if eval_offset != 0:
eval_offset -= eval_interval
checkpoint_interval = (
per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
directory=flags_obj.model_dir,
max_to_keep=10,
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
device_warmup_steps = (flags_obj.device_warmup_steps
if flags_obj.enable_device_warmup else 0)
if flags_obj.enable_device_warmup:
logging.info('Warmup for %d steps.', device_warmup_steps)
resnet_controller = controller.Controller(
strategy,
runnable.train,
runnable.evaluate,
runnable.warmup,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
train_steps=per_epoch_steps * train_epochs,
device_warmup_steps=device_warmup_steps,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
eval_steps=eval_steps,
eval_interval=eval_interval,
eval_offset=eval_offset)
if flags_obj.enable_device_warmup:
resnet_controller.warmup()
mlp_log.mlperf_print('init_stop', None)
profile_steps = flags_obj.profile_steps
if profile_steps:
profile_steps = [int(i) for i in profile_steps.split(',')]
if profile_steps[0] < 0:
runnable.trace_start(-1)
time_callback.on_train_begin()
mlp_log.mlperf_print('run_start', None)
mlp_log.mlperf_print(
'block_start',
None,
metadata={
'first_epoch_num':
1,
'epoch_count':
(flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
else flags_obj.epochs_between_evals)
})
resnet_controller.train(evaluate=not flags_obj.skip_eval)
mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
time_callback.on_train_end()
mlp_log.mlperf_print('run_final', None)
stats = build_stats(runnable, time_callback)
return stats
def define_imagenet_keras_flags():
common.define_keras_flags()
flags_core.set_defaults()
flags.adopt_module_key_flags(common)
def main(_):
# tf.keras.backend.set_floatx('float16')
model_helpers.apply_clean(flags.FLAGS)
with logger.benchmark_context(flags.FLAGS):
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
common.define_keras_flags()
app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import hostlist
import os
import re
import json
from tf2_common.modeling import performance
from tf2_common.training import controller
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.logs import logger
from tf2_common.utils.misc import distribution_utils
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.misc import model_helpers
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_runnable
flags.DEFINE_boolean(name='use_tf_function', default=True,
help='Wrap the train and test step inside a '
'tf.function.')
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
help='Calculate L2_loss on concatenated weights, '
'instead of using Keras per-layer L2 loss.')
flags.DEFINE_boolean(name='cache_decoded_image', default=False,
help='Whether or not to cache decoded images in the '
'input pipeline. If this flag and `cache` are enabled, '
'then TFExample protos will be parsed and then cached '
'which reduces the load on hosts.')
flags.DEFINE_boolean(name='enable_device_warmup', default=False,
help='Whether or not to enable device warmup. This '
'includes training on dummy data and enabling graph/XLA '
'compilation before run_start.')
flags.DEFINE_integer(name='device_warmup_steps', default=1,
help='The number of steps to apply for device warmup.')
flags.DEFINE_integer(name='num_replicas', default=32,
help='The number of TPU cores to use, '
'for log printout only.')
def build_stats(runnable, time_callback):
"""Normalizes and returns dictionary of stats.
Args:
runnable: The module containing all the training and evaluation metrics.
time_callback: Time tracking callback instance.
Returns:
Dictionary of normalized results.
"""
stats = {}
if not runnable.flags_obj.skip_eval:
if runnable.test_loss:
stats['eval_loss'] = runnable.test_loss.result().numpy()
if runnable.test_accuracy:
stats['eval_acc'] = runnable.test_accuracy.result().numpy()
if runnable.train_loss:
stats['train_loss'] = runnable.train_loss.result().numpy()
if runnable.train_accuracy:
stats['train_acc'] = runnable.train_accuracy.result().numpy()
if time_callback:
timestamp_log = time_callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = time_callback.train_finish_time
if time_callback.epoch_runtime_log:
stats['avg_exp_per_second'] = time_callback.average_examples_per_second
return stats
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
mlp_log.mlperf_print('cache_clear', True)
mlp_log.mlperf_print('init_start', None)
mlp_log.mlperf_print('submission_benchmark', 'resnet')
mlp_log.mlperf_print('submission_division', 'closed')
mlp_log.mlperf_print('submission_org', 'google')
mlp_log.mlperf_print(
'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
mlp_log.mlperf_print('submission_status', 'cloud')
common.print_flags(flags_obj)
num_index = flags_obj.task_index
print('num_index',num_index)
worker = []
nodelist = os.environ["SLURM_JOB_NODELIST"]
nodename = os.environ["SLURMD_NODENAME"]
nodelist = hostlist.expand_hostlist(nodelist)
print('print nodelist2',nodelist)
num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
port_number = 40000
worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
# print('print worker_nodes',worker_nodes)
for node in worker_nodes:
for index in range(4):
print('node',node)
worker_sockets = ":".join([node, str(port_number + index )])
worker.append(worker_sockets)
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': worker
},
'task': {'type': 'worker', 'index': num_index}
})
print({
'cluster': {
'worker': worker
},
'task': {'type': 'worker', 'index': num_index}
})
keras_utils.set_session_config(
enable_eager=flags_obj.enable_eager,
enable_xla=flags_obj.enable_xla)
performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
if tf.config.list_physical_devices('GPU'):
if flags_obj.tf_gpu_thread_mode:
datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus)
if not flags_obj.datasets_num_private_threads:
flags_obj.datasets_num_private_threads = datasets_num_private_threads
common.set_cudnn_batchnorm_mode()
# TODO(anj-s): Set data_format without using Keras.
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu,
tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
# strategy = tf.distribute.get_strategy()
# print('after distribution number of replicas : {}'.format(
# strategy.num_replicas_in_sync))
mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
mlp_log.mlperf_print('train_samples',
imagenet_preprocessing.NUM_IMAGES['train'])
mlp_log.mlperf_print('eval_samples',
imagenet_preprocessing.NUM_IMAGES['validation'])
mlp_log.mlperf_print(
'model_bn_span',
int(flags_obj.batch_size /
(flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
eval_steps = common.get_num_eval_steps(flags_obj)
steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
logging.info(
'Training %d epochs, each epoch has %d steps, '
'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
train_epochs * per_epoch_steps, eval_steps)
time_callback = keras_utils.TimeHistory(
flags_obj.batch_size,
flags_obj.log_steps,
logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
with distribution_utils.get_strategy_scope(strategy):
runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
eval_interval = (
flags_obj.epochs_between_evals *
per_epoch_steps if not flags_obj.skip_eval else None)
eval_offset = (
flags_obj.eval_offset_epochs *
per_epoch_steps if not flags_obj.skip_eval else 0)
if eval_offset != 0:
eval_offset -= eval_interval
checkpoint_interval = (
per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
directory=flags_obj.model_dir,
max_to_keep=10,
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
device_warmup_steps = (flags_obj.device_warmup_steps
if flags_obj.enable_device_warmup else 0)
if flags_obj.enable_device_warmup:
logging.info('Warmup for %d steps.', device_warmup_steps)
resnet_controller = controller.Controller(
strategy,
runnable.train,
runnable.evaluate,
runnable.warmup,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
train_steps=per_epoch_steps * train_epochs,
device_warmup_steps=device_warmup_steps,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
eval_steps=eval_steps,
eval_interval=eval_interval,
eval_offset=eval_offset)
if flags_obj.enable_device_warmup:
resnet_controller.warmup()
mlp_log.mlperf_print('init_stop', None)
profile_steps = flags_obj.profile_steps
if profile_steps:
profile_steps = [int(i) for i in profile_steps.split(',')]
if profile_steps[0] < 0:
runnable.trace_start(-1)
time_callback.on_train_begin()
mlp_log.mlperf_print('run_start', None)
mlp_log.mlperf_print(
'block_start',
None,
metadata={
'first_epoch_num':
1,
'epoch_count':
(flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
else flags_obj.epochs_between_evals)
})
resnet_controller.train(evaluate=not flags_obj.skip_eval)
mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
time_callback.on_train_end()
mlp_log.mlperf_print('run_final', None)
stats = build_stats(runnable, time_callback)
return stats
def define_imagenet_keras_flags():
common.define_keras_flags()
flags_core.set_defaults()
flags.adopt_module_key_flags(common)
def main(_):
# tf.keras.backend.set_floatx('float16')
model_helpers.apply_clean(flags.FLAGS)
with logger.benchmark_context(flags.FLAGS):
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
common.define_keras_flags()
app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet50 model for Keras.
Adapted from tf.keras.applications.resnet50.ResNet50().
This is ResNet model version 1.5.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
- https://arxiv.org/pdf/1603.05027v2.pdf
- http://torch.ch/blog/2016/02/04/resnets.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
import imagenet_preprocessing
from tensorflow.keras import backend
from tensorflow.keras import initializers
from tensorflow.keras import layers as tf_python_keras_layers
from tensorflow.keras import models
from tensorflow.keras import regularizers
BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5
FLAGS = flags.FLAGS
flags.DEFINE_float(
'weight_decay',
default=1e-4,
help=('Weight decay coefficient for L2 regularization.'))
flags.DEFINE_integer(
'num_accumulation_steps',
default=8,
help=('Number of steps over which to accumulate gradients when using a large batch size.'))
layers = tf_python_keras_layers
def change_keras_layer(use_tf_keras_layers=False):
"""Change layers to either tf.keras.layers or tf.python.keras.layers.
The layer implementation behind tf.keras.layers depends on the TensorFlow
version, while tf.python.keras.layers checks the environment variable
TF2_BEHAVIOR. This is a temporary function for switching to tf.keras.layers.
Currently, the TF v2 batchnorm layer is slower than the TF v1 batchnorm layer,
so this function is useful for tracking benchmark results for each version.
This function will be removed once tf.keras.layers becomes the default.
TODO(b/146939027): Remove this function when tf v2 batchnorm reaches training
speed parity with tf v1 batchnorm.
Args:
use_tf_keras_layers: whether to use tf.keras.layers.
"""
global layers
if use_tf_keras_layers:
layers = tf.keras.layers
else:
layers = tf_python_keras_layers
def _gen_l2_regularizer(use_l2_regularizer=True):
return regularizers.l2(FLAGS.weight_decay) if use_l2_regularizer else None
def identity_block(input_tensor,
kernel_size,
filters,
stage,
block,
use_l2_regularizer=True):
"""The identity block is the block that has no conv layer at shortcut.
Args:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
use_l2_regularizer: whether to use L2 regularizer on Conv layer.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2c')(
x)
x = layers.add([x, input_tensor])
x = layers.Activation('relu')(x)
return x
def conv_block(input_tensor,
kernel_size,
filters,
stage,
block,
strides=(2, 2),
use_l2_regularizer=True):
"""A block that has a conv layer at shortcut.
Note that from stage 3,
the second conv layer at the main path has strides=(2, 2),
and the shortcut has strides=(2, 2) as well.
Args:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
strides: Strides for the second conv layer in the block.
use_l2_regularizer: whether to use L2 regularizer on Conv layer.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
strides=strides,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2c')(
x)
shortcut = layers.Conv2D(
filters3, (1, 1),
strides=strides,
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '1')(
input_tensor)
shortcut = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '1')(
shortcut)
x = layers.add([x, shortcut])
x = layers.Activation('relu')(x)
return x
def resnet50(num_classes,
batch_size=None,
use_l2_regularizer=True,
rescale_inputs=False):
"""Instantiates the ResNet50 architecture.
Args:
num_classes: `int` number of classes for image classification.
batch_size: Size of the batches for each step.
use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
rescale_inputs: whether to rescale inputs from 0 to 1.
Returns:
A Keras model instance.
"""
input_shape = (224, 224, 3)
img_input = layers.Input(shape=input_shape)
if rescale_inputs:
# Hub image modules expect inputs in the range [0, 1]. This rescales these
# inputs to the range expected by the trained model.
x = layers.Lambda(
lambda x: x * 255.0 - backend.constant(
imagenet_preprocessing.CHANNEL_MEANS,
shape=[1, 1, 3],
dtype=x.dtype),
name='rescale')(
img_input)
else:
x = img_input
if backend.image_data_format() == 'channels_first':
x = layers.Lambda(
lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
name='transpose')(x)
bn_axis = 1
else: # channels_last
bn_axis = 3
x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
x = layers.Conv2D(
64, (7, 7),
strides=(2, 2),
padding='valid',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='conv1')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name='bn_conv1')(
x)
x = layers.Activation('relu')(x)
x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
x = conv_block(
x,
3, [64, 64, 256],
stage=2,
block='a',
strides=(1, 1),
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [64, 64, 256],
stage=2,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [64, 64, 256],
stage=2,
block='c',
use_l2_regularizer=use_l2_regularizer)
x = conv_block(
x,
3, [128, 128, 512],
stage=3,
block='a',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [128, 128, 512],
stage=3,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [128, 128, 512],
stage=3,
block='c',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [128, 128, 512],
stage=3,
block='d',
use_l2_regularizer=use_l2_regularizer)
x = conv_block(
x,
3, [256, 256, 1024],
stage=4,
block='a',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='c',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='d',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='e',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='f',
use_l2_regularizer=use_l2_regularizer)
x = conv_block(
x,
3, [512, 512, 2048],
stage=5,
block='a',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [512, 512, 2048],
stage=5,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [512, 512, 2048],
stage=5,
block='c',
use_l2_regularizer=use_l2_regularizer)
rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
x = layers.Dense(
num_classes,
kernel_initializer=initializers.RandomNormal(stddev=0.01),
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='fc1000')(
x)
# print('x.dtype: %s' % x.dtype_policy)
# 'kernel' is dense1's variable
#print('layers.Dense.kernel.dtype: %s' % layers.Dense.kernel.dtype.name)
# A softmax that is followed by the model's loss cannot be computed in
# float16 due to numeric issues, so we pass dtype=float32.
x = layers.Activation('softmax', dtype='float32')(x)
# Create model.
return models.Model(img_input, x, name='resnet50')
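if __name__ == "__main__":
  # Build-only sketch: passing use_l2_regularizer=False skips the FLAGS-backed
  # weight_decay regularizer, so no absl flags need to be parsed before the
  # graph is constructed.
  model = resnet50(num_classes=1000, use_l2_regularizer=False)
  model.summary()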
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
import tensorflow as tf
from tf2_common.training import standard_runnable
from tf2_common.training import utils
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_model
flags.DEFINE_boolean('trace_warmup', default=False,
help='Whether or not to programmatically capture an Xprof'
' trace in the warmup loop.')
class _UnwrapPreventer(object):
"""Wrapper that DistributionStrategy will not unwrap.
Typically, DistributionStrategy will unwrap values when going from a cross-
replica context to a replica context via `call_for_each_replica`. This class
is a wrapper that DistributionStrategy will not unwrap, so it can be used to
prevent it from unwrapping a value.
TODO(reedwm): Find/implement a better way of preventing values from being
unwrapped by DistributionStrategy
"""
__slots__ = ['value']
def __init__(self, value):
self.value = value
class ResnetRunnable(standard_runnable.StandardRunnableWithWarmup):
"""Implements the training and evaluation APIs for Resnet model."""
def __init__(self, flags_obj, time_callback):
standard_runnable.StandardRunnableWithWarmup.__init__(
self,
flags_obj.use_tf_while_loop,
flags_obj.use_tf_function)
self.strategy = tf.distribute.get_strategy()
self.flags_obj = flags_obj
self.dtype = flags_core.get_tf_dtype(flags_obj)
self.time_callback = time_callback
# Input pipeline related
batch_size = flags_obj.batch_size
if batch_size % self.strategy.num_replicas_in_sync != 0:
      raise ValueError(
          'Batch size {} must be divisible by the number of replicas: '
          '{}'.format(batch_size, self.strategy.num_replicas_in_sync))
steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj)
if train_epochs > 1:
train_epochs = flags_obj.train_epochs
    # Auto rebatching is not supported by the
    # `experimental_distribute_datasets_from_function()` API, which is
    # required when cloning the dataset to multiple workers in eager mode,
    # so we use the per-replica batch size here.
self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)
self.synthetic_input_fn = common.get_synth_input_fn(
height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
num_channels=imagenet_preprocessing.NUM_CHANNELS,
num_classes=self.flags_obj.num_classes,
dtype=self.dtype,
drop_remainder=True)
if self.flags_obj.use_synthetic_data:
self.input_fn = self.synthetic_input_fn
else:
self.input_fn = imagenet_preprocessing.input_fn
resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
self.model = resnet_model.resnet50(
num_classes=self.flags_obj.num_classes,
batch_size=flags_obj.batch_size,
use_l2_regularizer=not flags_obj.single_l2_loss_op)
self.use_lars_optimizer = False
self.num_accumulation_steps = self.flags_obj.num_accumulation_steps
if self.flags_obj.optimizer == 'LARS':
self.use_lars_optimizer = True
self.optimizer, _ = common.get_optimizer(
flags_obj=flags_obj,
steps_per_epoch=steps_per_epoch,
train_steps=steps_per_epoch * train_epochs)
# Make sure iterations variable is created inside scope.
self.global_step = self.optimizer.iterations
if self.dtype == tf.float16:
print("enter fp16 computing")
loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
self.optimizer = (
tf.keras.mixed_precision.LossScaleOptimizer(
self.optimizer, dynamic=False, initial_scale=loss_scale))
elif flags_obj.fp16_implementation == 'graph_rewrite':
      # `dtype` is still float32 in this case. We build the graph in float32
      # and let the graph rewrite change parts of it to float16.
if not flags_obj.use_tf_function:
raise ValueError('--fp16_implementation=graph_rewrite requires '
'--use_tf_function to be true')
loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
self.optimizer = (
tf.train.experimental.enable_mixed_precision_graph_rewrite(
self.optimizer, loss_scale))
self.one_hot = False
self.label_smoothing = flags_obj.label_smoothing
if self.label_smoothing and self.label_smoothing > 0:
self.one_hot = True
if flags_obj.report_accuracy_metrics:
self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
if self.one_hot:
self.train_accuracy = tf.keras.metrics.CategoricalAccuracy(
'train_accuracy', dtype=tf.float32)
else:
self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'train_accuracy', dtype=tf.float32)
self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
else:
self.train_loss = None
self.train_accuracy = None
self.test_loss = None
if self.one_hot:
self.test_accuracy = tf.keras.metrics.CategoricalAccuracy(
'test_accuracy', dtype=tf.float32)
else:
self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'test_accuracy', dtype=tf.float32)
# self.test_corrects = tf.keras.metrics.Sum(
# 'test_corrects', dtype=tf.float32)
self.num_eval_steps = common.get_num_eval_steps(flags_obj)
self.checkpoint = tf.train.Checkpoint(
model=self.model, optimizer=self.optimizer)
# Handling epochs.
self.epoch_steps = steps_per_epoch
self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step)
self.steps_per_loop = flags_obj.steps_per_loop
profile_steps = flags_obj.profile_steps
if profile_steps:
profile_steps = [int(i) for i in profile_steps.split(',')]
self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None
self.trace_end_step = profile_steps[1]
else:
self.trace_start_step = None
self.trace_end_step = None
self.epochs_between_evals = flags_obj.epochs_between_evals
self.training_vars = self.model.trainable_variables
self.accum_grads = []
self.accum_grads_dtype = tf.float32
if self.num_accumulation_steps > 1:
for var in self.training_vars:
self.accum_grads.append(self.optimizer.add_weight(
name=var.name + '_accum',
shape=var.shape,
dtype=self.accum_grads_dtype,
initializer='zeros',
trainable=False,
synchronization=tf.VariableSynchronization.ON_READ,
aggregation=tf.VariableAggregation.SUM))
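      # Note: ON_READ synchronization keeps a separate accumulator on each
      # replica (a read in a cross-replica context would SUM them); the
      # per-replica copies are instead pulled out explicitly via
      # `experimental_local_results` in _maybe_apply_grads_and_clear below.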
def build_train_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=True,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=self.flags_obj.drop_train_remainder,
tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack,
dataset_cache=self.flags_obj.training_dataset_cache,
prefetch_batchs=self.flags_obj.training_prefetch_batchs)
def build_eval_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=False,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=self.flags_obj.drop_eval_remainder,
tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack,
dataset_cache=self.flags_obj.eval_dataset_cache,
prefetch_batchs=self.flags_obj.eval_prefetch_batchs)
def build_synthetic_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.strategy,
self.synthetic_input_fn,
is_training=True,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=self.flags_obj.drop_train_remainder,
tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack,
dataset_cache=self.flags_obj.training_dataset_cache,
prefetch_batchs=self.flags_obj.training_prefetch_batchs)
def train_loop_begin(self):
"""See base class."""
# Reset all metrics
if self.train_loss:
self.train_loss.reset_states()
if self.train_accuracy:
self.train_accuracy.reset_states()
self._epoch_begin()
if self.trace_start_step:
global_step = self.global_step.numpy()
next_global_step = global_step + self.steps_per_loop
if (global_step <= self.trace_start_step and
self.trace_start_step < next_global_step):
self.trace_start(global_step)
self.time_callback.on_batch_begin(self.epoch_helper.batch_index)
def train_step(self, iterator):
"""See base class."""
@tf.function(experimental_compile=False)
def local_step(images, labels):
"""Local computation of a step."""
with tf.GradientTape() as tape:
logits = self.model(images, training=True)
if self.one_hot:
prediction_loss = tf.keras.losses.categorical_crossentropy(
labels, logits, label_smoothing=self.label_smoothing)
else:
prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
labels, logits)
loss = tf.reduce_sum(prediction_loss) * (
1.0 / self.flags_obj.batch_size)
        # Save ~3 seconds per epoch on GPU by skipping the L2 loss
        # computation; skipping is only valid when using LARS, which applies
        # weight decay itself. Details in the description of cl/308018913.
if not self.use_lars_optimizer:
num_replicas = self.strategy.num_replicas_in_sync
if self.flags_obj.single_l2_loss_op:
l2_loss = self.flags_obj.weight_decay * 2 * tf.add_n([
tf.nn.l2_loss(v)
for v in self.model.trainable_variables
if 'bn' not in v.name
])
loss += (l2_loss / num_replicas)
else:
loss += (tf.reduce_sum(self.model.losses) / num_replicas)
# Scale the loss
if self.flags_obj.dtype == 'fp16':
loss = self.optimizer.get_scaled_loss(loss)
grads = tape.gradient(loss, self.model.trainable_variables)
# Unscale the grads
if self.flags_obj.dtype == 'fp16':
grads = self.optimizer.get_unscaled_gradients(grads)
return logits, loss, grads
def _maybe_apply_grads_and_clear(distribution):
def _apply_grads_and_clear_for_each_replica():
local_replica_id = tf.get_static_value(
self.strategy.extended._get_local_replica_id(
tf.distribute.get_replica_context().replica_id_in_sync_group))
replica_accum_grads = []
for accum_grad, var in zip(self.accum_grads, self.training_vars):
local_accum_grad = self.strategy.experimental_local_results(
accum_grad)
replica_accum_grad = local_accum_grad[local_replica_id]
replica_accum_grad = tf.cast(replica_accum_grad, var.dtype)
replica_accum_grads.append(replica_accum_grad)
self.optimizer.apply_gradients(
zip(replica_accum_grads, self.training_vars))
for accum_grad in self.accum_grads:
accum_grad.assign(tf.zeros_like(accum_grad,
dtype=self.accum_grads_dtype),
read_value=False)
def _apply_grads_and_clear():
distribution.extended.call_for_each_replica(
_apply_grads_and_clear_for_each_replica,
args=())
return self.optimizer.iterations.assign_add(0, read_value=False)
def _advance_iteration():
return self.optimizer.iterations.assign_add(1, read_value=False)
tf.cond(
tf.equal(self.optimizer.iterations % self.num_accumulation_steps,
self.num_accumulation_steps - 1),
_apply_grads_and_clear,
_advance_iteration)
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
logits, loss, grads = local_step(images, labels)
if self.num_accumulation_steps > 1:
for grad, accum_grad in zip(grads, self.accum_grads):
accum_grad.assign_add(tf.cast(grad, self.accum_grads_dtype),
read_value=False)
tf.distribute.get_replica_context().merge_call(
_maybe_apply_grads_and_clear,
args=())
else:
self.optimizer.apply_gradients(zip(grads, self.training_vars))
if self.train_loss:
self.train_loss.update_state(loss)
if self.train_accuracy:
self.train_accuracy.update_state(labels, logits)
self.strategy.run(step_fn, args=(next(iterator),))
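  # --- Illustrative sketch (not called by the training loop; the method name
  # and `step` argument are hypothetical). The distributed accumulation in
  # train_step boils down to this single-replica pattern: add each micro-batch
  # gradient into its slot and only apply every `num_accumulation_steps` steps.
  def _example_accumulate_and_apply(self, grads, step):
    """Hypothetical single-replica version of the accumulation pattern."""
    # Assumes num_accumulation_steps > 1, so self.accum_grads is populated.
    for grad, accum in zip(grads, self.accum_grads):
      accum.assign_add(tf.cast(grad, self.accum_grads_dtype))
    if (step + 1) % self.num_accumulation_steps == 0:
      casted = [tf.cast(accum, var.dtype)
                for accum, var in zip(self.accum_grads, self.training_vars)]
      self.optimizer.apply_gradients(zip(casted, self.training_vars))
      for accum in self.accum_grads:
        accum.assign(tf.zeros_like(accum, dtype=self.accum_grads_dtype))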
def train_loop_end(self):
"""See base class."""
metrics = {}
if self.train_loss:
metrics['train_loss'] = self.train_loss.result()
if self.train_accuracy:
metrics['train_accuracy'] = self.train_accuracy.result()
self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1)
if self.trace_end_step:
global_step = self.global_step.numpy()
next_global_step = global_step + self.steps_per_loop
if (global_step <= self.trace_end_step and
self.trace_end_step < next_global_step):
self.trace_end(global_step)
self._epoch_end()
return metrics
def eval_begin(self):
"""See base class."""
if self.test_loss:
self.test_loss.reset_states()
if self.test_accuracy:
self.test_accuracy.reset_states()
# self.test_corrects.reset_states()
epoch_num = int(self.epoch_helper.current_epoch)
mlp_log.mlperf_print('eval_start', None,
metadata={'epoch_num': epoch_num + 1})
def eval_step(self, iterator):
"""See base class."""
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
logits = self.model(images, training=False)
if self.test_loss:
if self.one_hot:
loss = tf.keras.losses.categorical_crossentropy(
labels, logits, label_smoothing=self.label_smoothing)
else:
loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size)
self.test_loss.update_state(loss)
if self.test_accuracy:
self.test_accuracy.update_state(labels, logits)
# tf.print('labels.shape: ', labels.shape,
# ', logits.shape: ', logits.shape,
# ', result: ', self.test_accuracy.result())
# self.test_corrects.update_state(
# tf.cast(
# tf.reduce_sum(
# tf.cast(
# tf.equal(
# tf.cast(tf.argmax(logits, axis=1), labels.dtype),
# labels), tf.int32)), tf.float32))
self.strategy.run(step_fn, args=(next(iterator),))
def eval_end(self):
"""See base class."""
epoch_num = int(self.epoch_helper.current_epoch)
mlp_log.mlperf_print('eval_stop', None,
metadata={'epoch_num': epoch_num + 1})
eval_accuracy = float(self.test_accuracy.result())
# eval_accuracy = float(self.test_corrects.result()
# ) / imagenet_preprocessing.NUM_IMAGES['validation']
# eval_accuracy = float(self.test_accuracy.result()) * \
# self.flags_obj.batch_size * self.num_eval_steps / \
# imagenet_preprocessing.NUM_IMAGES['validation']
mlp_log.mlperf_print(
'eval_accuracy', eval_accuracy, metadata={'epoch_num': epoch_num + 1})
first_epoch_num = max(epoch_num - self.epochs_between_evals + 1, 0)
epoch_count = self.epochs_between_evals
if first_epoch_num == 0:
epoch_count = self.flags_obj.eval_offset_epochs
if epoch_count == 0:
epoch_count = self.flags_obj.epochs_between_evals
mlp_log.mlperf_print(
'block_stop',
None,
metadata={
'first_epoch_num': first_epoch_num + 1,
'epoch_count': epoch_count
})
continue_training = True
if eval_accuracy >= self.flags_obj.target_accuracy:
continue_training = False
else:
mlp_log.mlperf_print(
'block_start',
None,
metadata={
'first_epoch_num': epoch_num + 2,
'epoch_count': self.epochs_between_evals
})
results = {}
if self.test_loss:
results['test_loss'] = self.test_loss.result()
if self.test_accuracy:
results['test_accuracy'] = self.test_accuracy.result()
results['continue_training'] = continue_training
return results
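  # Worked example for the block bookkeeping in eval_end above (assuming
  # evals land at 0-based epoch_num 1, 5, 9, ... with eval_offset_epochs=2
  # and epochs_between_evals=4): the first eval reports first_epoch_num=1
  # with epoch_count=eval_offset_epochs=2 (epochs 1-2), the second reports
  # first_epoch_num=3 with epoch_count=4 (epochs 3-6), and so on.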
def warmup_loop_begin(self):
"""See base class."""
if self.flags_obj.trace_warmup:
self.trace_start(-3)
logging.info('Entering the warmup loop.')
def warmup_loop_end(self):
"""See base class."""
if self.flags_obj.trace_warmup:
self.trace_end(-2)
# Reset the state
self.model.reset_states()
tf.keras.backend.set_value(self.optimizer.iterations, 0)
for accum_grad in self.accum_grads:
accum_grad.assign(tf.zeros_like(accum_grad,
dtype=self.accum_grads_dtype),
read_value=False)
logging.info('Exiting the warmup loop.')
def _epoch_begin(self):
if self.epoch_helper.epoch_begin():
self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch)
def _epoch_end(self):
# mlp_log.mlperf_print('epoch_stop', None)
if self.epoch_helper.epoch_end():
self.time_callback.on_epoch_end(self.epoch_helper.current_epoch)
def trace_start(self, global_step):
logging.info('Starting tracing at step %d.', global_step)
tf.profiler.experimental.start(self.flags_obj.model_dir)
def trace_end(self, global_step):
logging.info('Ending trace at step %d', global_step)
tf.profiler.experimental.stop()
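# Example launch command (as committed). Roughly: XLA_FLAGS dumps HLO before
# and after every pass as HTML under ./tmp (on this ROCm/DTK build the
# cuda_data_dir flag appears to point at the amdgcn bitcode directory),
# TF_DUMP_GRAPH_PREFIX dumps TensorFlow graphs to ./tf_graph, and
# `hipprof --hip-trace` wraps the run to collect a HIP API trace.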
XLA_FLAGS="--xla_gpu_cuda_data_dir=/public/software/compiler/rocm/dtk-21.10.1/amdgcn/bitcode/ --xla_dump_hlo_pass_re=.* --xla_dump_hlo_as_html --xla_dump_to=./tmp" TF_DUMP_GRAPH_PREFIX="./tf_graph" hipprof --hip-trace python3 ./resnet_ctl_imagenet_main.py \
--base_learning_rate=10.0 \
--batch_size=32 \
--nocache_decoded_image \
--data_dir=/public/software/apps/DeepLearning/Data/ImageNet-tensorflow \
--device_warmup_steps=1 \
--dtype=fp32 \
--noenable_checkpoint_and_export \
--noenable_device_warmup \
--enable_eager \
--epochs_between_evals=4 \
--noeval_dataset_cache \
--eval_offset_epochs=2 \
--label_smoothing=0.1 \
--lars_epsilon=0 \
--log_steps=125 \
--lr_schedule=polynomial \
--optimizer=LARS \
--noreport_accuracy_metrics \
--single_l2_loss_op \
--steps_per_loop=25 \
--train_epochs=1 \
--notraining_dataset_cache \
--notrace_warmup \
--nouse_synthetic_data \
--use_tf_function \
--verbosity=0 \
--warmup_epochs=5 \
--weight_decay=0.0002 \
--target_accuracy=0.759 \
--momentum=0.9 \
--num_replicas=64 \
--num_accumulation_steps=2 \
--num_classes=1000 \
--noskip_eval
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to training performance."""
import tensorflow as tf
def configure_optimizer(optimizer,
use_float16=False,
use_graph_rewrite=False,
loss_scale="dynamic"):
"""Configures optimizer object with performance options."""
if use_float16:
# Wraps optimizer with a LossScaleOptimizer. This is done automatically
# in compile() with the "mixed_float16" policy, but since we do not call
# compile(), we must wrap the optimizer manually.
optimizer = (
tf.keras.mixed_precision.experimental.LossScaleOptimizer(
optimizer, loss_scale=loss_scale))
if use_graph_rewrite:
    # Note: the model dtype must be 'float32', which ensures
    # tf.keras.mixed_precision and
    # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
    # double up.
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
optimizer)
return optimizer
def set_mixed_precision_policy(dtype, loss_scale=None):
"""Sets mix precision policy."""
if dtype == tf.float16:
print("enter the tf.float16 set policy")
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_float16', loss_scale=loss_scale)
tf.keras.mixed_precision.experimental.set_policy(policy)
print('Compute dtype: %s' % policy.compute_dtype)
print('Variable dtype: %s' % policy.variable_dtype)
# tf.keras.mixed_precision.experimental.set_policy('float16')
elif dtype == tf.bfloat16:
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_bfloat16')
tf.keras.mixed_precision.experimental.set_policy(policy)
elif dtype == tf.float32:
tf.keras.mixed_precision.experimental.set_policy('float32')
else:
raise ValueError("Unexpected dtype: %s" % dtype)