Commit 05631eec authored by liangjing

version 1

parent 7e0391d9
grep eval_accuracy "$1" | awk -F eval_accuracy '{print $2}' | awk -F value '{print $2}' | awk '{print $2}' | uniq
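# The pipeline above pulls the eval_accuracy entries out of the training log
# passed as $1 (the mlp_log/MLPerf-style output produced by the scripts below)
# and prints the unique accuracy values in order of appearance.
# Usage sketch (script and log file names are placeholders):
#   bash parse_eval_accuracy.sh path/to/resnet_train.log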
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Layer-wise Adaptive Rate Scaling optimizer for large-batch training."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
# from tf2_common.training import optimizer_v2modified
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import linalg_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.training import training_ops
from tensorflow.python.ops import state_ops
# class LARSOptimizer(optimizer_v2modified.OptimizerV2Modified):
#class LARSOptimizer(optimizer_v2.OptimizerV2):
class LARSOptimizer(tf.keras.optimizers.Optimizer):
"""Layer-wise Adaptive Rate Scaling for large batch training.
Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
Implements the LARS learning rate scheme presented in the paper above. This
optimizer is useful when scaling the batch size up to 32K without
significant performance degradation. It is recommended to use the optimizer
in conjunction with:
- Gradual learning rate warm-up
- Linear learning rate scaling
- Poly rule learning rate decay
Note, LARS scaling is currently only enabled for dense tensors. Sparse tensors
use the default momentum optimizer.
"""
def __init__(
self,
learning_rate,
momentum=0.9,
weight_decay=0.0001,
# The LARS coefficient is a hyperparameter
eeta=0.001,
epsilon=0.0,
name="LARSOptimizer",
# Enable skipping variables from LARS scaling.
# TODO(sameerkm): Enable a direct mechanism to pass a
# subset of variables to the optimizer.
skip_list=None,
use_nesterov=False,
**kwargs):
"""Construct a new LARS Optimizer.
Args:
learning_rate: A `Tensor`, floating point value, or a schedule that is a
`tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
that takes no arguments and returns the actual value to use. The
learning rate.
momentum: A floating point value. Momentum hyperparameter.
weight_decay: A floating point value. Weight decay hyperparameter.
eeta: LARS coefficient as used in the paper. Default set to the LARS
coefficient from the paper. (eeta / weight_decay) determines the highest
scaling factor in LARS.
epsilon: Optional epsilon parameter to be set in models that have very
small gradients. Default set to 0.0.
name: Optional name prefix for variables and ops created by LARSOptimizer.
skip_list: List of strings to enable skipping variables from LARS scaling.
If any of the strings in skip_list is a substring of var.name, the variable
'var' is skipped from LARS scaling. For a typical classification model
with batch normalization, the skip_list is ['batch_normalization',
'bias']
use_nesterov: when set to True, Nesterov momentum is enabled.
**kwargs: keyword arguments.
Raises:
ValueError: If a hyperparameter is set to a nonsensical value.
"""
if momentum < 0.0:
raise ValueError("momentum should be non-negative: %s" % momentum)
if weight_decay < 0.0:
raise ValueError("weight_decay should be non-negative: %s" % weight_decay)
super(LARSOptimizer, self).__init__(name=name, **kwargs)
self._set_hyper("learning_rate", learning_rate)
# When class members are used directly, instead of
# _set_hyper and _get_hyper (as is done for learning_rate above),
# the values are fixed after __init__() and are not
# updated during training.
# This provides better performance but less flexibility.
self.momentum = momentum
self.weight_decay = weight_decay
self.eeta = eeta
self.epsilon = epsilon or backend_config.epsilon()
self._skip_list = skip_list
self.use_nesterov = use_nesterov
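# Note: the learning rate is called on the current step in _prepare_local
# below, so `learning_rate` is expected to be a callable schedule (e.g. a
# tf.keras.optimizers.schedules.LearningRateSchedule) rather than a bare float.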
def _prepare_local(self, var_device, var_dtype, apply_state):
lr_t = self._get_hyper("learning_rate", var_dtype)
local_step = math_ops.cast(self.iterations, var_dtype)
lr_t = math_ops.cast(lr_t(local_step), var_dtype)
learning_rate_t = array_ops.identity(lr_t)
apply_state[(var_device, var_dtype)].update(
dict(
learning_rate=learning_rate_t,
))
def _create_slots(self, var_list):
for v in var_list:
self.add_slot(v, "momentum")
def compute_lr(self, grad, var, coefficients):
scaled_lr = coefficients["learning_rate"]
if self._skip_list is None or not any(v in var.name
for v in self._skip_list):
w_norm = linalg_ops.norm(var, ord=2)
g_norm = linalg_ops.norm(grad, ord=2)
trust_ratio = array_ops.where(
math_ops.greater(w_norm, 0),
array_ops.where(
math_ops.greater(g_norm, 0),
(self.eeta * w_norm /
(g_norm + self.weight_decay * w_norm + self.epsilon)), 1.0), 1.0)
scaled_lr = coefficients["learning_rate"] * trust_ratio
# Add the weight regularization gradient
grad = grad + self.weight_decay * var
return scaled_lr, grad
def _apply_dense(self, grad, var, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
scaled_lr, grad = self.compute_lr(grad, var, coefficients)
mom = self.get_slot(var, "momentum")
return training_ops.apply_momentum(
var,
mom,
math_ops.cast(1.0, var.dtype.base_dtype),
grad * scaled_lr,
self.momentum,
use_locking=False,
use_nesterov=self.use_nesterov)
def _resource_apply_dense(self, grad, var, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
scaled_lr, grad = self.compute_lr(grad, var, coefficients)
mom = self.get_slot(var, "momentum")
# Use ApplyKerasMomentum instead of ApplyMomentum
# training_ops.resource_apply_keras_momentum(
# var.handle,
# mom.handle,
# scaled_lr,
# grad,
# coefficients["momentum"],
# use_locking=False,
# use_nesterov=self.use_nesterov)
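# Manual Keras-style momentum update, in place of the commented-out
# resource_apply_keras_momentum op above:
#   m <- momentum * m - scaled_lr * g
#   v <- v + m                                  (use_nesterov=False)
#   v <- v + momentum * m - scaled_lr * g       (use_nesterov=True)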
mom_t = mom * self.momentum - grad * scaled_lr
mom_t = state_ops.assign(mom, mom_t, use_locking=False)
if self.use_nesterov:
var_t = var + mom_t * self.momentum - grad * scaled_lr
else:
var_t = var + mom_t
return state_ops.assign(var, var_t, use_locking=False).op
# Fallback to momentum optimizer for sparse tensors
def _apply_sparse(self, grad, var, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
mom = self.get_slot(var, "momentum")
return training_ops.sparse_apply_momentum(
var,
mom,
coefficients["learning_rate"],
grad.values,
grad.indices,
self.momentum,
use_locking=False,
use_nesterov=self.use_nesterov)
def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
var_device, var_dtype = var.device, var.dtype.base_dtype
coefficients = ((apply_state or {}).get((var_device, var_dtype))
or self._fallback_apply_state(var_device, var_dtype))
mom = self.get_slot(var, "momentum")
return training_ops.resource_sparse_apply_keras_momentum(
var.handle,
mom.handle,
coefficients["learning_rate"],
grad,
indices,
self.momentum,
use_locking=False,
use_nesterov=self.use_nesterov)
def get_config(self):
config = super(LARSOptimizer, self).get_config()
config.update({
"learning_rate": self._serialize_hyperparameter("learning_rate"),
"momentum": self.momentum,
"weight_decay": self.weight_decay,
"eeta": self.eeta,
"epsilon": self.epsilon,
"use_nesterov": self.use_nesterov,
})
return config
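if __name__ == "__main__":
  # Minimal usage sketch, not part of the training pipeline. It assumes a
  # TF 2.x release where tf.keras.optimizers.Optimizer is the legacy
  # OptimizerV2 base class this file is written against, and it passes a
  # LearningRateSchedule because _prepare_local above calls the learning rate
  # on the current step.
  schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=0.1, decay_steps=100, end_learning_rate=1e-4)
  optimizer = LARSOptimizer(
      learning_rate=schedule,
      skip_list=["batch_normalization", "bias"])
  kernel = tf.Variable(tf.random.normal([8, 8]), name="dense_kernel")
  with tf.GradientTape() as tape:
    loss = tf.reduce_sum(tf.square(kernel))
  grads = tape.gradient(loss, [kernel])
  optimizer.apply_gradients(zip(grads, [kernel]))
  print("iterations:", int(optimizer.iterations.numpy()))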
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Enable Layer-wise Adaptive Rate Scaling optimizer in ResNet."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
from tf2_common.utils.mlp_log import mlp_log
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import math_ops
FLAGS = flags.FLAGS
def define_lars_flags():
"""Defines flags needed by LARS optimizer."""
flags.DEFINE_float(
'end_learning_rate', default=None,
help=('Polynomial decay end learning rate.'))
flags.DEFINE_float(
'lars_epsilon', default=0.0,
help=('Override autoselected LARS epsilon.'))
flags.DEFINE_float(
'warmup_epochs', default=None,
help=('Override autoselected polynomial decay warmup epochs.'))
flags.DEFINE_float(
'momentum',
default=0.9,
help=('Momentum parameter used in the MomentumOptimizer.'))
class PolynomialDecayWithWarmup(
tf.keras.optimizers.schedules.LearningRateSchedule):
"""A LearningRateSchedule that uses a polynomial decay with warmup."""
def __init__(
self,
batch_size,
steps_per_epoch,
train_steps,
initial_learning_rate=None,
end_learning_rate=None,
warmup_epochs=None,
compute_lr_on_cpu=False,
name=None):
"""Applies a polynomial decay to the learning rate with warmup."""
super(PolynomialDecayWithWarmup, self).__init__()
self.batch_size = batch_size
self.steps_per_epoch = steps_per_epoch
self.train_steps = train_steps
self.name = name
self.learning_rate_ops_cache = {}
self.compute_lr_on_cpu = compute_lr_on_cpu
if batch_size < 16384:
self.initial_learning_rate = 10.0
warmup_epochs_ = 5
elif batch_size < 32768:
self.initial_learning_rate = 25.0
warmup_epochs_ = 5
else:
self.initial_learning_rate = 31.2
warmup_epochs_ = 25
# Override default poly learning rate and warmup epochs
if initial_learning_rate:
self.initial_learning_rate = initial_learning_rate
if end_learning_rate:
self.end_learning_rate = end_learning_rate
else:
self.end_learning_rate = 0.0001
if warmup_epochs is not None:
warmup_epochs_ = warmup_epochs
self.warmup_epochs = warmup_epochs_
opt_name = FLAGS.optimizer.lower()
mlp_log.mlperf_print('opt_name', opt_name)
if opt_name == 'lars':
mlp_log.mlperf_print('{}_epsilon'.format(opt_name), FLAGS.lars_epsilon)
mlp_log.mlperf_print('{}_opt_weight_decay'.format(opt_name),
FLAGS.weight_decay)
mlp_log.mlperf_print('{}_opt_base_learning_rate'.format(opt_name),
self.initial_learning_rate)
mlp_log.mlperf_print('{}_opt_learning_rate_warmup_epochs'.format(opt_name),
warmup_epochs_)
mlp_log.mlperf_print('{}_opt_end_learning_rate'.format(opt_name),
self.end_learning_rate)
warmup_steps = warmup_epochs_ * steps_per_epoch
self.warmup_steps = tf.cast(warmup_steps, tf.float32)
self.decay_steps = train_steps - warmup_steps + 1
mlp_log.mlperf_print('{}_opt_learning_rate_decay_steps'.format(opt_name),
int(self.decay_steps))
mlp_log.mlperf_print(
'{}_opt_learning_rate_decay_poly_power'.format(opt_name), 2.0)
mlp_log.mlperf_print('{}_opt_momentum'.format(opt_name), FLAGS.momentum)
self.poly_rate_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
initial_learning_rate=self.initial_learning_rate,
decay_steps=self.decay_steps,
end_learning_rate=self.end_learning_rate,
power=2.0)
def __call__(self, step):
if tf.executing_eagerly():
return self._get_learning_rate(step)
# Inside a tf.function or graph, the current optimizer implementation
# repeatedly calls the learning rate schedule and thus creates new ops for
# it each time. To avoid this, we cache the ops when not executing eagerly.
graph = tf.compat.v1.get_default_graph()
if graph not in self.learning_rate_ops_cache:
if self.compute_lr_on_cpu:
with tf.device('/device:CPU:0'):
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
else:
self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
return self.learning_rate_ops_cache[graph]
def _get_learning_rate(self, step):
with ops.name_scope_v2(self.name or 'PolynomialDecayWithWarmup') as name:
initial_learning_rate = ops.convert_to_tensor_v2(
self.initial_learning_rate, name='initial_learning_rate')
warmup_steps = ops.convert_to_tensor_v2(
self.warmup_steps, name='warmup_steps')
warmup_rate = (
initial_learning_rate * step / warmup_steps)
poly_steps = math_ops.subtract(step, warmup_steps)
poly_rate = self.poly_rate_scheduler(poly_steps)
decay_rate = tf.where(step <= warmup_steps,
warmup_rate, poly_rate, name=name)
return decay_rate
def get_config(self):
return {
'batch_size': self.batch_size,
'steps_per_epoch': self.steps_per_epoch,
'train_steps': self.train_steps,
'initial_learning_rate': self.initial_learning_rate,
'end_learning_rate': self.end_learning_rate,
'warmup_epochs': self.warmup_epochs,
'name': self.name,
}
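# For reference, the schedule above in closed form (what __call__ computes,
# with warmup_steps and decay_steps as set in __init__, and power=2):
#   lr(step) = initial_learning_rate * step / warmup_steps
#              for step <= warmup_steps, and otherwise
#   lr(step) = (initial_learning_rate - end_learning_rate)
#              * (1 - min(step - warmup_steps, decay_steps) / decay_steps)**2
#              + end_learning_rate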
absl-py
pandas
numpy
tqdm
git+https://github.com/mlcommons/logging.git@0.7.0
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import hostlist
import os
import re
import json
from tf2_common.modeling import performance
from tf2_common.training import controller
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.logs import logger
from tf2_common.utils.misc import distribution_utils
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.misc import model_helpers
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_runnable
flags.DEFINE_boolean(name='use_tf_function', default=True,
help='Wrap the train and test step inside a '
'tf.function.')
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
help='Calculate L2_loss on concatenated weights, '
'instead of using Keras per-layer L2 loss.')
flags.DEFINE_boolean(name='cache_decoded_image', default=False,
help='Whether or not to cache decoded images in the '
'input pipeline. If this flag and `cache` are enabled, '
'then TFExample protos will be parsed and then cached '
'which reduces the load on hosts.')
flags.DEFINE_boolean(name='enable_device_warmup', default=False,
help='Whether or not to enable device warmup. This '
'includes training on dummy data and enabling graph/XLA '
'compilation before run_start.')
flags.DEFINE_integer(name='device_warmup_steps', default=1,
help='The number of steps to apply for device warmup.')
flags.DEFINE_integer(name='num_replicas', default=32,
help='The number of TPU cores to use, '
'for log printout only.')
def build_stats(runnable, time_callback):
"""Normalizes and returns dictionary of stats.
Args:
runnable: The module containing all the training and evaluation metrics.
time_callback: Time tracking callback instance.
Returns:
Dictionary of normalized results.
"""
stats = {}
if not runnable.flags_obj.skip_eval:
if runnable.test_loss:
stats['eval_loss'] = runnable.test_loss.result().numpy()
if runnable.test_accuracy:
stats['eval_acc'] = runnable.test_accuracy.result().numpy()
if runnable.train_loss:
stats['train_loss'] = runnable.train_loss.result().numpy()
if runnable.train_accuracy:
stats['train_acc'] = runnable.train_accuracy.result().numpy()
if time_callback:
timestamp_log = time_callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = time_callback.train_finish_time
if time_callback.epoch_runtime_log:
stats['avg_exp_per_second'] = time_callback.average_examples_per_second
return stats
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
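# For example, _steps_to_run(steps_in_current_epoch=950, steps_per_epoch=1000,
# steps_per_loop=64) returns 50, so a train loop never crosses an epoch
# boundary.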
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
mlp_log.mlperf_print('cache_clear', True)
mlp_log.mlperf_print('init_start', None)
mlp_log.mlperf_print('submission_benchmark', 'resnet')
mlp_log.mlperf_print('submission_division', 'closed')
mlp_log.mlperf_print('submission_org', 'google')
mlp_log.mlperf_print(
'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
mlp_log.mlperf_print('submission_status', 'cloud')
common.print_flags(flags_obj)
num_index = flags_obj.task_index
print('num_index',num_index)
# worker = []
# nodelist = os.environ["SLURM_JOB_NODELIST"]
# nodename = os.environ["SLURMD_NODENAME"]
# nodelist = hostlist.expand_hostlist(nodelist)
# print('print nodelist2',nodelist)
# num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
# port_number =40000
# worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
## print('print worker_nodes',worker_nodes)
# for node in worker_nodes:
# for index in range(4):
# print('node',node)
# worker_sockets = ":".join([node, str(port_number + index )])
# worker.append(worker_sockets)
# os.environ['TF_CONFIG'] = json.dumps({
# 'cluster': {
# 'worker': worker
# },
# 'task': {'type': 'worker', 'index': num_index}
# })
#
#
# print({
# 'cluster': {
# 'worker': worker
# },
# 'task': {'type': 'worker', 'index': num_index}
# })
keras_utils.set_session_config(
enable_eager=flags_obj.enable_eager,
enable_xla=flags_obj.enable_xla)
performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
if tf.config.list_physical_devices('GPU'):
if flags_obj.tf_gpu_thread_mode:
datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus)
if not flags_obj.datasets_num_private_threads:
flags_obj.datasets_num_private_threads = datasets_num_private_threads
common.set_cudnn_batchnorm_mode()
# TODO(anj-s): Set data_format without using Keras.
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu,
tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
# strategy = tf.distribute.get_strategy()
# print('after distribution number of replicas : {}'.format(
# strategy.num_replicas_in_sync))
mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
mlp_log.mlperf_print('train_samples',
imagenet_preprocessing.NUM_IMAGES['train'])
mlp_log.mlperf_print('eval_samples',
imagenet_preprocessing.NUM_IMAGES['validation'])
mlp_log.mlperf_print(
'model_bn_span',
int(flags_obj.batch_size /
(flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
eval_steps = common.get_num_eval_steps(flags_obj)
steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
logging.info(
'Training %d epochs, each epoch has %d steps, '
'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
train_epochs * per_epoch_steps, eval_steps)
time_callback = keras_utils.TimeHistory(
flags_obj.batch_size,
flags_obj.log_steps,
logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
with distribution_utils.get_strategy_scope(strategy):
runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
eval_interval = (
flags_obj.epochs_between_evals *
per_epoch_steps if not flags_obj.skip_eval else None)
eval_offset = (
flags_obj.eval_offset_epochs *
per_epoch_steps if not flags_obj.skip_eval else 0)
if eval_offset != 0:
eval_offset -= eval_interval
checkpoint_interval = (
per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
directory=flags_obj.model_dir,
max_to_keep=10,
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
device_warmup_steps = (flags_obj.device_warmup_steps
if flags_obj.enable_device_warmup else 0)
if flags_obj.enable_device_warmup:
logging.info('Warmup for %d steps.', device_warmup_steps)
resnet_controller = controller.Controller(
strategy,
runnable.train,
runnable.evaluate,
runnable.warmup,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
train_steps=per_epoch_steps * train_epochs,
device_warmup_steps=device_warmup_steps,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
eval_steps=eval_steps,
eval_interval=eval_interval,
eval_offset=eval_offset)
if flags_obj.enable_device_warmup:
resnet_controller.warmup()
mlp_log.mlperf_print('init_stop', None)
profile_steps = flags_obj.profile_steps
if profile_steps:
profile_steps = [int(i) for i in profile_steps.split(',')]
if profile_steps[0] < 0:
runnable.trace_start(-1)
time_callback.on_train_begin()
mlp_log.mlperf_print('run_start', None)
mlp_log.mlperf_print(
'block_start',
None,
metadata={
'first_epoch_num':
1,
'epoch_count':
(flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
else flags_obj.epochs_between_evals)
})
resnet_controller.train(evaluate=not flags_obj.skip_eval)
mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
time_callback.on_train_end()
mlp_log.mlperf_print('run_final', None)
stats = build_stats(runnable, time_callback)
return stats
def define_imagenet_keras_flags():
common.define_keras_flags()
flags_core.set_defaults()
flags.adopt_module_key_flags(common)
def main(_):
# tf.keras.backend.set_floatx('float16')
model_helpers.apply_clean(flags.FLAGS)
with logger.benchmark_context(flags.FLAGS):
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
common.define_keras_flags()
app.run(main)
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import tensorflow as tf
import hostlist
import os
import re
import json
from tf2_common.modeling import performance
from tf2_common.training import controller
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.logs import logger
from tf2_common.utils.misc import distribution_utils
from tf2_common.utils.misc import keras_utils
from tf2_common.utils.misc import model_helpers
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_runnable
flags.DEFINE_boolean(name='use_tf_function', default=True,
help='Wrap the train and test step inside a '
'tf.function.')
flags.DEFINE_boolean(name='single_l2_loss_op', default=False,
help='Calculate L2_loss on concatenated weights, '
'instead of using Keras per-layer L2 loss.')
flags.DEFINE_boolean(name='cache_decoded_image', default=False,
help='Whether or not to cache decoded images in the '
'input pipeline. If this flag and `cache` are enabled, '
'then TFExample protos will be parsed and then cached '
'which reduces the load on hosts.')
flags.DEFINE_boolean(name='enable_device_warmup', default=False,
help='Whether or not to enable device warmup. This '
'includes training on dummy data and enabling graph/XLA '
'compilation before run_start.')
flags.DEFINE_integer(name='device_warmup_steps', default=1,
help='The number of steps to apply for device warmup.')
flags.DEFINE_integer(name='num_replicas', default=32,
help='The number of TPU cores to use, '
'for log printout only.')
def build_stats(runnable, time_callback):
"""Normalizes and returns dictionary of stats.
Args:
runnable: The module containing all the training and evaluation metrics.
time_callback: Time tracking callback instance.
Returns:
Dictionary of normalized results.
"""
stats = {}
if not runnable.flags_obj.skip_eval:
if runnable.test_loss:
stats['eval_loss'] = runnable.test_loss.result().numpy()
if runnable.test_accuracy:
stats['eval_acc'] = runnable.test_accuracy.result().numpy()
if runnable.train_loss:
stats['train_loss'] = runnable.train_loss.result().numpy()
if runnable.train_accuracy:
stats['train_acc'] = runnable.train_accuracy.result().numpy()
if time_callback:
timestamp_log = time_callback.timestamp_log
stats['step_timestamp_log'] = timestamp_log
stats['train_finish_time'] = time_callback.train_finish_time
if time_callback.epoch_runtime_log:
stats['avg_exp_per_second'] = time_callback.average_examples_per_second
return stats
def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
"""Calculates steps to run on device."""
if steps_per_loop <= 0:
raise ValueError('steps_per_loop should be positive integer.')
if steps_per_loop == 1:
return steps_per_loop
return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
def run(flags_obj):
"""Run ResNet ImageNet training and eval loop using custom training loops.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
mlp_log.mlperf_print('cache_clear', True)
mlp_log.mlperf_print('init_start', None)
mlp_log.mlperf_print('submission_benchmark', 'resnet')
mlp_log.mlperf_print('submission_division', 'closed')
mlp_log.mlperf_print('submission_org', 'google')
mlp_log.mlperf_print(
'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
mlp_log.mlperf_print('submission_status', 'cloud')
common.print_flags(flags_obj)
num_index = flags_obj.task_index
print('num_index',num_index)
worker = []
nodelist = os.environ["SLURM_JOB_NODELIST"]
nodename = os.environ["SLURMD_NODENAME"]
nodelist = hostlist.expand_hostlist(nodelist)
print('print nodelist2',nodelist)
num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
port_number = 40000
worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
# print('print worker_nodes',worker_nodes)
for node in worker_nodes:
for index in range(4):
print('node',node)
worker_sockets = ":".join([node, str(port_number + index )])
worker.append(worker_sockets)
os.environ['TF_CONFIG'] = json.dumps({
'cluster': {
'worker': worker
},
'task': {'type': 'worker', 'index': num_index}
})
print({
'cluster': {
'worker': worker
},
'task': {'type': 'worker', 'index': num_index}
})
keras_utils.set_session_config(
enable_eager=flags_obj.enable_eager,
enable_xla=flags_obj.enable_xla)
performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
if tf.config.list_physical_devices('GPU'):
if flags_obj.tf_gpu_thread_mode:
datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus)
if not flags_obj.datasets_num_private_threads:
flags_obj.datasets_num_private_threads = datasets_num_private_threads
common.set_cudnn_batchnorm_mode()
# TODO(anj-s): Set data_format without using Keras.
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first'
if tf.test.is_built_with_cuda() else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs,
tpu_address=flags_obj.tpu,
tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
# strategy = tf.distribute.get_strategy()
# print('after distribution number of replicas : {}'.format(
# strategy.num_replicas_in_sync))
mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
mlp_log.mlperf_print('train_samples',
imagenet_preprocessing.NUM_IMAGES['train'])
mlp_log.mlperf_print('eval_samples',
imagenet_preprocessing.NUM_IMAGES['validation'])
mlp_log.mlperf_print(
'model_bn_span',
int(flags_obj.batch_size /
(flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
eval_steps = common.get_num_eval_steps(flags_obj)
steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
logging.info(
'Training %d epochs, each epoch has %d steps, '
'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
train_epochs * per_epoch_steps, eval_steps)
time_callback = keras_utils.TimeHistory(
flags_obj.batch_size,
flags_obj.log_steps,
logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
with distribution_utils.get_strategy_scope(strategy):
runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
eval_interval = (
flags_obj.epochs_between_evals *
per_epoch_steps if not flags_obj.skip_eval else None)
eval_offset = (
flags_obj.eval_offset_epochs *
per_epoch_steps if not flags_obj.skip_eval else 0)
if eval_offset != 0:
eval_offset -= eval_interval
checkpoint_interval = (
per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
checkpoint_manager = tf.train.CheckpointManager(
runnable.checkpoint,
directory=flags_obj.model_dir,
max_to_keep=10,
step_counter=runnable.global_step,
checkpoint_interval=checkpoint_interval)
device_warmup_steps = (flags_obj.device_warmup_steps
if flags_obj.enable_device_warmup else 0)
if flags_obj.enable_device_warmup:
logging.info('Warmup for %d steps.', device_warmup_steps)
resnet_controller = controller.Controller(
strategy,
runnable.train,
runnable.evaluate,
runnable.warmup,
global_step=runnable.global_step,
steps_per_loop=steps_per_loop,
train_steps=per_epoch_steps * train_epochs,
device_warmup_steps=device_warmup_steps,
checkpoint_manager=checkpoint_manager,
summary_interval=summary_interval,
eval_steps=eval_steps,
eval_interval=eval_interval,
eval_offset=eval_offset)
if flags_obj.enable_device_warmup:
resnet_controller.warmup()
mlp_log.mlperf_print('init_stop', None)
profile_steps = flags_obj.profile_steps
if profile_steps:
profile_steps = [int(i) for i in profile_steps.split(',')]
if profile_steps[0] < 0:
runnable.trace_start(-1)
time_callback.on_train_begin()
mlp_log.mlperf_print('run_start', None)
mlp_log.mlperf_print(
'block_start',
None,
metadata={
'first_epoch_num':
1,
'epoch_count':
(flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
else flags_obj.epochs_between_evals)
})
resnet_controller.train(evaluate=not flags_obj.skip_eval)
mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
time_callback.on_train_end()
mlp_log.mlperf_print('run_final', None)
stats = build_stats(runnable, time_callback)
return stats
def define_imagenet_keras_flags():
common.define_keras_flags()
flags_core.set_defaults()
flags.adopt_module_key_flags(common)
def main(_):
# tf.keras.backend.set_floatx('float16')
model_helpers.apply_clean(flags.FLAGS)
with logger.benchmark_context(flags.FLAGS):
stats = run(flags.FLAGS)
logging.info('Run stats:\n%s', stats)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
common.define_keras_flags()
app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet50 model for Keras.
Adapted from tf.keras.applications.resnet50.ResNet50().
This is ResNet model version 1.5.
Related papers/blogs:
- https://arxiv.org/abs/1512.03385
- https://arxiv.org/pdf/1603.05027v2.pdf
- http://torch.ch/blog/2016/02/04/resnets.html
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
import tensorflow as tf
import imagenet_preprocessing
from tensorflow.keras import backend
from tensorflow.keras import initializers
from tensorflow.keras import layers as tf_python_keras_layers
from tensorflow.keras import models
from tensorflow.keras import regularizers
BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5
FLAGS = flags.FLAGS
flags.DEFINE_float(
'weight_decay',
default=1e-4,
help=('Weight decay coefficient for L2 regularization.'))
flags.DEFINE_integer(
'num_accumulation_steps',
default=8,
help=('Number of steps over which to accumulate gradients when using a large batch size.'))
layers = tf_python_keras_layers
def change_keras_layer(use_tf_keras_layers=False):
"""Change layers to either tf.keras.layers or tf.python.keras.layers.
The layer implementation behind tf.keras.layers depends on the TensorFlow
version, while tf.python.keras.layers checks the environment variable
TF2_BEHAVIOR. This is a temporary function for switching to tf.keras.layers.
Currently, the TF v2 batchnorm layer is slower than the TF v1 batchnorm layer,
so this function is useful for tracking benchmark results for each version.
This function will be removed once tf.keras.layers becomes the default.
TODO(b/146939027): Remove this function when tf v2 batchnorm reaches training
speed parity with tf v1 batchnorm.
Args:
use_tf_keras_layers: whether to use tf.keras.layers.
"""
global layers
if use_tf_keras_layers:
layers = tf.keras.layers
else:
layers = tf_python_keras_layers
def _gen_l2_regularizer(use_l2_regularizer=True):
return regularizers.l2(FLAGS.weight_decay) if use_l2_regularizer else None
def identity_block(input_tensor,
kernel_size,
filters,
stage,
block,
use_l2_regularizer=True):
"""The identity block is the block that has no conv layer at shortcut.
Args:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
use_l2_regularizer: whether to use L2 regularizer on Conv layer.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2c')(
x)
x = layers.add([x, input_tensor])
x = layers.Activation('relu')(x)
return x
def conv_block(input_tensor,
kernel_size,
filters,
stage,
block,
strides=(2, 2),
use_l2_regularizer=True):
"""A block that has a conv layer at shortcut.
Note that from stage 3,
the second conv layer at the main path has strides=(2, 2),
and the shortcut has strides=(2, 2) as well.
Args:
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: 'a','b'..., current block label, used for generating layer names
strides: Strides for the second conv layer in the block.
use_l2_regularizer: whether to use L2 regularizer on Conv layer.
Returns:
Output tensor for the block.
"""
filters1, filters2, filters3 = filters
if backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(
filters1, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2a')(
input_tensor)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters2,
kernel_size,
strides=strides,
padding='same',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2b')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(
x)
x = layers.Activation('relu')(x)
x = layers.Conv2D(
filters3, (1, 1),
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '2c')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2c')(
x)
shortcut = layers.Conv2D(
filters3, (1, 1),
strides=strides,
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name=conv_name_base + '1')(
input_tensor)
shortcut = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '1')(
shortcut)
x = layers.add([x, shortcut])
x = layers.Activation('relu')(x)
return x
def resnet50(num_classes,
batch_size=None,
use_l2_regularizer=True,
rescale_inputs=False):
"""Instantiates the ResNet50 architecture.
Args:
num_classes: `int` number of classes for image classification.
batch_size: Size of the batches for each step.
use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
rescale_inputs: whether to rescale inputs from 0 to 1.
Returns:
A Keras model instance.
"""
input_shape = (224, 224, 3)
img_input = layers.Input(shape=input_shape)
if rescale_inputs:
# Hub image modules expect inputs in the range [0, 1]. This rescales these
# inputs to the range expected by the trained model.
x = layers.Lambda(
lambda x: x * 255.0 - backend.constant(
imagenet_preprocessing.CHANNEL_MEANS,
shape=[1, 1, 3],
dtype=x.dtype),
name='rescale')(
img_input)
else:
x = img_input
if backend.image_data_format() == 'channels_first':
x = layers.Lambda(
lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
name='transpose')(x)
bn_axis = 1
else: # channels_last
bn_axis = 3
x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
x = layers.Conv2D(
64, (7, 7),
strides=(2, 2),
padding='valid',
use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='conv1')(
x)
x = layers.BatchNormalization(
axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name='bn_conv1')(
x)
x = layers.Activation('relu')(x)
x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
x = conv_block(
x,
3, [64, 64, 256],
stage=2,
block='a',
strides=(1, 1),
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [64, 64, 256],
stage=2,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [64, 64, 256],
stage=2,
block='c',
use_l2_regularizer=use_l2_regularizer)
x = conv_block(
x,
3, [128, 128, 512],
stage=3,
block='a',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [128, 128, 512],
stage=3,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [128, 128, 512],
stage=3,
block='c',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [128, 128, 512],
stage=3,
block='d',
use_l2_regularizer=use_l2_regularizer)
x = conv_block(
x,
3, [256, 256, 1024],
stage=4,
block='a',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='c',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='d',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='e',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [256, 256, 1024],
stage=4,
block='f',
use_l2_regularizer=use_l2_regularizer)
x = conv_block(
x,
3, [512, 512, 2048],
stage=5,
block='a',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [512, 512, 2048],
stage=5,
block='b',
use_l2_regularizer=use_l2_regularizer)
x = identity_block(
x,
3, [512, 512, 2048],
stage=5,
block='c',
use_l2_regularizer=use_l2_regularizer)
rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
x = layers.Dense(
num_classes,
kernel_initializer=initializers.RandomNormal(stddev=0.01),
kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
name='fc1000')(
x)
# print('x.dtype: %s' % x.dtype_policy)
# 'kernel' is dense1's variable
#print('layers.Dense.kernel.dtype: %s' % layers.Dense.kernel.dtype.name)
# A softmax that is followed by the model's loss cannot be computed in
# float16 due to numeric issues, so we pass dtype=float32.
x = layers.Activation('softmax', dtype='float32')(x)
# Create model.
return models.Model(img_input, x, name='resnet50')
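if __name__ == "__main__":
  # Build-only sketch: passing use_l2_regularizer=False skips the FLAGS-backed
  # weight_decay regularizer, so no absl flags need to be parsed before the
  # graph is constructed.
  model = resnet50(num_classes=1000, use_l2_regularizer=False)
  model.summary()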
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
import tensorflow as tf
from tf2_common.training import standard_runnable
from tf2_common.training import utils
from tf2_common.utils.flags import core as flags_core
from tf2_common.utils.mlp_log import mlp_log
import common
import imagenet_preprocessing
import resnet_model
flags.DEFINE_boolean('trace_warmup', default=False,
help='Whether or not to programmatically capture an Xprof'
' trace in the warmup loop.')
class _UnwrapPreventer(object):
"""Wrapper that DistributionStrategy will not unwrap.
Typically, DistributionStrategy will unwrap values when going from a cross-
replica context to a replica context via `call_for_each_replica`. This class
is a wrapper that DistributionStrategy will not unwrap, so it can be used to
prevent it from unwrapping a value.
TODO(reedwm): Find/implement a better way of preventing values from being
unwrapped by DistributionStrategy
"""
__slots__ = ['value']
def __init__(self, value):
self.value = value
class ResnetRunnable(standard_runnable.StandardRunnableWithWarmup):
"""Implements the training and evaluation APIs for Resnet model."""
def __init__(self, flags_obj, time_callback):
standard_runnable.StandardRunnableWithWarmup.__init__(
self,
flags_obj.use_tf_while_loop,
flags_obj.use_tf_function)
self.strategy = tf.distribute.get_strategy()
self.flags_obj = flags_obj
self.dtype = flags_core.get_tf_dtype(flags_obj)
self.time_callback = time_callback
# Input pipeline related
batch_size = flags_obj.batch_size
if batch_size % self.strategy.num_replicas_in_sync != 0:
      raise ValueError(
          'Batch size {} must be divisible by the number of replicas: '
          '{}'.format(batch_size, self.strategy.num_replicas_in_sync))
steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj)
if train_epochs > 1:
train_epochs = flags_obj.train_epochs
    # Auto rebatching is not supported by the
    # `experimental_distribute_datasets_from_function()` API, which is
    # required when cloning the dataset to multiple workers in eager mode,
    # so we use the per-replica batch size here.
self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)
self.synthetic_input_fn = common.get_synth_input_fn(
height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
num_channels=imagenet_preprocessing.NUM_CHANNELS,
num_classes=self.flags_obj.num_classes,
dtype=self.dtype,
drop_remainder=True)
if self.flags_obj.use_synthetic_data:
self.input_fn = self.synthetic_input_fn
else:
self.input_fn = imagenet_preprocessing.input_fn
resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
self.model = resnet_model.resnet50(
num_classes=self.flags_obj.num_classes,
batch_size=flags_obj.batch_size,
use_l2_regularizer=not flags_obj.single_l2_loss_op)
self.use_lars_optimizer = False
self.num_accumulation_steps = self.flags_obj.num_accumulation_steps
if self.flags_obj.optimizer == 'LARS':
self.use_lars_optimizer = True
self.optimizer, _ = common.get_optimizer(
flags_obj=flags_obj,
steps_per_epoch=steps_per_epoch,
train_steps=steps_per_epoch * train_epochs)
# Make sure iterations variable is created inside scope.
self.global_step = self.optimizer.iterations
if self.dtype == tf.float16:
print("enter fp16 computing")
loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
self.optimizer = (
tf.keras.mixed_precision.LossScaleOptimizer(
self.optimizer, dynamic=False, initial_scale=loss_scale))
elif flags_obj.fp16_implementation == 'graph_rewrite':
      # `dtype` is still float32 in this case. We build the graph in float32
      # and let the graph rewrite change parts of it to float16.
if not flags_obj.use_tf_function:
raise ValueError('--fp16_implementation=graph_rewrite requires '
'--use_tf_function to be true')
loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
self.optimizer = (
tf.train.experimental.enable_mixed_precision_graph_rewrite(
self.optimizer, loss_scale))
self.one_hot = False
self.label_smoothing = flags_obj.label_smoothing
if self.label_smoothing and self.label_smoothing > 0:
self.one_hot = True
if flags_obj.report_accuracy_metrics:
self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
if self.one_hot:
self.train_accuracy = tf.keras.metrics.CategoricalAccuracy(
'train_accuracy', dtype=tf.float32)
else:
self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'train_accuracy', dtype=tf.float32)
self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
else:
self.train_loss = None
self.train_accuracy = None
self.test_loss = None
if self.one_hot:
self.test_accuracy = tf.keras.metrics.CategoricalAccuracy(
'test_accuracy', dtype=tf.float32)
else:
self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
'test_accuracy', dtype=tf.float32)
# self.test_corrects = tf.keras.metrics.Sum(
# 'test_corrects', dtype=tf.float32)
self.num_eval_steps = common.get_num_eval_steps(flags_obj)
self.checkpoint = tf.train.Checkpoint(
model=self.model, optimizer=self.optimizer)
# Handling epochs.
self.epoch_steps = steps_per_epoch
self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step)
self.steps_per_loop = flags_obj.steps_per_loop
profile_steps = flags_obj.profile_steps
if profile_steps:
profile_steps = [int(i) for i in profile_steps.split(',')]
self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None
self.trace_end_step = profile_steps[1]
else:
self.trace_start_step = None
self.trace_end_step = None
self.epochs_between_evals = flags_obj.epochs_between_evals
self.training_vars = self.model.trainable_variables
self.accum_grads = []
self.accum_grads_dtype = tf.float32
if self.num_accumulation_steps > 1:
for var in self.training_vars:
self.accum_grads.append(self.optimizer.add_weight(
name=var.name + '_accum',
shape=var.shape,
dtype=self.accum_grads_dtype,
initializer='zeros',
trainable=False,
synchronization=tf.VariableSynchronization.ON_READ,
aggregation=tf.VariableAggregation.SUM))
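      # Note: ON_READ synchronization keeps a separate accumulator on each
      # replica (a read in a cross-replica context would SUM them); the
      # per-replica copies are instead pulled out explicitly via
      # `experimental_local_results` in _maybe_apply_grads_and_clear below.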
def build_train_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=True,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=self.flags_obj.drop_train_remainder,
tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack,
dataset_cache=self.flags_obj.training_dataset_cache,
prefetch_batchs=self.flags_obj.training_prefetch_batchs)
def build_eval_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.strategy,
self.input_fn,
is_training=False,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=self.flags_obj.drop_eval_remainder,
tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack,
dataset_cache=self.flags_obj.eval_dataset_cache,
prefetch_batchs=self.flags_obj.eval_prefetch_batchs)
def build_synthetic_dataset(self):
"""See base class."""
return utils.make_distributed_dataset(
self.strategy,
self.synthetic_input_fn,
is_training=True,
data_dir=self.flags_obj.data_dir,
batch_size=self.batch_size,
datasets_num_private_threads=self.flags_obj
.datasets_num_private_threads,
dtype=self.dtype,
drop_remainder=self.flags_obj.drop_train_remainder,
tf_data_experimental_slack=self.flags_obj.tf_data_experimental_slack,
dataset_cache=self.flags_obj.training_dataset_cache,
prefetch_batchs=self.flags_obj.training_prefetch_batchs)
def train_loop_begin(self):
"""See base class."""
# Reset all metrics
if self.train_loss:
self.train_loss.reset_states()
if self.train_accuracy:
self.train_accuracy.reset_states()
self._epoch_begin()
if self.trace_start_step:
global_step = self.global_step.numpy()
next_global_step = global_step + self.steps_per_loop
if (global_step <= self.trace_start_step and
self.trace_start_step < next_global_step):
self.trace_start(global_step)
self.time_callback.on_batch_begin(self.epoch_helper.batch_index)
def train_step(self, iterator):
"""See base class."""
@tf.function(experimental_compile=False)
def local_step(images, labels):
"""Local computation of a step."""
with tf.GradientTape() as tape:
logits = self.model(images, training=True)
if self.one_hot:
prediction_loss = tf.keras.losses.categorical_crossentropy(
labels, logits, label_smoothing=self.label_smoothing)
else:
prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
labels, logits)
loss = tf.reduce_sum(prediction_loss) * (
1.0 / self.flags_obj.batch_size)
        # Save ~3 seconds per epoch on GPU by skipping the L2 loss
        # computation; skipping is only valid when using LARS, which applies
        # weight decay itself. Details in the description of cl/308018913.
if not self.use_lars_optimizer:
num_replicas = self.strategy.num_replicas_in_sync
if self.flags_obj.single_l2_loss_op:
l2_loss = self.flags_obj.weight_decay * 2 * tf.add_n([
tf.nn.l2_loss(v)
for v in self.model.trainable_variables
if 'bn' not in v.name
])
loss += (l2_loss / num_replicas)
else:
loss += (tf.reduce_sum(self.model.losses) / num_replicas)
# Scale the loss
if self.flags_obj.dtype == 'fp16':
loss = self.optimizer.get_scaled_loss(loss)
grads = tape.gradient(loss, self.model.trainable_variables)
# Unscale the grads
if self.flags_obj.dtype == 'fp16':
grads = self.optimizer.get_unscaled_gradients(grads)
return logits, loss, grads
def _maybe_apply_grads_and_clear(distribution):
def _apply_grads_and_clear_for_each_replica():
local_replica_id = tf.get_static_value(
self.strategy.extended._get_local_replica_id(
tf.distribute.get_replica_context().replica_id_in_sync_group))
replica_accum_grads = []
for accum_grad, var in zip(self.accum_grads, self.training_vars):
local_accum_grad = self.strategy.experimental_local_results(
accum_grad)
replica_accum_grad = local_accum_grad[local_replica_id]
replica_accum_grad = tf.cast(replica_accum_grad, var.dtype)
replica_accum_grads.append(replica_accum_grad)
self.optimizer.apply_gradients(
zip(replica_accum_grads, self.training_vars))
for accum_grad in self.accum_grads:
accum_grad.assign(tf.zeros_like(accum_grad,
dtype=self.accum_grads_dtype),
read_value=False)
def _apply_grads_and_clear():
distribution.extended.call_for_each_replica(
_apply_grads_and_clear_for_each_replica,
args=())
return self.optimizer.iterations.assign_add(0, read_value=False)
def _advance_iteration():
return self.optimizer.iterations.assign_add(1, read_value=False)
tf.cond(
tf.equal(self.optimizer.iterations % self.num_accumulation_steps,
self.num_accumulation_steps - 1),
_apply_grads_and_clear,
_advance_iteration)
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
logits, loss, grads = local_step(images, labels)
if self.num_accumulation_steps > 1:
for grad, accum_grad in zip(grads, self.accum_grads):
accum_grad.assign_add(tf.cast(grad, self.accum_grads_dtype),
read_value=False)
tf.distribute.get_replica_context().merge_call(
_maybe_apply_grads_and_clear,
args=())
else:
self.optimizer.apply_gradients(zip(grads, self.training_vars))
if self.train_loss:
self.train_loss.update_state(loss)
if self.train_accuracy:
self.train_accuracy.update_state(labels, logits)
self.strategy.run(step_fn, args=(next(iterator),))
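  # --- Illustrative sketch (not called by the training loop; the method name
  # and `step` argument are hypothetical). The distributed accumulation in
  # train_step boils down to this single-replica pattern: add each micro-batch
  # gradient into its slot and only apply every `num_accumulation_steps` steps.
  def _example_accumulate_and_apply(self, grads, step):
    """Hypothetical single-replica version of the accumulation pattern."""
    # Assumes num_accumulation_steps > 1, so self.accum_grads is populated.
    for grad, accum in zip(grads, self.accum_grads):
      accum.assign_add(tf.cast(grad, self.accum_grads_dtype))
    if (step + 1) % self.num_accumulation_steps == 0:
      casted = [tf.cast(accum, var.dtype)
                for accum, var in zip(self.accum_grads, self.training_vars)]
      self.optimizer.apply_gradients(zip(casted, self.training_vars))
      for accum in self.accum_grads:
        accum.assign(tf.zeros_like(accum, dtype=self.accum_grads_dtype))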
def train_loop_end(self):
"""See base class."""
metrics = {}
if self.train_loss:
metrics['train_loss'] = self.train_loss.result()
if self.train_accuracy:
metrics['train_accuracy'] = self.train_accuracy.result()
self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1)
if self.trace_end_step:
global_step = self.global_step.numpy()
next_global_step = global_step + self.steps_per_loop
if (global_step <= self.trace_end_step and
self.trace_end_step < next_global_step):
self.trace_end(global_step)
self._epoch_end()
return metrics
def eval_begin(self):
"""See base class."""
if self.test_loss:
self.test_loss.reset_states()
if self.test_accuracy:
self.test_accuracy.reset_states()
# self.test_corrects.reset_states()
epoch_num = int(self.epoch_helper.current_epoch)
mlp_log.mlperf_print('eval_start', None,
metadata={'epoch_num': epoch_num + 1})
def eval_step(self, iterator):
"""See base class."""
def step_fn(inputs):
"""Function to run on the device."""
images, labels = inputs
logits = self.model(images, training=False)
if self.test_loss:
if self.one_hot:
loss = tf.keras.losses.categorical_crossentropy(
labels, logits, label_smoothing=self.label_smoothing)
else:
loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size)
self.test_loss.update_state(loss)
if self.test_accuracy:
self.test_accuracy.update_state(labels, logits)
# tf.print('labels.shape: ', labels.shape,
# ', logits.shape: ', logits.shape,
# ', result: ', self.test_accuracy.result())
# self.test_corrects.update_state(
# tf.cast(
# tf.reduce_sum(
# tf.cast(
# tf.equal(
# tf.cast(tf.argmax(logits, axis=1), labels.dtype),
# labels), tf.int32)), tf.float32))
self.strategy.run(step_fn, args=(next(iterator),))
def eval_end(self):
"""See base class."""
epoch_num = int(self.epoch_helper.current_epoch)
mlp_log.mlperf_print('eval_stop', None,
metadata={'epoch_num': epoch_num + 1})
eval_accuracy = float(self.test_accuracy.result())
# eval_accuracy = float(self.test_corrects.result()
# ) / imagenet_preprocessing.NUM_IMAGES['validation']
# eval_accuracy = float(self.test_accuracy.result()) * \
# self.flags_obj.batch_size * self.num_eval_steps / \
# imagenet_preprocessing.NUM_IMAGES['validation']
mlp_log.mlperf_print(
'eval_accuracy', eval_accuracy, metadata={'epoch_num': epoch_num + 1})
first_epoch_num = max(epoch_num - self.epochs_between_evals + 1, 0)
epoch_count = self.epochs_between_evals
if first_epoch_num == 0:
epoch_count = self.flags_obj.eval_offset_epochs
if epoch_count == 0:
epoch_count = self.flags_obj.epochs_between_evals
mlp_log.mlperf_print(
'block_stop',
None,
metadata={
'first_epoch_num': first_epoch_num + 1,
'epoch_count': epoch_count
})
continue_training = True
if eval_accuracy >= self.flags_obj.target_accuracy:
continue_training = False
else:
mlp_log.mlperf_print(
'block_start',
None,
metadata={
'first_epoch_num': epoch_num + 2,
'epoch_count': self.epochs_between_evals
})
results = {}
if self.test_loss:
results['test_loss'] = self.test_loss.result()
if self.test_accuracy:
results['test_accuracy'] = self.test_accuracy.result()
results['continue_training'] = continue_training
return results
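  # Worked example for the block bookkeeping in eval_end above (assuming
  # evals land at 0-based epoch_num 1, 5, 9, ... with eval_offset_epochs=2
  # and epochs_between_evals=4): the first eval reports first_epoch_num=1
  # with epoch_count=eval_offset_epochs=2 (epochs 1-2), the second reports
  # first_epoch_num=3 with epoch_count=4 (epochs 3-6), and so on.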
def warmup_loop_begin(self):
"""See base class."""
if self.flags_obj.trace_warmup:
self.trace_start(-3)
logging.info('Entering the warmup loop.')
def warmup_loop_end(self):
"""See base class."""
if self.flags_obj.trace_warmup:
self.trace_end(-2)
# Reset the state
self.model.reset_states()
tf.keras.backend.set_value(self.optimizer.iterations, 0)
for accum_grad in self.accum_grads:
accum_grad.assign(tf.zeros_like(accum_grad,
dtype=self.accum_grads_dtype),
read_value=False)
logging.info('Exiting the warmup loop.')
def _epoch_begin(self):
if self.epoch_helper.epoch_begin():
self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch)
def _epoch_end(self):
# mlp_log.mlperf_print('epoch_stop', None)
if self.epoch_helper.epoch_end():
self.time_callback.on_epoch_end(self.epoch_helper.current_epoch)
def trace_start(self, global_step):
logging.info('Starting tracing at step %d.', global_step)
tf.profiler.experimental.start(self.flags_obj.model_dir)
def trace_end(self, global_step):
logging.info('Ending trace at step %d', global_step)
tf.profiler.experimental.stop()
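# Example launch command (as committed). Roughly: XLA_FLAGS dumps HLO before
# and after every pass as HTML under ./tmp (on this ROCm/DTK build the
# cuda_data_dir flag appears to point at the amdgcn bitcode directory),
# TF_DUMP_GRAPH_PREFIX dumps TensorFlow graphs to ./tf_graph, and
# `hipprof --hip-trace` wraps the run to collect a HIP API trace.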
XLA_FLAGS="--xla_gpu_cuda_data_dir=/public/software/compiler/rocm/dtk-21.10.1/amdgcn/bitcode/ --xla_dump_hlo_pass_re=.* --xla_dump_hlo_as_html --xla_dump_to=./tmp" TF_DUMP_GRAPH_PREFIX="./tf_graph" hipprof --hip-trace python3 ./resnet_ctl_imagenet_main.py \
--base_learning_rate=10.0 \
--batch_size=32 \
--nocache_decoded_image \
--data_dir=/public/software/apps/DeepLearning/Data/ImageNet-tensorflow \
--device_warmup_steps=1 \
--dtype=fp32 \
--noenable_checkpoint_and_export \
--noenable_device_warmup \
--enable_eager \
--epochs_between_evals=4 \
--noeval_dataset_cache \
--eval_offset_epochs=2 \
--label_smoothing=0.1 \
--lars_epsilon=0 \
--log_steps=125 \
--lr_schedule=polynomial \
--optimizer=LARS \
--noreport_accuracy_metrics \
--single_l2_loss_op \
--steps_per_loop=25 \
--train_epochs=1 \
--notraining_dataset_cache \
--notrace_warmup \
--nouse_synthetic_data \
--use_tf_function \
--verbosity=0 \
--warmup_epochs=5 \
--weight_decay=0.0002 \
--target_accuracy=0.759 \
--momentum=0.9 \
--num_replicas=64 \
--num_accumulation_steps=2 \
--num_classes=1000 \
--noskip_eval
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to training performance."""
import tensorflow as tf
def configure_optimizer(optimizer,
use_float16=False,
use_graph_rewrite=False,
loss_scale="dynamic"):
"""Configures optimizer object with performance options."""
if use_float16:
# Wraps optimizer with a LossScaleOptimizer. This is done automatically
# in compile() with the "mixed_float16" policy, but since we do not call
# compile(), we must wrap the optimizer manually.
optimizer = (
tf.keras.mixed_precision.experimental.LossScaleOptimizer(
optimizer, loss_scale=loss_scale))
if use_graph_rewrite:
    # Note: the model dtype must be 'float32', which ensures
    # tf.keras.mixed_precision and
    # tf.train.experimental.enable_mixed_precision_graph_rewrite do not
    # double up.
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
optimizer)
return optimizer
def set_mixed_precision_policy(dtype, loss_scale=None):
"""Sets mix precision policy."""
if dtype == tf.float16:
print("enter the tf.float16 set policy")
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_float16', loss_scale=loss_scale)
tf.keras.mixed_precision.experimental.set_policy(policy)
print('Compute dtype: %s' % policy.compute_dtype)
print('Variable dtype: %s' % policy.variable_dtype)
# tf.keras.mixed_precision.experimental.set_policy('float16')
elif dtype == tf.bfloat16:
policy = tf.keras.mixed_precision.experimental.Policy(
'mixed_bfloat16')
tf.keras.mixed_precision.experimental.set_policy(policy)
elif dtype == tf.float32:
tf.keras.mixed_precision.experimental.set_policy('float32')
else:
raise ValueError("Unexpected dtype: %s" % dtype)