version 1

05631eec · liangjing · 7e0391d9 · 05631eec · 05631eec · 05631eec
Commit 05631eec authored Apr 10, 2023 by liangjing
20 changed files
--- a/common.py
+++ b/common.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common util functions and classes used by both keras cifar and imagenet."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import os
+import sys
+
+from absl import flags
+from absl import logging
+import numpy as np
+import tensorflow as tf
+
+from tf2_common.utils.flags import core as flags_core
+from tf2_common.utils.misc import keras_utils
+from tf2_common.utils.mlp_log import mlp_log
+import imagenet_preprocessing
+import lars_optimizer
+import lars_util
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+
+FLAGS = flags.FLAGS
+# BASE_LEARNING_RATE = 0.1  # This matches Jing's version.
+# Key under which build_stats() stores the final top-1 training accuracy.
+TRAIN_TOP_1 = 'training_accuracy_top_1'
+# The first entry also describes warmup: its epoch is where the linear warmup
+# ends. The remaining entries are the decay boundaries.
+LR_SCHEDULE = [    # (multiplier, epoch to start) tuples
+    (1.0, 5), (0.1, 30), (0.01, 60), (0.001, 80)
+]
+
+
+def learning_rate_schedule(current_epoch,
+                           current_batch,
+                           steps_per_epoch,
+                           batch_size):
+  """Computes the learning rate for a position in training.
+
+  The base rate (FLAGS.base_learning_rate) is scaled linearly by
+  batch_size / 256. Before the epoch given by LR_SCHEDULE[0] the rate ramps up
+  linearly from zero; afterwards it is the scaled rate times the multiplier of
+  the last LR_SCHEDULE entry whose start epoch has been reached.
+
+  Args:
+    current_epoch: integer, current epoch indexed from 0.
+    current_batch: integer, current batch in the current epoch, indexed from 0.
+    steps_per_epoch: integer, number of steps in an epoch.
+    batch_size: integer, total batch size.
+
+  Returns:
+    The learning rate to use for this batch.
+  """
+  scaled_lr = FLAGS.base_learning_rate * batch_size / 256
+  # Fractional epoch, so the rate can change between batches.
+  fractional_epoch = current_epoch + float(current_batch) / steps_per_epoch
+  warmup_multiplier, warmup_epochs = LR_SCHEDULE[0]
+  if fractional_epoch < warmup_epochs:
+    # Linear ramp, evaluated per step.
+    return scaled_lr * warmup_multiplier * fractional_epoch / warmup_epochs
+  # Keep the multiplier of the last boundary that has already been passed.
+  for multiplier, boundary_epoch in LR_SCHEDULE:
+    if fractional_epoch >= boundary_epoch:
+      decayed_lr = scaled_lr * multiplier
+      continue
+    break
+  return decayed_lr
+
+
+class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
+  """Keras callback that sets the optimizer learning rate before each batch.
+
+  N.B. Only support Keras optimizers, not TF optimizers.
+
+  Attributes:
+      schedule: a function called as
+          schedule(epoch, batch, steps_per_epoch, batch_size) that returns the
+          learning rate (float) to use for that batch.
+      steps_per_epoch: forwarded to `schedule` on every call.
+      batch_size: forwarded to `schedule` on every call.
+      epochs: number of epochs begun so far minus one; -1 before training.
+      prev_lr: last learning rate written to the optimizer; -1 initially.
+  """
+
+  def __init__(self, schedule, batch_size, steps_per_epoch):
+    super(LearningRateBatchScheduler, self).__init__()
+    self.schedule = schedule
+    self.batch_size = batch_size
+    self.steps_per_epoch = steps_per_epoch
+    self.prev_lr = -1
+    self.epochs = -1
+
+  def on_epoch_begin(self, epoch, logs=None):
+    """Checks the optimizer and advances the internal epoch counter."""
+    optimizer = self.model.optimizer
+    if not hasattr(optimizer, 'learning_rate'):
+      raise ValueError('Optimizer must have a "learning_rate" attribute.')
+    # The counter is kept locally; the `epoch` argument is not used.
+    self.epochs += 1
+
+  def on_batch_begin(self, batch, logs=None):
+    """Evaluates the schedule and updates the optimizer if the rate changed."""
+    new_lr = self.schedule(
+        self.epochs, batch, self.steps_per_epoch, self.batch_size)
+    if not isinstance(new_lr, (float, np.float32, np.float64)):
+      raise ValueError('The output of the "schedule" function should be float.')
+    if new_lr == self.prev_lr:
+      # Nothing to do; skip the assignment and the log line.
+      return
+    self.model.optimizer.learning_rate = new_lr  # new_lr is a float here
+    self.prev_lr = new_lr
+    tf.compat.v1.logging.debug(
+        'Epoch %05d Batch %05d: LearningRateBatchScheduler '
+        'change learning rate to %s.', self.epochs, batch, new_lr)
+
+
+class PiecewiseConstantDecayWithWarmup(
+    tf.keras.optimizers.schedules.LearningRateSchedule):
+  """Piecewise constant decay with warmup schedule.
+
+  The learning rate ramps up linearly from zero to the rescaled base rate over
+  `warmup_steps`, then follows a piecewise-constant schedule defined by
+  `step_boundaries` and `lr_values`.
+  """
+
+  def __init__(self, batch_size, steps_per_epoch, warmup_epochs, boundaries,
+               multipliers, compute_lr_on_cpu=True, name=None):
+    """Precomputes the step boundaries and learning rate values.
+
+    Args:
+      batch_size: batch size used to rescale FLAGS.base_learning_rate
+        (relative to a reference batch size of 256).
+      steps_per_epoch: number of steps in an epoch; converts epochs to steps.
+      warmup_epochs: number of epochs of linear warmup.
+      boundaries: epochs at which the learning rate changes.
+      multipliers: factors applied to the rescaled learning rate, one per
+        interval; must have exactly one more element than `boundaries`.
+      compute_lr_on_cpu: if True, the learning rate ops built in graph mode
+        are placed on '/device:CPU:0'.
+      name: optional name scope for the learning rate ops.
+
+    Raises:
+      ValueError: if len(boundaries) != len(multipliers) - 1.
+    """
+    super(PiecewiseConstantDecayWithWarmup, self).__init__()
+    if len(boundaries) != len(multipliers) - 1:
+      raise ValueError('The length of boundaries must be 1 less than the '
+                       'length of multipliers')
+
+    base_lr_batch_size = 256
+    self.steps_per_epoch = steps_per_epoch
+
+    self.rescaled_lr = FLAGS.base_learning_rate * batch_size / base_lr_batch_size
+    self.step_boundaries = [float(self.steps_per_epoch) * x for x in boundaries]
+    self.lr_values = [self.rescaled_lr * m for m in multipliers]
+    self.warmup_steps = warmup_epochs * self.steps_per_epoch
+    self.compute_lr_on_cpu = compute_lr_on_cpu
+    self.name = name
+
+    # Maps a graph to the learning rate tensor already built in that graph.
+    self.learning_rate_ops_cache = {}
+
+  def __call__(self, step):
+    """Returns the learning rate for `step`, reusing cached ops per graph."""
+    if tf.executing_eagerly():
+      return self._get_learning_rate(step)
+
+    # In an eager function or graph, the current implementation of optimizer
+    # repeatedly call and thus create ops for the learning rate schedule. To
+    # avoid this, we cache the ops if not executing eagerly.
+    graph = tf.compat.v1.get_default_graph()
+    if graph not in self.learning_rate_ops_cache:
+      if self.compute_lr_on_cpu:
+        with tf.device('/device:CPU:0'):
+          self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
+      else:
+        self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
+    return self.learning_rate_ops_cache[graph]
+
+  def _get_learning_rate(self, step):
+    """Compute learning rate at given step."""
+    with tf.compat.v1.name_scope(self.name, 'PiecewiseConstantDecayWithWarmup',
+                                 [self.rescaled_lr, self.step_boundaries,
+                                  self.lr_values, self.warmup_steps,
+                                  self.compute_lr_on_cpu]):
+      def warmup_lr(step):
+        # Linear ramp from 0 to rescaled_lr over warmup_steps.
+        return self.rescaled_lr * (
+            tf.cast(step, tf.float32) / tf.cast(self.warmup_steps, tf.float32))
+      def piecewise_lr(step):
+        return tf.compat.v1.train.piecewise_constant(step, self.step_boundaries,
+                                                     self.lr_values)
+
+      lr = tf.cond(step < self.warmup_steps, lambda: warmup_lr(step),
+                   lambda: piecewise_lr(step))
+      return lr
+
+  def get_config(self):
+    """Returns the derived schedule values as a dict.
+
+    NOTE(review): the keys are the derived attributes, not the `__init__`
+    parameter names, so this dict presumably cannot be passed back to the
+    constructor as keyword arguments -- confirm before relying on
+    `from_config`.
+    """
+    return {
+        'rescaled_lr': self.rescaled_lr,
+        'step_boundaries': self.step_boundaries,
+        'lr_values': self.lr_values,
+        'warmup_steps': self.warmup_steps,
+        'compute_lr_on_cpu': self.compute_lr_on_cpu,
+        'name': self.name
+    }
+
+
+def get_optimizer(flags_obj,
+                  steps_per_epoch,
+                  train_steps):
+  """Builds the optimizer selected by the flags.
+
+  Args:
+    flags_obj: flags object. Reads `model` (when such a flag is defined),
+      `lr_schedule`, `optimizer`, `batch_size`, `momentum` and the
+      hyper-parameter flags specific to the chosen schedule and optimizer.
+    steps_per_epoch: integer, number of training steps in one epoch.
+    train_steps: forwarded to lars_util.PolynomialDecayWithWarmup; unused by
+      the other schedules.
+
+  Returns:
+    A tuple `(optimizer, learning_rate_schedule_fn)`. On the ResNet path
+    `learning_rate_schedule_fn` is `learning_rate_schedule`; on the mobilenet
+    path it is None. Both are None when the model name is not recognized.
+
+  Raises:
+    ValueError: if `lr_schedule` or `optimizer` holds an unsupported value on
+      the ResNet path.
+  """
+  optimizer = None
+  learning_rate_schedule_fn = None
+
+  # The ResNet path is also the default when no `model` flag was defined.
+  if (get_flag_module(flags_obj, 'model') is None or
+      flags_obj.model == 'resnet50_v1.5'):
+    if flags_obj.lr_schedule == 'polynomial':
+      lr_schedule = lars_util.PolynomialDecayWithWarmup(
+          batch_size=flags_obj.batch_size,
+          steps_per_epoch=steps_per_epoch,
+          train_steps=train_steps,
+          initial_learning_rate=flags_obj.base_learning_rate,
+          end_learning_rate=flags_obj.end_learning_rate,
+          warmup_epochs=flags_obj.warmup_epochs)
+    elif flags_obj.lr_schedule == 'piecewise':
+      # LR_SCHEDULE[0] describes warmup; the remaining entries are boundaries.
+      lr_schedule = PiecewiseConstantDecayWithWarmup(
+          batch_size=flags_obj.batch_size,
+          steps_per_epoch=steps_per_epoch,
+          warmup_epochs=LR_SCHEDULE[0][1],
+          boundaries=[p[1] for p in LR_SCHEDULE[1:]],
+          multipliers=[p[0] for p in LR_SCHEDULE],
+          compute_lr_on_cpu=True)
+    elif flags_obj.lr_schedule == 'constant':
+      lr_schedule = flags_obj.base_learning_rate * flags_obj.batch_size / 256
+    else:
+      raise ValueError('lr_schedule "%s" is unknown.' % flags_obj.lr_schedule)
+
+    if flags_obj.optimizer == 'SGD':
+      # With lr_schedule == 'constant', get_callbacks() may install a
+      # LearningRateBatchScheduler that overwrites this rate on every batch.
+      optimizer = gradient_descent_v2.SGD(
+          learning_rate=lr_schedule, momentum=flags_obj.momentum)
+    elif flags_obj.optimizer == 'LARS':
+      optimizer = lars_optimizer.LARSOptimizer(
+          learning_rate=lr_schedule,
+          momentum=flags_obj.momentum,
+          weight_decay=flags_obj.weight_decay,
+          skip_list=['batch_normalization', 'bias', 'bn'],
+          epsilon=flags_obj.lars_epsilon)
+    else:
+      # Fail here instead of handing a None optimizer to the caller.
+      raise ValueError('optimizer "%s" is unknown.' % flags_obj.optimizer)
+
+    learning_rate_schedule_fn = learning_rate_schedule
+
+  elif flags_obj.model == 'mobilenet':
+    initial_learning_rate = (
+        flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size)
+    optimizer = tf.keras.optimizers.SGD(
+        learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
+            initial_learning_rate,
+            decay_steps=steps_per_epoch * flags_obj.num_epochs_per_decay,
+            decay_rate=flags_obj.lr_decay_factor,
+            staircase=True),
+        momentum=flags_obj.momentum)
+
+  return optimizer, learning_rate_schedule_fn
+
+
+# TODO(hongkuny,haoyuzhang): make cifar model use_tensor_lr to clean up code.
+def get_callbacks(
+    steps_per_epoch,
+    learning_rate_schedule_fn=None,
+    pruning_method=None,
+    enable_checkpoint_and_export=False,
+    model_dir=None):
+  """Returns common callbacks.
+
+  A keras_utils.TimeHistory callback is always included; the others depend on
+  the arguments and on FLAGS.
+
+  Args:
+    steps_per_epoch: number of steps in an epoch; passed to the learning rate
+      and profiler callbacks.
+    learning_rate_schedule_fn: schedule function for
+      LearningRateBatchScheduler. The callback is only added when this is
+      truthy and FLAGS.lr_schedule == 'constant'.
+    pruning_method: any value other than None enables the pruning callbacks.
+    enable_checkpoint_and_export: if True and `model_dir` is not None, adds a
+      weights-only ModelCheckpoint callback.
+    model_dir: directory for pruning summaries and checkpoints. The
+      TimeHistory, TensorBoard and profiler callbacks use FLAGS.model_dir
+      instead.
+
+  Returns:
+    A list of Keras callbacks.
+  """
+  time_callback = keras_utils.TimeHistory(
+      FLAGS.batch_size,
+      FLAGS.log_steps,
+      logdir=FLAGS.model_dir if FLAGS.enable_tensorboard else None)
+  callbacks = [time_callback]
+
+  if FLAGS.lr_schedule == 'constant' and learning_rate_schedule_fn:
+    lr_callback = LearningRateBatchScheduler(
+        learning_rate_schedule_fn,
+        batch_size=FLAGS.batch_size,
+        steps_per_epoch=steps_per_epoch)
+    callbacks.append(lr_callback)
+
+  if FLAGS.enable_tensorboard:
+    tensorboard_callback = tf.keras.callbacks.TensorBoard(
+        log_dir=FLAGS.model_dir)
+    callbacks.append(tensorboard_callback)
+
+  if FLAGS.profile_steps:
+    profiler_callback = keras_utils.get_profiler_callback(
+        FLAGS.model_dir,
+        FLAGS.profile_steps,
+        FLAGS.enable_tensorboard,
+        steps_per_epoch)
+    callbacks.append(profiler_callback)
+
+  is_pruning_enabled = pruning_method is not None
+  if is_pruning_enabled:
+    # NOTE(review): `tfmot` is not among the imports visible at the top of
+    # this file, so this branch looks like it raises NameError -- confirm and
+    # add the tensorflow_model_optimization import if pruning is supported.
+    callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
+    if model_dir is not None:
+      callbacks.append(tfmot.sparsity.keras.PruningSummaries(
+          log_dir=model_dir, profile_batch=0))
+
+  if enable_checkpoint_and_export:
+    if model_dir is not None:
+      # '{epoch:04d}' is left in the path for ModelCheckpoint to fill in.
+      ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
+      callbacks.append(
+          tf.keras.callbacks.ModelCheckpoint(ckpt_full_path,
+                                             save_weights_only=True))
+  return callbacks
+
+
+def build_stats(history, eval_output, callbacks):
+  """Collects training, evaluation and timing results into one dictionary.
+
+  Args:
+    history: Results of the training step. Supports both categorical_accuracy
+      and sparse_categorical_accuracy.
+    eval_output: Output of the eval step. Assumes first value is eval_loss and
+      second value is accuracy_top_1.
+    callbacks: a list of callbacks which might include a time history callback
+      used during keras.fit.
+
+  Returns:
+    Dictionary of normalized results.
+  """
+  stats = {}
+
+  # Evaluation results.
+  if eval_output:
+    stats['accuracy_top_1'] = float(eval_output[1])
+    if FLAGS.report_accuracy_metrics:
+      stats['eval_loss'] = float(eval_output[0])
+
+  # Training results: final loss and whichever accuracy metric was recorded.
+  if history and history.history and FLAGS.report_accuracy_metrics:
+    train_hist = history.history
+    stats['loss'] = float(train_hist['loss'][-1])
+    for metric_name in ('categorical_accuracy', 'sparse_categorical_accuracy'):
+      if metric_name in train_hist:
+        stats[TRAIN_TOP_1] = float(train_hist[metric_name][-1])
+        break
+
+  # Timing results from any TimeHistory callback used during keras.fit.
+  for cb in callbacks or []:
+    if not isinstance(cb, keras_utils.TimeHistory):
+      continue
+    stats['step_timestamp_log'] = cb.timestamp_log
+    stats['train_finish_time'] = cb.train_finish_time
+    if cb.epoch_runtime_log:
+      stats['avg_exp_per_second'] = cb.average_examples_per_second
+
+  return stats
+
+
+def define_keras_flags(
+    dynamic_loss_scale=True,
+    model=False,
+    optimizer=False,
+    pretrained_filepath=False):
+  """Define flags for Keras models.
+
+  Registers the shared flags_core flag groups, the Keras training flags listed
+  below and the LARS flags from lars_util.
+
+  Args:
+    dynamic_loss_scale: forwarded to flags_core.define_performance.
+    model: if truthy, also defines the `model` flag.
+    optimizer: if truthy, also defines the mobilenet SGD hyper-parameter flags
+      (`initial_learning_rate_per_sample`, `lr_decay_factor`,
+      `num_epochs_per_decay`). The `optimizer` flag itself is always defined.
+    pretrained_filepath: if truthy, also defines the `pretrained_filepath`
+      flag.
+  """
+  flags_core.define_base(clean=True, num_gpu=True, run_eagerly=True,
+                         train_epochs=True, epochs_between_evals=True,
+                         distribution_strategy=True)
+  flags_core.define_performance(num_parallel_calls=False,
+                                synthetic_data=True,
+                                dtype=True,
+                                all_reduce_alg=True,
+                                num_packs=True,
+                                tf_gpu_thread_mode=True,
+                                datasets_num_private_threads=True,
+                                dynamic_loss_scale=dynamic_loss_scale,
+                                loss_scale=True,
+                                fp16_implementation=True,
+                                tf_data_experimental_slack=True,
+                                enable_xla=True,
+                                force_v2_in_keras_compile=True,
+                                training_dataset_cache=True,
+                                training_prefetch_batchs=True,
+                                eval_dataset_cache=True,
+                                eval_prefetch_batchs=True)
+  flags_core.define_image()
+  flags_core.define_benchmark()
+  flags_core.define_distribution()
+  flags.adopt_module_key_flags(flags_core)
+
+  flags.DEFINE_boolean(name='enable_eager', default=False, help='Enable eager?')
+  flags.DEFINE_boolean(name='skip_eval', default=False, help='Skip evaluation?')
+  # TODO(b/135607288): Remove this flag once we understand the root cause of
+  # slowdown when setting the learning phase in Keras backend.
+  flags.DEFINE_boolean(
+      name='set_learning_phase_to_train', default=True,
+      help='If skip eval, also set Keras learning phase to 1 (training).')
+  flags.DEFINE_boolean(
+      name='explicit_gpu_placement', default=False,
+      help='If not using distribution strategy, explicitly set device scope '
+      'for the Keras training loop.')
+  flags.DEFINE_boolean(name='use_trivial_model', default=False,
+                       help='Whether to use a trivial Keras model.')
+  flags.DEFINE_boolean(name='report_accuracy_metrics', default=True,
+                       help='Report metrics during training and evaluation.')
+  flags.DEFINE_string(
+      name='lr_schedule', default='piecewise',
+      help='learning rate schedule. '
+      '"piecewise" for PiecewiseConstantDecayWithWarmup, '
+      '"polynomial" for PolynomialDecayWithWarmup, '
+      'and "constant" for static learning rate.')
+  flags.DEFINE_boolean(
+      name='enable_tensorboard', default=False,
+      help='Whether to enable Tensorboard callback.')
+  flags.DEFINE_integer(
+      name='train_steps', default=None,
+      help='The number of steps to run for training. If it is larger than '
+      '# batches per epoch, then use # batches per epoch. This flag will be '
+      'ignored if train_epochs is set to be larger than 1. ')
+  flags.DEFINE_string(
+      name='profile_steps', default=None,
+      help='Save profiling data to model dir at given range of global steps. The '
+      'value must be a comma separated pair of positive integers, specifying '
+      'the first and last step to profile. For example, "--profile_steps=2,4" '
+      'triggers the profiler to process 3 steps, starting from the 2nd step. '
+      'Note that profiler has a non-trivial performance overhead, and the '
+      'output file can be gigantic if profiling many steps.')
+  flags.DEFINE_boolean(
+      name='batchnorm_spatial_persistent', default=True,
+      help='Enable the spacial persistent mode for CuDNN batch norm kernel.')
+  flags.DEFINE_boolean(
+      name='enable_get_next_as_optional', default=False,
+      help='Enable get_next_as_optional behavior in DistributedIterator.')
+  flags.DEFINE_boolean(
+      name='enable_checkpoint_and_export', default=False,
+      help='Whether to enable a checkpoint callback and export the savedmodel.')
+  flags.DEFINE_string(
+      name='tpu', default='', help='TPU address to connect to.')
+  flags.DEFINE_string(
+      name='tpu_zone', default='', help='Zone in which the TPU resides.')
+  flags.DEFINE_integer(
+      name='steps_per_loop',
+      default=500,
+      help='Number of steps per training loop. Only training step happens '
+      'inside the loop. Callbacks will not be called inside. Will be capped at '
+      'steps per epoch.')
+  flags.DEFINE_boolean(
+      name='use_tf_while_loop',
+      default=True,
+      help='Whether to build a tf.while_loop inside the training loop on the '
+      'host. Setting it to True is critical to have peak performance on '
+      'TPU.')
+  flags.DEFINE_boolean(
+      name='use_tf_keras_layers', default=False,
+      help='Whether to use tf.keras.layers instead of tf.python.keras.layers.'
+      'It only changes imagenet resnet model layers for now. This flag is '
+      'a temporal flag during transition to tf.keras.layers. Do not use this '
+      'flag for external usage. this will be removed shortly.')
+  flags.DEFINE_float(
+      'base_learning_rate', 0.1,
+      'Base learning rate. '
+      'This is the learning rate when using batch size 256; when using other '
+      'batch sizes, the learning rate will be scaled linearly.')
+  flags.DEFINE_string(
+      'optimizer', 'SGD',
+      'Name of optimizer preset. (SGD, LARS)')
+  flags.DEFINE_boolean(
+      'drop_train_remainder', True,
+      'Whether to drop remainder in the training dataset.')
+  flags.DEFINE_boolean(
+      'drop_eval_remainder', False,
+      'Whether to drop remainder in the eval dataset.')
+  flags.DEFINE_float(
+      'label_smoothing', 0.0,
+      'Apply label smoothing to the loss. This applies to '
+      'categorical_cross_entropy; when label_smoothing > 0, '
+      'one-hot encoding is used for the labels.')
+  flags.DEFINE_integer(
+      'num_classes', 1000,
+      'Number of classes for labels, at least 2.')
+  flags.DEFINE_integer(
+      'eval_offset_epochs', 0,
+      'Epoch number of the first evaluation.')
+
+  lars_util.define_lars_flags()
+
+  # The remaining flags are opt-in so that callers only define what they use.
+  if model:
+    flags.DEFINE_string('model', 'resnet50_v1.5',
+                        'Name of model preset. (mobilenet, resnet50_v1.5)')
+  if optimizer:
+    # TODO(kimjaehong): Replace as general hyper-params not only for mobilenet.
+    flags.DEFINE_float('initial_learning_rate_per_sample', 0.00007,
+                       'Initial value of learning rate per sample for '
+                       'SGD optimizer when using mobilenet.')
+    flags.DEFINE_float('lr_decay_factor', 0.94,
+                       'Learning rate decay factor for SGD optimizer '
+                       'when using mobilenet.')
+    flags.DEFINE_float('num_epochs_per_decay', 2.5,
+                       'Number of epochs per decay for SGD optimizer '
+                       'when using mobilenet.')
+  if pretrained_filepath:
+    flags.DEFINE_string('pretrained_filepath', '',
+                        'Pretrained file path.')
+  flags.DEFINE_float('target_accuracy', 0.759,
+                     'Target eval accuracy, after which training will stop.')
+
+
+def get_synth_data(height, width, num_channels, num_classes, dtype):
+  """Creates a set of synthetic random data.
+
+  Args:
+    height: Integer height that will be used to create a fake image tensor.
+    width: Integer width that will be used to create a fake image tensor.
+    num_channels: Integer depth that will be used to create a fake image tensor.
+    num_classes: Number of classes that should be represented in the fake labels
+      tensor
+    dtype: Data type for features/images.
+
+  Returns:
+    A tuple `(inputs, labels)`: `inputs` is a random tensor of shape
+    [height, width, num_channels] and type `dtype`; `labels` is an int32
+    tensor of shape [1] drawn uniformly from [0, num_classes).
+
+  """
+  # Synthetic input should be within [0, 255].
+  inputs = tf.random.truncated_normal([height, width, num_channels],
+                                      dtype=dtype,
+                                      mean=127,
+                                      stddev=60,
+                                      name='synthetic_inputs')
+  # `maxval` is exclusive for integer dtypes, so pass num_classes itself;
+  # num_classes - 1 would never produce the last class id.
+  labels = tf.random.uniform([1],
+                             minval=0,
+                             maxval=num_classes,
+                             dtype=tf.int32,
+                             name='synthetic_labels')
+  return inputs, labels
+
+
+def define_pruning_flags():
+  """Define flags for pruning methods.
+
+  Registers `pruning_method` plus the sparsity, step-range and frequency
+  flags that configure it.
+  """
+  # The two help fragments are concatenated, so the first must end in a space.
+  flags.DEFINE_string('pruning_method', None,
+                      'Pruning method. '
+                      'None (no pruning) or polynomial_decay.')
+  flags.DEFINE_float('pruning_initial_sparsity', 0.0,
+                     'Initial sparsity for pruning.')
+  flags.DEFINE_float('pruning_final_sparsity', 0.5,
+                     'Final sparsity for pruning.')
+  flags.DEFINE_integer('pruning_begin_step', 0,
+                       'Begin step for pruning.')
+  flags.DEFINE_integer('pruning_end_step', 100000,
+                       'End step for pruning.')
+  flags.DEFINE_integer('pruning_frequency', 100,
+                       'Frequency for pruning.')
+
+
+def get_synth_input_fn(height, width, num_channels, num_classes,
+                       dtype=tf.float32, drop_remainder=True):
+  """Returns an input function that returns a dataset with random data.
+
+  This input_fn returns a data set that iterates over a set of random data and
+  bypasses all preprocessing, e.g. jpeg decode and copy. The host to device
+  copy is still included. This is used to find the upper throughput bound when
+  tuning the full input pipeline.
+
+  Args:
+    height: Integer height that will be used to create a fake image tensor.
+    width: Integer width that will be used to create a fake image tensor.
+    num_channels: Integer depth that will be used to create a fake image tensor.
+    num_classes: Number of classes that should be represented in the fake labels
+      tensor
+    dtype: Data type for features/images.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+
+  Returns:
+    An input_fn that can be used in place of a real one to return a dataset
+    that can be used for iteration.
+  """
+  # pylint: disable=unused-argument
+  def input_fn(is_training, data_dir, batch_size, *args, **kwargs):
+    """Returns dataset filled with random data."""
+    inputs, labels = get_synth_data(height=height,
+                                    width=width,
+                                    num_channels=num_channels,
+                                    num_classes=num_classes,
+                                    dtype=dtype)
+
+    # Label smoothing needs one-hot labels; otherwise keep the class index.
+    if FLAGS.label_smoothing and FLAGS.label_smoothing > 0:
+      labels = tf.reshape(tf.one_hot(labels, num_classes), [num_classes])
+
+    # Labels are float32 in both cases, so one cast covers both branches.
+    labels = tf.cast(labels, dtype=tf.float32)
+    # A single (inputs, labels) example repeated indefinitely.
+    data = tf.data.Dataset.from_tensors((inputs, labels)).repeat()
+
+    # `drop_remainder` will make dataset produce outputs with known shapes.
+    data = data.batch(batch_size, drop_remainder=drop_remainder)
+    data = data.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
+    return data
+
+  return input_fn
+
+
+def set_cudnn_batchnorm_mode():
+  """Toggles the CuDNN spatial-persistent batchnorm mode via the environment.
+
+     When FLAGS.batchnorm_spatial_persistent is set, the environment variable
+     TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT is set to '1'; otherwise the
+     variable is removed if present.
+
+     Note: Spatial Persistent mode may lead to accuracy losses for certain
+     models.
+  """
+  env_var = 'TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'
+  if not FLAGS.batchnorm_spatial_persistent:
+    # Removing a variable that is not set is a no-op.
+    os.environ.pop(env_var, None)
+    return
+  os.environ[env_var] = '1'
+
+
+def print_flags(flags_obj):
+  """Logs every flag and its value, grouped by defining module.
+
+  Modules are listed in sorted order, except that the main script
+  (sys.argv[0]) is moved to the front when it defines flags.
+
+  Args:
+    flags_obj: flags object providing `flags_by_module_dict()`.
+  """
+  flags_by_module = flags_obj.flags_by_module_dict()
+  modules = sorted(flags_by_module)
+  main_module = sys.argv[0]
+  if main_module in modules:
+    modules.remove(main_module)
+    modules.insert(0, main_module)
+
+  for module in modules:
+    logging.info('Module %s:', module)
+    for flag in flags_by_module[module]:
+      logging.info('\t flags_obj.%s = %s', flag.name, flag.value)
+
+
+def get_flag_module(flags_obj, flag):
+  """Returns the module that defines `flag`, or None if no module does.
+
+  Modules are searched in sorted name order and the first match wins.
+  """
+  by_module = flags_obj.flags_by_module_dict()
+  matches = (name for name in sorted(by_module) if flag in by_module[name])
+  return next(matches, None)
+
+
+def get_num_train_iterations(flags_obj):
+  """Returns a `(steps_per_epoch, train_epochs)` tuple for training.
+
+  Steps per epoch come from the training set size and the batch size, rounded
+  down or up according to `drop_train_remainder`. When at most one epoch is
+  requested and `train_steps` is set, the steps are capped at `train_steps`
+  and one epoch is returned. Otherwise the epochs after `eval_offset_epochs`
+  are rounded up to a multiple of `epochs_between_evals`.
+  """
+  num_train_images = imagenet_preprocessing.NUM_IMAGES['train']
+  batch_size = flags_obj.batch_size
+  if flags_obj.drop_train_remainder:
+    steps_per_epoch = num_train_images // batch_size
+  else:
+    steps_per_epoch = math.ceil(1.0 * num_train_images / batch_size)
+
+  train_epochs = flags_obj.train_epochs
+  # If multiple epochs, ignore the train_steps flag.
+  if train_epochs <= 1 and flags_obj.train_steps:
+    return min(flags_obj.train_steps, steps_per_epoch), 1
+
+  offset = flags_obj.eval_offset_epochs
+  interval = flags_obj.epochs_between_evals
+  num_intervals = math.ceil((train_epochs - offset) / interval)
+  return steps_per_epoch, offset + num_intervals * interval
+
+
+def get_num_eval_steps(flags_obj):
+  """Returns the number of batches needed to cover the validation set.
+
+  A final partial batch is dropped when `drop_eval_remainder` is set and
+  counted otherwise.
+  """
+  num_eval_images = imagenet_preprocessing.NUM_IMAGES['validation']
+  batch_size = flags_obj.batch_size
+  if flags_obj.drop_eval_remainder:
+    return num_eval_images // batch_size
+  return math.ceil(1.0 * num_eval_images / batch_size)
+
--- a/eval_accu.sh
+++ b/eval_accu.sh
+# Print the eval_accuracy values found in a training log, collapsing
+# consecutive duplicates.
+# Usage: eval_accu.sh LOGFILE   (reads stdin when LOGFILE is omitted)
+# The path is quoted so that file names containing spaces work.
+grep eval_accuracy ${1:+"$1"} | awk -F eval_accuracy '{print $2}' | awk -F value '{print $2}' | awk '{print $2}' | uniq
+
--- a/imagenet_preprocessing.py
+++ b/imagenet_preprocessing.py
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Provides utilities to preprocess images.
+
+Training images are sampled using the provided bounding boxes, and subsequently
+cropped to the sampled bounding box. Images are additionally flipped randomly,
+then resized to the target output size (without aspect-ratio preservation).
+
+Images used during evaluation are resized (with aspect-ratio preservation) and
+centrally cropped.
+
+All images undergo mean color subtraction.
+
+Note that these steps are colloquially referred to as "ResNet preprocessing,"
+and they differ from "VGG preprocessing," which does not use bounding boxes
+and instead does an aspect-preserving resize followed by random crop during
+training. (These both differ from "Inception preprocessing," which introduces
+color distortion steps.)
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+from absl import flags
+from absl import logging
+import tensorflow as tf
+
+DEFAULT_IMAGE_SIZE = 224
+NUM_CHANNELS = 3
+
+# Number of examples in each split.
+NUM_IMAGES = {
+    'train': 1281167,
+    'validation': 50000,
+}
+
+# Number of training shard files enumerated by get_filenames().
+_NUM_TRAIN_FILES = 1024
+# Presumably the default shuffle buffer size passed by callers -- it is not
+# referenced in the visible part of this module; TODO confirm.
+_SHUFFLE_BUFFER = 10000
+
+# Per-channel means, in R, G, B order.
+_R_MEAN = 123.68
+_G_MEAN = 116.78
+_B_MEAN = 103.94
+CHANNEL_MEANS = [_R_MEAN, _G_MEAN, _B_MEAN]
+
+# The lower bound for the smallest side of the image for aspect-preserving
+# resizing. For example, if an image is 500 x 1000, it will be resized to
+# _RESIZE_MIN x (_RESIZE_MIN * 2).
+_RESIZE_MIN = 256
+
+FLAGS = flags.FLAGS
+
+
+def process_record_dataset(dataset,
+                           is_training,
+                           batch_size,
+                           shuffle_buffer,
+                           dtype=tf.float32,
+                           datasets_num_private_threads=None,
+                           drop_remainder=False,
+                           tf_data_experimental_slack=False,
+                           prefetch_batchs=tf.data.experimental.AUTOTUNE):
+  """Given a Dataset with raw records, return a batched, prefetched Dataset.
+
+  The pipeline is: optional private threadpool, shuffle and repeat (training
+  only), parallel map with the parse/preprocess function, batch, prefetch,
+  then the `experimental_slack` option.
+
+  Args:
+    dataset: A Dataset representing raw records
+    is_training: A boolean denoting whether the input is for training.
+    batch_size: The number of samples per batch.
+    shuffle_buffer: The buffer size to use when shuffling records. A larger
+      value results in better randomness, but smaller values reduce startup
+      time and use less memory.
+    dtype: Data type to use for images/features.
+    datasets_num_private_threads: Number of threads for a private
+      threadpool created for all datasets computation.
+    drop_remainder: A boolean indicates whether to drop the remainder of the
+      batches. If True, the batch dimension will be static.
+    tf_data_experimental_slack: Whether to enable tf.data's
+      `experimental_slack` option.
+    prefetch_batchs: The number of batchs to prefetch.
+
+  Returns:
+    Dataset of (image, label) pairs ready for iteration.
+  """
+  # Defines a specific size thread pool for tf.data operations.
+  if datasets_num_private_threads:
+    options = tf.data.Options()
+    options.experimental_threading.private_threadpool_size = (
+        datasets_num_private_threads)
+    dataset = dataset.with_options(options)
+    logging.info(
+        'datasets_num_private_threads: %s', datasets_num_private_threads)
+
+  if is_training:
+    # Shuffles records before repeating to respect epoch boundaries.
+    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
+    # Repeats the dataset indefinitely (no count is passed).
+    dataset = dataset.repeat()
+
+  # One-hot labels are only requested when label smoothing is enabled.
+  one_hot = False
+  num_classes = FLAGS.num_classes
+  if FLAGS.label_smoothing and FLAGS.label_smoothing > 0:
+    one_hot = True
+
+  logging.info('Num classes: %d', num_classes)
+  logging.info('One hot: %s', one_hot)
+  # NOTE(review): `cache_decoded_image` is not among the flags defined in the
+  # visible part of this change, and both record functions are defined outside
+  # the visible part of this module -- confirm they exist.
+  if is_training and FLAGS.cache_decoded_image:
+    parse_record_fn = preprocess_parsed_example
+  else:
+    parse_record_fn = parse_and_preprocess_record
+
+  map_fn = functools.partial(
+      parse_record_fn,
+      is_training=is_training,
+      dtype=dtype,
+      num_classes=num_classes,
+      one_hot=one_hot)
+
+  # Parses the raw records into images and labels.
+  dataset = dataset.map(
+      map_fn,
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+  dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
+
+  # Operations between the final prefetch and the get_next call to the iterator
+  # will happen synchronously during run time. We prefetch here again to
+  # background all of the above processing work and keep it out of the
+  # critical training path. Setting buffer_size to tf.data.experimental.AUTOTUNE
+  # allows DistributionStrategies to adjust how many batches to fetch based
+  # on how many devices are present.
+  dataset = dataset.prefetch(buffer_size=prefetch_batchs)
+
+  options = tf.data.Options()
+  options.experimental_slack = tf_data_experimental_slack
+  dataset = dataset.with_options(options)
+
+  return dataset
+
+
+def get_filenames(is_training, data_dir):
+  """Builds the list of record file paths for the requested split.
+
+  Args:
+    is_training: If True, list the files under `<data_dir>/train`; otherwise
+      list the 128 files under `<data_dir>/val`.
+    data_dir: Directory that contains the `train` and `val` sub-directories.
+
+  Returns:
+    A list of file paths. The training list has `_NUM_TRAIN_FILES` entries
+    named with a fixed `-of-01024` suffix.
+  """
+  if is_training:
+    split, template, num_shards = (
+        'train', 'train-%05d-of-01024', _NUM_TRAIN_FILES)
+  else:
+    split, template, num_shards = 'val', 'val-%05d-of-00128', 128
+  return [os.path.join(data_dir, split, template % shard)
+          for shard in range(num_shards)]
+
+
+def parse_example_proto(example_serialized):
+  """Extracts the encoded image, label and bounding boxes from an Example.
+
+  The following features are read from the serialized Example proto:
+
+    image/encoded: string, defaults to '' when absent.
+    image/class/label: int64, defaults to -1 when absent.
+    image/class/text: string, defaults to '' when absent (parsed, not
+      returned).
+    image/object/bbox/xmin, ymin, xmax, ymax: variable-length float32 lists,
+      one entry per bounding box.
+
+  Args:
+    example_serialized: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+
+  Returns:
+    image_buffer: Tensor tf.string with the value of `image/encoded`.
+    label: Tensor tf.int32 containing the label.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      with the coordinates ordered as [ymin, xmin, ymax, xmax].
+  """
+  xmin_key = 'image/object/bbox/xmin'
+  ymin_key = 'image/object/bbox/ymin'
+  xmax_key = 'image/object/bbox/xmax'
+  ymax_key = 'image/object/bbox/ymax'
+  bbox_keys = (xmin_key, ymin_key, xmax_key, ymax_key)
+
+  # Dense (fixed-length) features first, then the sparse bounding-box ones.
+  feature_map = {
+      'image/encoded': tf.io.FixedLenFeature(
+          [], dtype=tf.string, default_value=''),
+      'image/class/label': tf.io.FixedLenFeature(
+          [], dtype=tf.int64, default_value=-1),
+      'image/class/text': tf.io.FixedLenFeature(
+          [], dtype=tf.string, default_value=''),
+  }
+  sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32)
+  feature_map.update(dict.fromkeys(bbox_keys, sparse_float32))
+
+  features = tf.io.parse_single_example(
+      serialized=example_serialized, features=feature_map)
+  label = tf.cast(features['image/class/label'], dtype=tf.int32)
+
+  # One [1, num_boxes] row per coordinate.
+  rows = {key: tf.expand_dims(features[key].values, 0) for key in bbox_keys}
+
+  # Stack the rows in (y, x) order: the result is [4, num_boxes].
+  bbox = tf.concat(
+      [rows[ymin_key], rows[xmin_key], rows[ymax_key], rows[xmax_key]], 0)
+
+  # Add a leading axis and swap the last two so the shape becomes
+  # [1, num_boxes, coords].
+  bbox = tf.transpose(a=tf.expand_dims(bbox, 0), perm=[0, 2, 1])
+
+  return features['image/encoded'], label, bbox
+
+
+def parse_example_proto_and_decode(example_serialized):
+  """Parses an Example and JPEG-decodes its image so it can be cached.
+
+  Args:
+    example_serialized: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+
+  Returns:
+    A (decoded_image, label, bbox) tuple; `label` and `bbox` are passed
+    through unchanged from `parse_example_proto`.
+  """
+  encoded, label, bbox = parse_example_proto(example_serialized)
+  scalar_jpeg = tf.reshape(encoded, shape=[])
+  decoded_image = tf.io.decode_jpeg(scalar_jpeg, 3)
+  return decoded_image, label, bbox
+
+
+def preprocess_parsed_example(
+    image_buffer, label, bbox, is_training, dtype, num_classes, one_hot=False):
+  """Preprocesses an already parsed example into an (image, label) pair.
+
+  Args:
+    image_buffer: Image tensor handed to `preprocess_image`.
+    label: Label tensor; it is reshaped to [1], cast to int32 and shifted
+      down by one.
+    bbox: Bounding boxes handed to `preprocess_image`.
+    is_training: A boolean denoting whether the input is for training.
+    dtype: Data type the preprocessed image is cast to.
+    num_classes: Depth of the one-hot label encoding.
+    one_hot: Whether to return a one-hot label of shape [num_classes]
+      instead of a float32 label of shape [1].
+
+  Returns:
+    Tuple of the preprocessed image and the label.
+  """
+  image = tf.cast(
+      preprocess_image(
+          image_buffer=image_buffer,
+          bbox=bbox,
+          output_height=DEFAULT_IMAGE_SIZE,
+          output_width=DEFAULT_IMAGE_SIZE,
+          num_channels=NUM_CHANNELS,
+          is_training=is_training),
+      dtype)
+
+  # Shift the labels down by one so that they start at zero.
+  label = tf.cast(tf.reshape(label, shape=[1]), tf.int32) - 1
+
+  if not one_hot:
+    # Sparse labels are handed to the Keras model as float32.
+    return image, tf.cast(label, tf.float32)
+
+  label = tf.reshape(tf.one_hot(label, num_classes), [num_classes])
+  return image, label
+
+
+def parse_and_preprocess_record(
+    raw_record, is_training, dtype, num_classes, one_hot=False):
+  """Parses a serialized Example and preprocesses it in one step.
+
+  The record is first split into image, label and bounding boxes by
+  `parse_example_proto`; the result is then passed to
+  `preprocess_parsed_example`.
+
+  Args:
+    raw_record: scalar Tensor tf.string containing a serialized
+      Example protocol buffer.
+    is_training: A boolean denoting whether the input is for training.
+    dtype: data type to use for images/features.
+    num_classes: Number of classes for one hot encoding.
+    one_hot: Whether to use one_hot encoding on label.
+
+  Returns:
+    Tuple of the processed image tensor and the label tensor; the label is
+    one-hot encoded only when `one_hot` is True.
+  """
+  parsed = parse_example_proto(raw_record)
+  # `parsed` is (image_buffer, label, bbox), matching the leading positional
+  # parameters of preprocess_parsed_example.
+  return preprocess_parsed_example(
+      *parsed,
+      is_training=is_training,
+      dtype=dtype,
+      num_classes=num_classes,
+      one_hot=one_hot)
+
+
+def input_fn(is_training,
+             data_dir,
+             batch_size,
+             dtype=tf.float32,
+             datasets_num_private_threads=None,
+             input_context=None,
+             drop_remainder=False,
+             tf_data_experimental_slack=False,
+             dataset_cache=False,
+             filenames=None,
+             prefetch_batchs=tf.data.experimental.AUTOTUNE):
+  """Builds the train or eval input dataset from TFRecord files.
+
+  Args:
+    is_training: A boolean denoting whether the input is for training.
+    data_dir: The directory containing the input data. Only used when
+      `filenames` is None.
+    batch_size: The number of samples per batch.
+    dtype: Data type to use for images/features.
+    datasets_num_private_threads: Number of private threads for tf.data.
+    input_context: A `tf.distribute.InputContext` object passed in by
+      `tf.distribute.Strategy`. When given, the file list is sharded across
+      input pipelines.
+    drop_remainder: Whether to drop the final, smaller batch. If True, the
+      batch dimension will be static.
+    tf_data_experimental_slack: Whether to enable tf.data's
+      `experimental_slack` option.
+    dataset_cache: Whether to cache the dataset on workers. Typically used
+      to improve performance when the data is in remote storage and fits
+      into worker memory.
+    filenames: Optional list of TFRecord file names; defaults to the result
+      of `get_filenames(is_training, data_dir)`.
+    prefetch_batchs: The number of batches to prefetch.
+
+  Returns:
+    A dataset that can be used for iteration.
+  """
+  record_files = (
+      get_filenames(is_training, data_dir) if filenames is None else filenames)
+  dataset = tf.data.Dataset.from_tensor_slices(record_files)
+
+  if input_context:
+    num_pipelines = input_context.num_input_pipelines
+    pipeline_id = input_context.input_pipeline_id
+    logging.info(
+        'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
+        pipeline_id, num_pipelines)
+    dataset = dataset.shard(num_pipelines, pipeline_id)
+
+  if is_training:
+    # Randomize the order in which the files are visited.
+    dataset = dataset.shuffle(buffer_size=_NUM_TRAIN_FILES)
+
+  # Expand file names into individual records. cycle_length=10 lets up to 10
+  # files be read and deserialized in parallel; machines with many CPU cores
+  # may benefit from a larger value.
+  dataset = dataset.interleave(
+      tf.data.TFRecordDataset,
+      cycle_length=10,
+      num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+  if is_training and FLAGS.cache_decoded_image:
+    # Decode before the optional cache below so decoded images are what
+    # gets cached.
+    dataset = dataset.map(
+        parse_example_proto_and_decode,
+        num_parallel_calls=tf.data.experimental.AUTOTUNE)
+
+  if dataset_cache:
+    # Helps training / eval when the data lives in remote storage and fits
+    # into worker memory.
+    dataset = dataset.cache()
+
+  return process_record_dataset(
+      dataset=dataset,
+      is_training=is_training,
+      batch_size=batch_size,
+      shuffle_buffer=_SHUFFLE_BUFFER,
+      dtype=dtype,
+      datasets_num_private_threads=datasets_num_private_threads,
+      drop_remainder=drop_remainder,
+      tf_data_experimental_slack=tf_data_experimental_slack,
+      prefetch_batchs=prefetch_batchs,
+  )
+
+
+def _decode_crop_and_flip(image_buffer, bbox, num_channels):
+  """Takes a random crop of the image and randomly flips it left-right.
+
+  A string input is treated as an encoded JPEG and goes through the fused
+  decode_and_crop op, which performs better than decoding and cropping as two
+  separate ops. Any other dtype is treated as an already decoded image and is
+  cropped directly.
+
+  Args:
+    image_buffer: Either a scalar string Tensor with the raw JPEG image
+      buffer, or an already decoded image Tensor.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax].
+    num_channels: Integer depth of the image buffer for decoding. Only used
+      when `image_buffer` is an encoded JPEG.
+
+  Returns:
+    3-D tensor with cropped image.
+
+  """
+  is_decoded = image_buffer.dtype != tf.string
+  if is_decoded:
+    image_shape = tf.shape(image_buffer)
+  else:
+    image_shape = tf.image.extract_jpeg_shape(image_buffer)
+
+  # Many image datasets ship a human-annotated bounding box around the object
+  # of interest. Sample a randomly distorted version of that box which stays
+  # within the allowed aspect ratio, area and overlap ranges. When no box is
+  # supplied the whole image is used as the bounding box.
+  crop_begin, crop_size, _ = tf.image.sample_distorted_bounding_box(
+      image_shape,
+      bounding_boxes=bbox,
+      min_object_covered=0.1,
+      aspect_ratio_range=[0.75, 1.33],
+      area_range=[0.05, 1.0],
+      max_attempts=100,
+      use_image_if_no_bounding_boxes=True)
+
+  # Repackage the sampled box in the layout expected by the crop ops.
+  offset_y, offset_x, _ = tf.unstack(crop_begin)
+  target_height, target_width, _ = tf.unstack(crop_size)
+  crop_window = tf.stack([offset_y, offset_x, target_height, target_width])
+
+  if not is_decoded:
+    # Fused decode and crop, which is faster than doing both sequentially.
+    cropped = tf.image.decode_and_crop_jpeg(
+        image_buffer, crop_window, channels=num_channels)
+  else:
+    cropped = tf.image.crop_to_bounding_box(
+        image_buffer,
+        offset_height=offset_y,
+        offset_width=offset_x,
+        target_height=target_height,
+        target_width=target_width)
+
+  # A random horizontal flip adds a little more distortion.
+  return tf.image.random_flip_left_right(cropped)
+
+
+def _central_crop(image, crop_height, crop_width):
+  """Cuts a `crop_height` x `crop_width` window out of the image center.
+
+  Args:
+    image: a 3-D image tensor
+    crop_height: the height of the image following the crop.
+    crop_width: the width of the image following the crop.
+
+  Returns:
+    3-D tensor with cropped image.
+  """
+  image_shape = tf.shape(input=image)
+
+  # Split the surplus rows/columns evenly; an odd surplus leaves the extra
+  # one on the bottom/right.
+  crop_top = (image_shape[0] - crop_height) // 2
+  crop_left = (image_shape[1] - crop_width) // 2
+
+  begin = [crop_top, crop_left, 0]
+  size = [crop_height, crop_width, -1]  # -1 keeps every channel.
+  return tf.slice(image, begin, size)
+
+
+def _mean_image_subtraction(image, means, num_channels):
+  """Centers an image by subtracting a per-channel mean.
+
+  For example:
+    means = [123.68, 116.779, 103.939]
+    image = _mean_image_subtraction(image, means, 3)
+
+  Note that the rank of `image` must be known.
+
+  Args:
+    image: a tensor of size [height, width, C].
+    means: a C-vector of values to subtract from each channel.
+    num_channels: number of color channels; must equal `len(means)`.
+
+  Returns:
+    the centered image.
+
+  Raises:
+    ValueError: If the static rank of `image` is not three (including when it
+      is unknown), or if `len(means)` differs from `num_channels`.
+  """
+  rank = image.get_shape().ndims
+  if rank != 3:
+    raise ValueError('Input must be of size [height, width, C>0]')
+
+  if len(means) != num_channels:
+    raise ValueError('len(means) must match the number of channels')
+
+  # Expand the 1-D means to the full image shape.
+  # Note(b/130245863): an explicit `broadcast_to` is used instead of simply
+  # expanding dimensions for better performance.
+  image_shape = tf.shape(image)
+  broadcast_means = tf.broadcast_to(means, image_shape)
+
+  return image - broadcast_means
+
+
+def _smallest_size_at_least(height, width, resize_min):
+  """Computes a new shape whose smaller side equals `resize_min`.
+
+  Both sides are scaled by the same ratio, so the original aspect ratio is
+  preserved up to the truncation of the final cast to int32.
+
+  Args:
+    height: an int32 scalar tensor indicating the current height.
+    width: an int32 scalar tensor indicating the current width.
+    resize_min: A python integer or scalar `Tensor` indicating the size of
+      the smallest side after resize.
+
+  Returns:
+    new_height: an int32 scalar tensor indicating the new height.
+    new_width: an int32 scalar tensor indicating the new width.
+  """
+  # Do the arithmetic in float32.
+  target_side = tf.cast(resize_min, tf.float32)
+  height_f = tf.cast(height, tf.float32)
+  width_f = tf.cast(width, tf.float32)
+
+  scale_ratio = target_side / tf.minimum(height_f, width_f)
+
+  # Heights and widths go back to int32 so that TF ops will accept them.
+  return (tf.cast(height_f * scale_ratio, tf.int32),
+          tf.cast(width_f * scale_ratio, tf.int32))
+
+
+def _aspect_preserving_resize(image, resize_min):
+  """Resizes an image so its smaller side is `resize_min`, keeping the ratio.
+
+  Args:
+    image: A 3-D image `Tensor`.
+    resize_min: A python integer or scalar `Tensor` indicating the size of
+      the smallest side after resize.
+
+  Returns:
+    resized_image: A 3-D tensor containing the resized image.
+  """
+  dims = tf.shape(input=image)
+  current_height, current_width = dims[0], dims[1]
+  target_height, target_width = _smallest_size_at_least(
+      current_height, current_width, resize_min)
+  return _resize_image(image, target_height, target_width)
+
+
+def _resize_image(image, height, width):
+  """Resizes an image with bilinear interpolation.
+
+  Thin wrapper around `tf.compat.v1.image.resize` so that every caller uses
+  the same `ResizeMethod` and `align_corners` setting.
+
+  Args:
+    image: A 3-D image `Tensor`.
+    height: The target height for the resized image.
+    width: The target width for the resized image.
+
+  Returns:
+    resized_image: A 3-D tensor containing the resized image. The first two
+      dimensions have the shape [height, width].
+  """
+  target_size = [height, width]
+  return tf.compat.v1.image.resize(
+      image,
+      target_size,
+      method=tf.image.ResizeMethod.BILINEAR,
+      align_corners=False)
+
+
+def preprocess_image(image_buffer, bbox, output_height, output_width,
+                     num_channels, is_training=False):
+  """Produces a fixed-size, mean-centered image for training or eval.
+
+  Training images get a random crop and flip followed by a resize to the
+  output size. Eval images are decoded if necessary, resized so the smaller
+  side is `_RESIZE_MIN`, and center-cropped to the output size. In both cases
+  `CHANNEL_MEANS` is subtracted at the end.
+
+  Args:
+    image_buffer: Either a scalar string Tensor with the raw JPEG image
+      buffer, or an already decoded image Tensor.
+    bbox: 3-D float Tensor of bounding boxes arranged [1, num_boxes, coords]
+      where each coordinate is [0, 1) and the coordinates are arranged as
+      [ymin, xmin, ymax, xmax]. Only used for training.
+    output_height: The height of the image after preprocessing.
+    output_width: The width of the image after preprocessing.
+    num_channels: Integer depth of the image buffer for decoding.
+    is_training: `True` if we're preprocessing the image for training and
+      `False` otherwise.
+
+  Returns:
+    A preprocessed image.
+  """
+  if is_training:
+    # Training randomizes some of the distortions.
+    distorted = _decode_crop_and_flip(image_buffer, bbox, num_channels)
+    image = _resize_image(distorted, output_height, output_width)
+  else:
+    # Validation: decode (when still encoded), resize, then crop the middle.
+    needs_decode = image_buffer.dtype == tf.string
+    image = (tf.image.decode_jpeg(image_buffer, channels=num_channels)
+             if needs_decode else image_buffer)
+    resized = _aspect_preserving_resize(image, _RESIZE_MIN)
+    image = _central_crop(resized, output_height, output_width)
+
+  # Pin the static shape of the result.
+  image.set_shape([output_height, output_width, num_channels])
+
+  return _mean_image_subtraction(image, CHANNEL_MEANS, num_channels)
--- a/lars_optimizer.py
+++ b/lars_optimizer.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Layer-wise Adaptive Rate Scaling optimizer for large-batch training."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+# from tf2_common.training import optimizer_v2modified
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend_config
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+from tensorflow.python.ops import state_ops
+
+
+# class LARSOptimizer(optimizer_v2modified.OptimizerV2Modified):
+#class LARSOptimizer(optimizer_v2.OptimizerV2):
+class LARSOptimizer(tf.keras.optimizers.Optimizer):
+  """Layer-wise Adaptive Rate Scaling for large batch training.
+
+  Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
+  I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)
+
+  Implements the LARS learning rate scheme presented in the paper above. This
+  optimizer is useful when scaling the batch size to up to 32K without
+  significant performance degradation. It is recommended to use the optimizer
+  in conjunction with:
+      - Gradual learning rate warm-up
+      - Linear learning rate scaling
+      - Poly rule learning rate decay
+
+  Note, LARS scaling is currently only enabled for dense tensors. Sparse tensors
+  use the default momentum optimizer.
+  """
+
+  def __init__(
+      self,
+      learning_rate,
+      momentum=0.9,
+      weight_decay=0.0001,
+      # The LARS coefficient is a hyperparameter
+      eeta=0.001,
+      epsilon=0.0,
+      name="LARSOptimizer",
+      # Enable skipping variables from LARS scaling.
+      # TODO(sameerkm): Enable a direct mechanism to pass a
+      # subset of variables to the optimizer.
+      skip_list=None,
+      use_nesterov=False,
+      **kwargs):
+    """Construct a new LARS Optimizer.
+
+    Args:
+      learning_rate: A `Tensor`, floating point value, or a schedule that is a
+        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
+        that takes no arguments and returns the actual value to use. The
+        learning rate.
+      momentum: A floating point value. Momentum hyperparameter.
+      weight_decay: A floating point value. Weight decay hyperparameter.
+      eeta: LARS coefficient as used in the paper. Default set to LARS
+        coefficient from the paper. (eeta / weight_decay) determines the highest
+        scaling factor in LARS.
+      epsilon: Optional epsilon parameter to be set in models that have very
+        small gradients. A falsy value (including the default 0.0) is replaced
+        by the Keras backend epsilon.
+      name: Optional name prefix for variables and ops created by LARSOptimizer.
+      skip_list: List of strings to enable skipping variables from LARS scaling.
+        If any of the strings in skip_list is a substring of var.name, variable
+        'var' is skipped from LARS scaling. For a typical classification model
+        with batch normalization, the skip_list is ['batch_normalization',
+        'bias']
+      use_nesterov: when set to True, nesterov momentum will be enabled
+      **kwargs: keyword arguments forwarded to the base optimizer.
+
+    Raises:
+      ValueError: If `momentum` or `weight_decay` is negative.
+    """
+    if momentum < 0.0:
+      raise ValueError("momentum should be positive: %s" % momentum)
+    if weight_decay < 0.0:
+      raise ValueError("weight_decay should be positive: %s" % weight_decay)
+    super(LARSOptimizer, self).__init__(name=name, **kwargs)
+
+    self._set_hyper("learning_rate", learning_rate)
+
+    # When directly using class members, instead of
+    # _set_hyper and _get_hyper (such as learning_rate above),
+    # the values are fixed after __init__(), and not being
+    # updated during the training process.
+    # This provides better performance but less flexibility.
+    self.momentum = momentum
+    self.weight_decay = weight_decay
+    self.eeta = eeta
+    self.epsilon = epsilon or backend_config.epsilon()
+    self._skip_list = skip_list
+    self.use_nesterov = use_nesterov
+
+  def _prepare_local(self, var_device, var_dtype, apply_state):
+    """Stores the current learning rate for one (device, dtype) pair.
+
+    Args:
+      var_device: Device key of the entry in `apply_state` to fill in.
+      var_dtype: Dtype key of the entry; the learning rate is cast to it.
+      apply_state: Dict keyed by (device, dtype); the matching entry is
+        updated in place with a "learning_rate" tensor.
+    """
+    lr_t = self._get_hyper("learning_rate", var_dtype)
+    if callable(lr_t):
+      # A learning-rate schedule has to be evaluated at the current step.
+      # Plain values (floats, tensors) are not callable and are used as they
+      # are instead of failing on the call.
+      local_step = math_ops.cast(self.iterations, var_dtype)
+      lr_t = math_ops.cast(lr_t(local_step), var_dtype)
+    learning_rate_t = array_ops.identity(lr_t)
+
+    apply_state[(var_device, var_dtype)].update(
+        dict(
+            learning_rate=learning_rate_t,
+            ))
+
+  def _create_slots(self, var_list):
+    """Creates one "momentum" slot per variable."""
+    for v in var_list:
+      self.add_slot(v, "momentum")
+
+  def compute_lr(self, grad, var, coefficients):
+    """Computes the per-variable learning rate and the adjusted gradient.
+
+    Variables whose name contains any entry of the skip list keep the plain
+    learning rate and their gradient untouched. For all other variables the
+    learning rate is multiplied by the LARS trust ratio and the weight decay
+    term is added to the gradient.
+
+    Args:
+      grad: Gradient tensor for `var`.
+      var: The variable being updated.
+      coefficients: Dict holding the "learning_rate" tensor.
+
+    Returns:
+      A (scaled_lr, grad) tuple.
+    """
+    scaled_lr = coefficients["learning_rate"]
+    if self._skip_list is None or not any(v in var.name
+                                          for v in self._skip_list):
+      w_norm = linalg_ops.norm(var, ord=2)
+      g_norm = linalg_ops.norm(grad, ord=2)
+      # Fall back to a ratio of 1.0 whenever either norm is zero.
+      trust_ratio = array_ops.where(
+          math_ops.greater(w_norm, 0),
+          array_ops.where(
+              math_ops.greater(g_norm, 0),
+              (self.eeta * w_norm /
+               (g_norm + self.weight_decay * w_norm + self.epsilon)), 1.0), 1.0)
+
+      scaled_lr = coefficients["learning_rate"] * trust_ratio
+      # Add the weight regularization gradient
+      grad = grad + self.weight_decay * var
+    return scaled_lr, grad
+
+  def _apply_dense(self, grad, var, apply_state=None):
+    """Applies a dense LARS momentum update through the momentum kernel."""
+    var_device, var_dtype = var.device, var.dtype.base_dtype
+    coefficients = ((apply_state or {}).get((var_device, var_dtype))
+                    or self._fallback_apply_state(var_device, var_dtype))
+
+    scaled_lr, grad = self.compute_lr(grad, var, coefficients)
+    mom = self.get_slot(var, "momentum")
+    # The learning rate is folded into the gradient, so the kernel itself is
+    # given a learning rate of 1.0.
+    return training_ops.apply_momentum(
+        var,
+        mom,
+        math_ops.cast(1.0, var.dtype.base_dtype),
+        grad * scaled_lr,
+        self.momentum,
+        use_locking=False,
+        use_nesterov=self.use_nesterov)
+
+  def _resource_apply_dense(self, grad, var, apply_state=None):
+    """Applies a dense LARS momentum update to a resource variable."""
+    var_device, var_dtype = var.device, var.dtype.base_dtype
+    coefficients = ((apply_state or {}).get((var_device, var_dtype))
+                    or self._fallback_apply_state(var_device, var_dtype))
+
+    scaled_lr, grad = self.compute_lr(grad, var, coefficients)
+    mom = self.get_slot(var, "momentum")
+
+    # The momentum update is written out with plain ops rather than a fused
+    # momentum kernel.
+    mom_t = mom * self.momentum - grad * scaled_lr
+    mom_t = state_ops.assign(mom, mom_t, use_locking=False)
+    if self.use_nesterov:
+      var_t = var + mom_t * self.momentum - grad * scaled_lr
+    else:
+      var_t = var + mom_t
+    return state_ops.assign(var, var_t, use_locking=False).op
+
+  # Fallback to momentum optimizer for sparse tensors
+  def _apply_sparse(self, grad, var, apply_state=None):
+    """Applies a plain (non-LARS) sparse momentum update."""
+    var_device, var_dtype = var.device, var.dtype.base_dtype
+    coefficients = ((apply_state or {}).get((var_device, var_dtype))
+                    or self._fallback_apply_state(var_device, var_dtype))
+
+    mom = self.get_slot(var, "momentum")
+    return training_ops.sparse_apply_momentum(
+        var,
+        mom,
+        coefficients["learning_rate"],
+        grad.values,
+        grad.indices,
+        self.momentum,
+        use_locking=False,
+        use_nesterov=self.use_nesterov)
+
+  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
+    """Applies a plain (non-LARS) sparse momentum update to a resource var."""
+    var_device, var_dtype = var.device, var.dtype.base_dtype
+    coefficients = ((apply_state or {}).get((var_device, var_dtype))
+                    or self._fallback_apply_state(var_device, var_dtype))
+
+    mom = self.get_slot(var, "momentum")
+    return training_ops.resource_sparse_apply_keras_momentum(
+        var.handle,
+        mom.handle,
+        coefficients["learning_rate"],
+        grad,
+        indices,
+        self.momentum,
+        use_locking=False,
+        use_nesterov=self.use_nesterov)
+
+  def get_config(self):
+    """Returns the optimizer configuration as a dict of constructor args."""
+    config = super(LARSOptimizer, self).get_config()
+    config.update({
+        "learning_rate": self._serialize_hyperparameter("learning_rate"),
+        "momentum": self.momentum,
+        "weight_decay": self.weight_decay,
+        "eeta": self.eeta,
+        "epsilon": self.epsilon,
+        # Without this entry an optimizer rebuilt from its config would apply
+        # LARS scaling to variables that were meant to be skipped.
+        "skip_list": self._skip_list,
+        "use_nesterov": self.use_nesterov,
+    })
+    return config
--- a/lars_util.py
+++ b/lars_util.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Enable Layer-wise Adaptive Rate Scaling optimizer in ResNet."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+import tensorflow as tf
+
+from tf2_common.utils.mlp_log import mlp_log
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+
+# absl flag values; the learning-rate schedule below reads optimizer,
+# lars_epsilon, weight_decay and momentum from it for logging.
+FLAGS = flags.FLAGS
+
+
+def define_lars_flags():
+  """Registers the float flags used to configure LARS training.
+
+  Defines `end_learning_rate` and `warmup_epochs` (both defaulting to None),
+  `lars_epsilon` (default 0.0) and `momentum` (default 0.9).
+  """
+  flags.DEFINE_float(
+      'end_learning_rate',
+      default=None,
+      help='Polynomial decay end learning rate.')
+
+  flags.DEFINE_float(
+      'lars_epsilon',
+      default=0.0,
+      help='Override autoselected LARS epsilon.')
+
+  flags.DEFINE_float(
+      'warmup_epochs',
+      default=None,
+      help='Override autoselected polynomial decay warmup epochs.')
+
+  flags.DEFINE_float(
+      'momentum',
+      default=0.9,
+      help='Momentum parameter used in the MomentumOptimizer.')
+
+
+class PolynomialDecayWithWarmup(
+    tf.keras.optimizers.schedules.LearningRateSchedule):
+  """A LearningRateSchedule that uses a polynomial decay with warmup."""
+
+  def __init__(
+      self,
+      batch_size,
+      steps_per_epoch,
+      train_steps,
+      initial_learning_rate=None,
+      end_learning_rate=None,
+      warmup_epochs=None,
+      compute_lr_on_cpu=False,
+      name=None):
+    """Applies a polynomial decay to the learning rate with warmup."""
+    super(PolynomialDecayWithWarmup, self).__init__()
+
+    self.batch_size = batch_size
+    self.steps_per_epoch = steps_per_epoch
+    self.train_steps = train_steps
+    self.name = name
+    self.learning_rate_ops_cache = {}
+    self.compute_lr_on_cpu = compute_lr_on_cpu
+
+    if batch_size < 16384:
+      self.initial_learning_rate = 10.0
+      warmup_epochs_ = 5
+    elif batch_size < 32768:
+      self.initial_learning_rate = 25.0
+      warmup_epochs_ = 5
+    else:
+      self.initial_learning_rate = 31.2
+      warmup_epochs_ = 25
+
+    # Override default poly learning rate and warmup epochs
+    if initial_learning_rate:
+      self.initial_learning_rate = initial_learning_rate
+
+    if end_learning_rate:
+      self.end_learning_rate = end_learning_rate
+    else:
+      self.end_learning_rate = 0.0001
+
+    if warmup_epochs is not None:
+      warmup_epochs_ = warmup_epochs
+    self.warmup_epochs = warmup_epochs_
+
+    opt_name = FLAGS.optimizer.lower()
+    mlp_log.mlperf_print('opt_name', opt_name)
+    if opt_name == 'lars':
+      mlp_log.mlperf_print('{}_epsilon'.format(opt_name), FLAGS.lars_epsilon)
+    mlp_log.mlperf_print('{}_opt_weight_decay'.format(opt_name),
+                         FLAGS.weight_decay)
+    mlp_log.mlperf_print('{}_opt_base_learning_rate'.format(opt_name),
+                         self.initial_learning_rate)
+    mlp_log.mlperf_print('{}_opt_learning_rate_warmup_epochs'.format(opt_name),
+                         warmup_epochs_)
+    mlp_log.mlperf_print('{}_opt_end_learning_rate'.format(opt_name),
+                         self.end_learning_rate)
+    warmup_steps = warmup_epochs_ * steps_per_epoch
+    self.warmup_steps = tf.cast(warmup_steps, tf.float32)
+    self.decay_steps = train_steps - warmup_steps + 1
+    mlp_log.mlperf_print('{}_opt_learning_rate_decay_steps'.format(opt_name),
+                         int(self.decay_steps))
+    mlp_log.mlperf_print(
+        '{}_opt_learning_rate_decay_poly_power'.format(opt_name), 2.0)
+    mlp_log.mlperf_print('{}_opt_momentum'.format(opt_name), FLAGS.momentum)
+
+    self.poly_rate_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
+        initial_learning_rate=self.initial_learning_rate,
+        decay_steps=self.decay_steps,
+        end_learning_rate=self.end_learning_rate,
+        power=2.0)
+
+  def __call__(self, step):
+    if tf.executing_eagerly():
+      return self._get_learning_rate(step)
+
+    # In an eager function or graph, the current implementation of optimizer
+    # repeatedly call and thus create ops for the learning rate schedule. To
+    # avoid this, we cache the ops if not executing eagerly.
+    graph = tf.compat.v1.get_default_graph()
+    if graph not in self.learning_rate_ops_cache:
+      if self.compute_lr_on_cpu:
+        with tf.device('/device:CPU:0'):
+          self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
+      else:
+        self.learning_rate_ops_cache[graph] = self._get_learning_rate(step)
+    return self.learning_rate_ops_cache[graph]
+
+  def _get_learning_rate(self, step):
+    with ops.name_scope_v2(self.name or 'PolynomialDecayWithWarmup') as name:
+
+      initial_learning_rate = ops.convert_to_tensor_v2(
+          self.initial_learning_rate, name='initial_learning_rate')
+      warmup_steps = ops.convert_to_tensor_v2(
+          self.warmup_steps, name='warmup_steps')
+
+      warmup_rate = (
+          initial_learning_rate * step / warmup_steps)
+
+      poly_steps = math_ops.subtract(step, warmup_steps)
+      poly_rate = self.poly_rate_scheduler(poly_steps)
+
+      decay_rate = tf.where(step <= warmup_steps,
+                            warmup_rate, poly_rate, name=name)
+      return decay_rate
+
+  def get_config(self):
+    return {
+        'batch_size': self.batch_size,
+        'steps_per_epoch': self.steps_per_epoch,
+        'train_steps': self.train_steps,
+        'initial_learning_rate': self.initial_learning_rate,
+        'end_learning_rate': self.end_learning_rate,
+        'warmup_epochs': self.warmup_epochs,
+        'name': self.name,
+    }
--- a/reference.log
+++ b/reference.log
+nohup: ignoring input
+:::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.145380 140547769902912 mlp_log.py:80] :::MLL 1679230527.145 cache_clear: {"value": true, "metadata": {"lineno": 116, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.146378 140547769902912 mlp_log.py:80] :::MLL 1679230527.146 init_start: {"value": null, "metadata": {"lineno": 117, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.147078 140547769902912 mlp_log.py:80] :::MLL 1679230527.147 submission_benchmark: {"value": "resnet", "metadata": {"lineno": 118, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.147791 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_division: {"value": "closed", "metadata": {"lineno": 119, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.148500 140547769902912 mlp_log.py:80] :::MLL 1679230527.148 submission_org: {"value": "google", "metadata": {"lineno": 120, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.149215 140547769902912 mlp_log.py:80] :::MLL 1679230527.149 submission_platform: {"value": "gpu-v100-8", "metadata": {"lineno": 121, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.149919 140547769902912 mlp_log.py:80] :::MLL 1679230527.150 submission_status: {"value": "cloud", "metadata": {"lineno": 124, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:27.150071 140547769902912 common.py:617] Module ./resnet_ctl_imagenet_main.py:
+I0319 12:55:27.150561 140547769902912 common.py:620] 	 flags_obj.use_tf_function = True
+I0319 12:55:27.150646 140547769902912 common.py:620] 	 flags_obj.single_l2_loss_op = True
+I0319 12:55:27.150727 140547769902912 common.py:620] 	 flags_obj.cache_decoded_image = False
+I0319 12:55:27.150808 140547769902912 common.py:620] 	 flags_obj.enable_device_warmup = True
+I0319 12:55:27.150889 140547769902912 common.py:620] 	 flags_obj.device_warmup_steps = 1
+I0319 12:55:27.150968 140547769902912 common.py:620] 	 flags_obj.num_replicas = 32
+I0319 12:55:27.151046 140547769902912 common.py:617] Module absl.app:
+I0319 12:55:27.151130 140547769902912 common.py:620] 	 flags_obj.run_with_pdb = False
+I0319 12:55:27.151208 140547769902912 common.py:620] 	 flags_obj.pdb_post_mortem = False
+I0319 12:55:27.151290 140547769902912 common.py:620] 	 flags_obj.pdb = False
+I0319 12:55:27.151383 140547769902912 common.py:620] 	 flags_obj.run_with_profiling = False
+I0319 12:55:27.151461 140547769902912 common.py:620] 	 flags_obj.profile_file = None
+I0319 12:55:27.151540 140547769902912 common.py:620] 	 flags_obj.use_cprofile_for_profiling = True
+I0319 12:55:27.151618 140547769902912 common.py:620] 	 flags_obj.only_check_args = False
+I0319 12:55:27.151695 140547769902912 common.py:620] 	 flags_obj.help = False
+I0319 12:55:27.151774 140547769902912 common.py:620] 	 flags_obj.helpshort = False
+I0319 12:55:27.151850 140547769902912 common.py:620] 	 flags_obj.helpfull = False
+I0319 12:55:27.151929 140547769902912 common.py:620] 	 flags_obj.helpxml = False
+I0319 12:55:27.152006 140547769902912 common.py:617] Module absl.logging:
+I0319 12:55:27.152086 140547769902912 common.py:620] 	 flags_obj.logtostderr = False
+I0319 12:55:27.152163 140547769902912 common.py:620] 	 flags_obj.alsologtostderr = False
+I0319 12:55:27.152240 140547769902912 common.py:620] 	 flags_obj.log_dir = 
+I0319 12:55:27.152339 140547769902912 common.py:620] 	 flags_obj.verbosity = 0
+I0319 12:55:27.152423 140547769902912 common.py:620] 	 flags_obj.logger_levels = {}
+I0319 12:55:27.152507 140547769902912 common.py:620] 	 flags_obj.stderrthreshold = fatal
+I0319 12:55:27.152584 140547769902912 common.py:620] 	 flags_obj.showprefixforinfo = True
+I0319 12:55:27.152662 140547769902912 common.py:617] Module absl.testing.absltest:
+I0319 12:55:27.152743 140547769902912 common.py:620] 	 flags_obj.test_srcdir = 
+I0319 12:55:27.152820 140547769902912 common.py:620] 	 flags_obj.test_tmpdir = /tmp/absl_testing
+I0319 12:55:27.152901 140547769902912 common.py:620] 	 flags_obj.test_random_seed = 301
+I0319 12:55:27.152981 140547769902912 common.py:620] 	 flags_obj.test_randomize_ordering_seed = 1
+I0319 12:55:27.153058 140547769902912 common.py:620] 	 flags_obj.xml_output_file = 
+I0319 12:55:27.153135 140547769902912 common.py:617] Module common:
+I0319 12:55:27.153217 140547769902912 common.py:620] 	 flags_obj.enable_eager = True
+I0319 12:55:27.153294 140547769902912 common.py:620] 	 flags_obj.skip_eval = False
+I0319 12:55:27.153382 140547769902912 common.py:620] 	 flags_obj.set_learning_phase_to_train = True
+I0319 12:55:27.153460 140547769902912 common.py:620] 	 flags_obj.explicit_gpu_placement = False
+I0319 12:55:27.153537 140547769902912 common.py:620] 	 flags_obj.use_trivial_model = False
+I0319 12:55:27.153614 140547769902912 common.py:620] 	 flags_obj.report_accuracy_metrics = True
+I0319 12:55:27.153692 140547769902912 common.py:620] 	 flags_obj.lr_schedule = polynomial
+I0319 12:55:27.153769 140547769902912 common.py:620] 	 flags_obj.enable_tensorboard = False
+I0319 12:55:27.153845 140547769902912 common.py:620] 	 flags_obj.train_steps = None
+I0319 12:55:27.153923 140547769902912 common.py:620] 	 flags_obj.profile_steps = None
+I0319 12:55:27.154000 140547769902912 common.py:620] 	 flags_obj.batchnorm_spatial_persistent = True
+I0319 12:55:27.154076 140547769902912 common.py:620] 	 flags_obj.enable_get_next_as_optional = False
+I0319 12:55:27.154153 140547769902912 common.py:620] 	 flags_obj.enable_checkpoint_and_export = False
+I0319 12:55:27.154229 140547769902912 common.py:620] 	 flags_obj.tpu = 
+I0319 12:55:27.154305 140547769902912 common.py:620] 	 flags_obj.tpu_zone = 
+I0319 12:55:27.154394 140547769902912 common.py:620] 	 flags_obj.steps_per_loop = 514
+I0319 12:55:27.154473 140547769902912 common.py:620] 	 flags_obj.use_tf_while_loop = True
+I0319 12:55:27.154549 140547769902912 common.py:620] 	 flags_obj.use_tf_keras_layers = False
+I0319 12:55:27.154627 140547769902912 common.py:620] 	 flags_obj.base_learning_rate = 4.9
+I0319 12:55:27.154710 140547769902912 common.py:620] 	 flags_obj.optimizer = LARS
+I0319 12:55:27.154787 140547769902912 common.py:620] 	 flags_obj.drop_train_remainder = True
+I0319 12:55:27.154863 140547769902912 common.py:620] 	 flags_obj.drop_eval_remainder = False
+I0319 12:55:27.154940 140547769902912 common.py:620] 	 flags_obj.label_smoothing = 0.1
+I0319 12:55:27.155020 140547769902912 common.py:620] 	 flags_obj.num_classes = 1000
+I0319 12:55:27.155099 140547769902912 common.py:620] 	 flags_obj.eval_offset_epochs = 3
+I0319 12:55:27.155177 140547769902912 common.py:620] 	 flags_obj.target_accuracy = 0.759
+I0319 12:55:27.155256 140547769902912 common.py:617] Module lars_util:
+I0319 12:55:27.155346 140547769902912 common.py:620] 	 flags_obj.end_learning_rate = None
+I0319 12:55:27.155426 140547769902912 common.py:620] 	 flags_obj.lars_epsilon = 0.0
+I0319 12:55:27.155504 140547769902912 common.py:620] 	 flags_obj.warmup_epochs = 5.0
+I0319 12:55:27.155582 140547769902912 common.py:620] 	 flags_obj.momentum = 0.9
+I0319 12:55:27.155662 140547769902912 common.py:617] Module resnet_model:
+I0319 12:55:27.155743 140547769902912 common.py:620] 	 flags_obj.weight_decay = 0.0002
+I0319 12:55:27.155822 140547769902912 common.py:620] 	 flags_obj.num_accumulation_steps = 1
+I0319 12:55:27.155900 140547769902912 common.py:617] Module resnet_runnable:
+I0319 12:55:27.155981 140547769902912 common.py:620] 	 flags_obj.trace_warmup = False
+I0319 12:55:27.156070 140547769902912 common.py:617] Module tensorflow.python.ops.parallel_for.pfor:
+I0319 12:55:27.156152 140547769902912 common.py:620] 	 flags_obj.op_conversion_fallback_to_while_loop = True
+I0319 12:55:27.156228 140547769902912 common.py:617] Module tensorflow.python.tpu.client.client:
+I0319 12:55:27.156317 140547769902912 common.py:620] 	 flags_obj.runtime_oom_exit = True
+I0319 12:55:27.156397 140547769902912 common.py:620] 	 flags_obj.hbm_oom_exit = True
+I0319 12:55:27.156476 140547769902912 common.py:617] Module tf2_common.utils.flags._base:
+I0319 12:55:27.156557 140547769902912 common.py:620] 	 flags_obj.data_dir = /data/tf-imagenet/imagenet
+I0319 12:55:27.156634 140547769902912 common.py:620] 	 flags_obj.model_dir = /tmp
+I0319 12:55:27.156712 140547769902912 common.py:620] 	 flags_obj.clean = False
+I0319 12:55:27.156790 140547769902912 common.py:620] 	 flags_obj.train_epochs = 70
+I0319 12:55:27.156867 140547769902912 common.py:620] 	 flags_obj.epochs_between_evals = 4
+I0319 12:55:27.156945 140547769902912 common.py:620] 	 flags_obj.batch_size = 2496
+I0319 12:55:27.157022 140547769902912 common.py:620] 	 flags_obj.num_gpus = 8
+I0319 12:55:27.157100 140547769902912 common.py:620] 	 flags_obj.run_eagerly = False
+I0319 12:55:27.157177 140547769902912 common.py:620] 	 flags_obj.distribution_strategy = mirrored
+I0319 12:55:27.157255 140547769902912 common.py:617] Module tf2_common.utils.flags._benchmark:
+I0319 12:55:27.157347 140547769902912 common.py:620] 	 flags_obj.benchmark_logger_type = BaseBenchmarkLogger
+I0319 12:55:27.157434 140547769902912 common.py:620] 	 flags_obj.benchmark_test_id = None
+I0319 12:55:27.157512 140547769902912 common.py:620] 	 flags_obj.log_steps = 125
+I0319 12:55:27.157588 140547769902912 common.py:620] 	 flags_obj.benchmark_log_dir = None
+I0319 12:55:27.157666 140547769902912 common.py:620] 	 flags_obj.gcp_project = None
+I0319 12:55:27.157744 140547769902912 common.py:620] 	 flags_obj.bigquery_data_set = test_benchmark
+I0319 12:55:27.157821 140547769902912 common.py:620] 	 flags_obj.bigquery_run_table = benchmark_run
+I0319 12:55:27.157899 140547769902912 common.py:620] 	 flags_obj.bigquery_run_status_table = benchmark_run_status
+I0319 12:55:27.157977 140547769902912 common.py:620] 	 flags_obj.bigquery_metric_table = benchmark_metric
+I0319 12:55:27.158053 140547769902912 common.py:617] Module tf2_common.utils.flags._distribution:
+I0319 12:55:27.158134 140547769902912 common.py:620] 	 flags_obj.worker_hosts = None
+I0319 12:55:27.158211 140547769902912 common.py:620] 	 flags_obj.task_index = -1
+I0319 12:55:27.158288 140547769902912 common.py:617] Module tf2_common.utils.flags._misc:
+I0319 12:55:27.158379 140547769902912 common.py:620] 	 flags_obj.data_format = None
+I0319 12:55:27.158457 140547769902912 common.py:617] Module tf2_common.utils.flags._performance:
+I0319 12:55:27.158539 140547769902912 common.py:620] 	 flags_obj.use_synthetic_data = False
+I0319 12:55:27.158615 140547769902912 common.py:620] 	 flags_obj.dtype = fp16
+I0319 12:55:27.158691 140547769902912 common.py:620] 	 flags_obj.loss_scale = None
+I0319 12:55:27.158768 140547769902912 common.py:620] 	 flags_obj.fp16_implementation = keras
+I0319 12:55:27.158844 140547769902912 common.py:620] 	 flags_obj.all_reduce_alg = nccl
+I0319 12:55:27.158921 140547769902912 common.py:620] 	 flags_obj.num_packs = 1
+I0319 12:55:27.158999 140547769902912 common.py:620] 	 flags_obj.tf_gpu_thread_mode = gpu_private
+I0319 12:55:27.159075 140547769902912 common.py:620] 	 flags_obj.per_gpu_thread_count = 0
+I0319 12:55:27.159153 140547769902912 common.py:620] 	 flags_obj.datasets_num_private_threads = 32
+I0319 12:55:27.159230 140547769902912 common.py:620] 	 flags_obj.training_dataset_cache = True
+I0319 12:55:27.159306 140547769902912 common.py:620] 	 flags_obj.training_prefetch_batchs = 128
+I0319 12:55:27.159394 140547769902912 common.py:620] 	 flags_obj.eval_dataset_cache = True
+I0319 12:55:27.159471 140547769902912 common.py:620] 	 flags_obj.eval_prefetch_batchs = 192
+I0319 12:55:27.159548 140547769902912 common.py:620] 	 flags_obj.tf_data_experimental_slack = False
+I0319 12:55:27.159631 140547769902912 common.py:620] 	 flags_obj.enable_xla = False
+I0319 12:55:27.159710 140547769902912 common.py:620] 	 flags_obj.force_v2_in_keras_compile = None
+WARNING:tensorflow:Mixed precision compatibility check (mixed_float16): WARNING
+Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
+  Z100L, no compute capability (probably not an Nvidia GPU) (x8)
+See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
+If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
+W0319 12:55:27.160811 140547769902912 device_compatibility_check.py:107] Mixed precision compatibility check (mixed_float16): WARNING
+Your GPUs may run slowly with dtype policy mixed_float16 because they do not have compute capability of at least 7.0. Your GPUs:
+  Z100L, no compute capability (probably not an Nvidia GPU) (x8)
+See https://developer.nvidia.com/cuda-gpus for a list of GPUs and their compute capabilities.
+If you will use compatible GPU(s) not attached to this host, e.g. by running a multi-worker model, you can ignore this warning. This message will only be logged once
+I0319 12:55:27.161139 140547769902912 keras_utils.py:243] Logical CPU cores: 128
+I0319 12:55:27.161378 140547769902912 keras_utils.py:249] TF_GPU_THREAD_COUNT: 2
+I0319 12:55:27.161468 140547769902912 keras_utils.py:251] TF_GPU_THREAD_MODE: gpu_private
+I0319 12:55:27.161551 140547769902912 keras_utils.py:261] Recommended datasets_num_private_threads: 64
+2023-03-19 12:55:27.162998: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
+To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
+2023-03-19 12:55:27.181835: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:27.181964: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 32252 MB memory:  -> device: 0, name: Z100L, pci bus id: 0000:07:00.0
+2023-03-19 12:55:27.582374: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:27.582493: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 32252 MB memory:  -> device: 1, name: Z100L, pci bus id: 0000:0a:00.0
+2023-03-19 12:55:27.961772: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:27.961893: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 32252 MB memory:  -> device: 2, name: Z100L, pci bus id: 0000:15:00.0
+2023-03-19 12:55:28.339247: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:28.339376: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 32252 MB memory:  -> device: 3, name: Z100L, pci bus id: 0000:0f:00.0
+2023-03-19 12:55:28.719486: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:28.719627: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:4 with 32252 MB memory:  -> device: 4, name: Z100L, pci bus id: 0000:85:00.0
+2023-03-19 12:55:29.097492: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:29.097606: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:5 with 32252 MB memory:  -> device: 5, name: Z100L, pci bus id: 0000:7f:00.0
+2023-03-19 12:55:29.475299: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:29.475428: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:6 with 32252 MB memory:  -> device: 6, name: Z100L, pci bus id: 0000:77:00.0
+2023-03-19 12:55:29.855076: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
+2023-03-19 12:55:29.855191: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:7 with 32252 MB memory:  -> device: 7, name: Z100L, pci bus id: 0000:7a:00.0
+INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
+I0319 12:55:30.261204 140547769902912 mirrored_strategy.py:376] Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3', '/job:localhost/replica:0/task:0/device:GPU:4', '/job:localhost/replica:0/task:0/device:GPU:5', '/job:localhost/replica:0/task:0/device:GPU:6', '/job:localhost/replica:0/task:0/device:GPU:7')
+num_index -1
+enter the tf.float16 set policy
+Compute dtype: float16
+Variable dtype: float32
+:::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:30.263783 140547769902912 mlp_log.py:80] :::MLL 1679230530.264 global_batch_size: {"value": 2496, "metadata": {"lineno": 190, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:30.264862 140547769902912 mlp_log.py:80] :::MLL 1679230530.265 train_samples: {"value": 1281167, "metadata": {"lineno": 191, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:30.265909 140547769902912 mlp_log.py:80] :::MLL 1679230530.266 eval_samples: {"value": 50000, "metadata": {"lineno": 193, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:30.266957 140547769902912 mlp_log.py:80] :::MLL 1679230530.267 model_bn_span: {"value": 312, "metadata": {"lineno": 195, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 12:55:30.267157 140547769902912 resnet_ctl_imagenet_main.py:204] Training 71 epochs, each epoch has 513 steps, total steps: 36423; Eval 21 steps
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.377633 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.390385 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.400095 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.402572 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.414422 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.426609 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.486386 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.488949 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.497610 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+I0319 12:55:30.500023 140547769902912 cross_device_ops.py:619] Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
+:::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.377869 140547769902912 mlp_log.py:80] :::MLL 1679230535.378 opt_name: {"value": "lars", "metadata": {"lineno": 101, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.378870 140547769902912 mlp_log.py:80] :::MLL 1679230535.379 lars_epsilon: {"value": 0.0, "metadata": {"lineno": 103, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.379752 140547769902912 mlp_log.py:80] :::MLL 1679230535.380 lars_opt_weight_decay: {"value": 0.0002, "metadata": {"lineno": 104, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.380624 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_base_learning_rate: {"value": 4.9, "metadata": {"lineno": 106, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.381502 140547769902912 mlp_log.py:80] :::MLL 1679230535.381 lars_opt_learning_rate_warmup_epochs: {"value": 5.0, "metadata": {"lineno": 108, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.382365 140547769902912 mlp_log.py:80] :::MLL 1679230535.382 lars_opt_end_learning_rate: {"value": 0.0001, "metadata": {"lineno": 110, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.383680 140547769902912 mlp_log.py:80] :::MLL 1679230535.384 lars_opt_learning_rate_decay_steps: {"value": 33346, "metadata": {"lineno": 115, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.384541 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_learning_rate_decay_poly_power: {"value": 2.0, "metadata": {"lineno": 117, "file": "/root/resnet50/lars_util.py"}}
+:::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.385398 140547769902912 mlp_log.py:80] :::MLL 1679230535.385 lars_opt_momentum: {"value": 0.9, "metadata": {"lineno": 119, "file": "/root/resnet50/lars_util.py"}}
+I0319 12:55:35.494630 140547769902912 resnet_ctl_imagenet_main.py:238] Warmup for 1 steps.
+I0319 12:55:35.496956 140547769902912 controller.py:340] Warmup at step 0 of 1
+I0319 12:55:35.497112 140547769902912 controller.py:345] Entering warmup loop with 1 steps, at step 0 of 1
+WARNING:tensorflow:From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
+Instructions for updating:
+rename to distribute_datasets_from_function
+W0319 12:55:35.497444 140547769902912 deprecation.py:341] From /root/resnet50/tf2_common/training/utils.py:139: StrategyBase.experimental_distribute_datasets_from_function (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
+Instructions for updating:
+rename to distribute_datasets_from_function
+I0319 12:55:35.897564 140547769902912 resnet_runnable.py:484] Entering the warmup loop.
+WARNING:tensorflow:From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
+Instructions for updating:
+experimental_compile is deprecated, use jit_compile instead
+W0319 12:55:37.124004 140547769902912 deprecation.py:545] From /usr/local/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py:464: calling function (from tensorflow.python.eager.def_function) with experimental_compile is deprecated and will be removed in a future version.
+Instructions for updating:
+experimental_compile is deprecated, use jit_compile instead
+INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
+I0319 12:55:55.412617 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
+INFO:tensorflow:batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
+I0319 12:56:48.352646 140547769902912 cross_device_ops.py:900] batch_all_reduce: 161 all-reduces with algorithm = nccl, num_packs = 1
+I0319 13:00:32.592645 140547769902912 resnet_runnable.py:497] Exiting the warmup loop.
+I0319 13:00:32.595108 140547769902912 controller.py:220] step: 1        steps_per_second: 0.00
+enter fp16 computing
+step: 1        steps_per_second: 0.00
+:::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 13:00:32.596201 140547769902912 mlp_log.py:80] :::MLL 1679230832.596 init_stop: {"value": null, "metadata": {"lineno": 258, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 13:00:32.596997 140547769902912 mlp_log.py:80] :::MLL 1679230832.597 run_start: {"value": null, "metadata": {"lineno": 267, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 13:00:32.597745 140547769902912 mlp_log.py:80] :::MLL 1679230832.598 block_start: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 268, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 13:00:32.599620 140547769902912 controller.py:247] Train at step 0 of 36423
+I0319 13:00:32.599745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 0 of 36423
+I0319 13:00:32.612586 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
+W0319 13:00:32.634842 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
+I0319 13:00:32.636068 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
+I0319 13:00:32.637336 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
+I0319 13:00:32.637444 140547769902912 imagenet_preprocessing.py:119] One hot: True
+I0319 13:08:32.765698 140547769902912 keras_utils.py:120] TimeHistory: 2676.05 examples/second between steps 0 and 513
+I0319 13:08:32.769956 140547769902912 controller.py:220] step: 513        steps_per_second: 1.07        {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
+I0319 13:08:32.770123 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 513 of 36423
+I0319 13:16:30.476807 140547769902912 keras_utils.py:120] TimeHistory: 2680.53 examples/second between steps 513 and 1026
+I0319 13:16:30.481098 140547769902912 controller.py:220] step: 1026        steps_per_second: 1.07        {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
+I0319 13:16:30.481256 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1026 of 36423
+I0319 13:24:28.062501 140547769902912 keras_utils.py:120] TimeHistory: 2681.24 examples/second between steps 1026 and 1539
+I0319 13:24:28.066748 140547769902912 controller.py:220] step: 1539        steps_per_second: 1.07        {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
+I0319 13:24:28.066913 140547769902912 controller.py:185] Start evaluation at step: 1539
+I0319 13:24:28.070569 140547769902912 imagenet_preprocessing.py:338] Sharding the dataset: input_pipeline_id=0 num_input_pipelines=1
+W0319 13:24:28.088642 140547769902912 options.py:503] options.experimental_threading is deprecated. Use options.threading instead.
+I0319 13:24:28.089705 140547769902912 imagenet_preprocessing.py:104] datasets_num_private_threads: 32
+I0319 13:24:28.089835 140547769902912 imagenet_preprocessing.py:118] Num classes: 1000
+I0319 13:24:28.089923 140547769902912 imagenet_preprocessing.py:119] One hot: True
+step: 513        steps_per_second: 1.07        {'train_loss': 101.53466, 'train_accuracy': 0.025109181}
+step: 1026        steps_per_second: 1.07        {'train_loss': 80.75745, 'train_accuracy': 0.13648738}
+step: 1539        steps_per_second: 1.07        {'train_loss': 68.72967, 'train_accuracy': 0.25144324}
+:::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:24:28.927603 140547769902912 mlp_log.py:80] :::MLL 1679232268.928 eval_start: {"value": null, "metadata": {"epoch_num": 3, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:25:01.308466 140547769902912 mlp_log.py:80] :::MLL 1679232301.308 eval_stop: {"value": null, "metadata": {"epoch_num": 3, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:25:01.317326 140547769902912 mlp_log.py:80] :::MLL 1679232301.317 eval_accuracy: {"value": 0.255840003490448, "metadata": {"epoch_num": 3, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:25:01.318364 140547769902912 mlp_log.py:80] :::MLL 1679232301.318 block_stop: {"value": null, "metadata": {"first_epoch_num": 1, "epoch_count": 3, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:25:01.319331 140547769902912 mlp_log.py:80] :::MLL 1679232301.319 block_start: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:25:01.329561 140547769902912 controller.py:220] step: 1539        evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
+I0319 13:25:01.329745 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 1539 of 36423
+I0319 13:32:58.584241 140547769902912 keras_utils.py:120] TimeHistory: 2683.07 examples/second between steps 1539 and 2052
+I0319 13:32:58.588519 140547769902912 controller.py:220] step: 2052        steps_per_second: 1.00        {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
+I0319 13:32:58.588680 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2052 of 36423
+I0319 13:40:56.833560 140547769902912 keras_utils.py:120] TimeHistory: 2677.52 examples/second between steps 2052 and 2565
+I0319 13:40:56.837803 140547769902912 controller.py:220] step: 2565        steps_per_second: 1.07        {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
+I0319 13:40:56.837963 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 2565 of 36423
+I0319 13:48:55.233101 140547769902912 keras_utils.py:120] TimeHistory: 2676.68 examples/second between steps 2565 and 3078
+I0319 13:48:55.237374 140547769902912 controller.py:220] step: 3078        steps_per_second: 1.07        {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
+I0319 13:48:55.237531 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3078 of 36423
+I0319 13:56:53.574455 140547769902912 keras_utils.py:120] TimeHistory: 2677.00 examples/second between steps 3078 and 3591
+I0319 13:56:53.578727 140547769902912 controller.py:220] step: 3591        steps_per_second: 1.07        {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
+I0319 13:56:53.578876 140547769902912 controller.py:185] Start evaluation at step: 3591
+step: 1539        evaluation metric: {'test_loss': 0.49958566, 'test_accuracy': 0.25584, 'continue_training': True}
+step: 2052        steps_per_second: 1.00        {'train_loss': 61.880257, 'train_accuracy': 0.3291465}
+step: 2565        steps_per_second: 1.07        {'train_loss': 58.142868, 'train_accuracy': 0.3752999}
+step: 3078        steps_per_second: 1.07        {'train_loss': 55.290226, 'train_accuracy': 0.41178867}
+step: 3591        steps_per_second: 1.07        {'train_loss': 52.677834, 'train_accuracy': 0.4466218}
+:::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:56:54.080654 140547769902912 mlp_log.py:80] :::MLL 1679234214.081 eval_start: {"value": null, "metadata": {"epoch_num": 7, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:57:05.254401 140547769902912 mlp_log.py:80] :::MLL 1679234225.254 eval_stop: {"value": null, "metadata": {"epoch_num": 7, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:57:05.261220 140547769902912 mlp_log.py:80] :::MLL 1679234225.261 eval_accuracy: {"value": 0.4514999985694885, "metadata": {"epoch_num": 7, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:57:05.262227 140547769902912 mlp_log.py:80] :::MLL 1679234225.262 block_stop: {"value": null, "metadata": {"first_epoch_num": 4, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:57:05.263200 140547769902912 mlp_log.py:80] :::MLL 1679234225.263 block_start: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 13:57:05.272903 140547769902912 controller.py:220] step: 3591        evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
+I0319 13:57:05.273066 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 3591 of 36423
+I0319 14:05:03.201216 140547769902912 keras_utils.py:120] TimeHistory: 2679.28 examples/second between steps 3591 and 4104
+I0319 14:05:03.205459 140547769902912 controller.py:220] step: 4104        steps_per_second: 1.05        {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
+I0319 14:05:03.205613 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4104 of 36423
+I0319 14:13:01.703775 140547769902912 keras_utils.py:120] TimeHistory: 2676.10 examples/second between steps 4104 and 4617
+I0319 14:13:01.707995 140547769902912 controller.py:220] step: 4617        steps_per_second: 1.07        {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
+I0319 14:13:01.708152 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 4617 of 36423
+I0319 14:20:58.757003 140547769902912 keras_utils.py:120] TimeHistory: 2684.23 examples/second between steps 4617 and 5130
+I0319 14:20:58.761198 140547769902912 controller.py:220] step: 5130        steps_per_second: 1.08        {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
+I0319 14:20:58.761370 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5130 of 36423
+I0319 14:28:56.838135 140547769902912 keras_utils.py:120] TimeHistory: 2678.46 examples/second between steps 5130 and 5643
+I0319 14:28:56.842247 140547769902912 controller.py:220] step: 5643        steps_per_second: 1.07        {'train_loss': 47.524445, 'train_accuracy': 0.517012}
+I0319 14:28:56.842405 140547769902912 controller.py:185] Start evaluation at step: 5643
+step: 3591        evaluation metric: {'test_loss': 0.38534293, 'test_accuracy': 0.4515, 'continue_training': True}
+step: 4104        steps_per_second: 1.05        {'train_loss': 50.85758, 'train_accuracy': 0.47165993}
+step: 4617        steps_per_second: 1.07        {'train_loss': 49.526817, 'train_accuracy': 0.48903587}
+step: 5130        steps_per_second: 1.08        {'train_loss': 48.474247, 'train_accuracy': 0.5037846}
+step: 5643        steps_per_second: 1.07        {'train_loss': 47.524445, 'train_accuracy': 0.517012}
+:::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 14:28:57.346966 140547769902912 mlp_log.py:80] :::MLL 1679236137.347 eval_start: {"value": null, "metadata": {"epoch_num": 11, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 14:29:08.307533 140547769902912 mlp_log.py:80] :::MLL 1679236148.307 eval_stop: {"value": null, "metadata": {"epoch_num": 11, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 14:29:08.314471 140547769902912 mlp_log.py:80] :::MLL 1679236148.314 eval_accuracy: {"value": 0.5169399976730347, "metadata": {"epoch_num": 11, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 14:29:08.315475 140547769902912 mlp_log.py:80] :::MLL 1679236148.315 block_stop: {"value": null, "metadata": {"first_epoch_num": 8, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 14:29:08.316439 140547769902912 mlp_log.py:80] :::MLL 1679236148.316 block_start: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 14:29:08.326488 140547769902912 controller.py:220] step: 5643        evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
+I0319 14:29:08.326648 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 5643 of 36423
+I0319 14:37:05.725753 140547769902912 keras_utils.py:120] TimeHistory: 2682.26 examples/second between steps 5643 and 6156
+I0319 14:37:05.729918 140547769902912 controller.py:220] step: 6156        steps_per_second: 1.05        {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
+I0319 14:37:05.730074 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6156 of 36423
+I0319 14:45:03.411590 140547769902912 keras_utils.py:120] TimeHistory: 2680.68 examples/second between steps 6156 and 6669
+I0319 14:45:03.415779 140547769902912 controller.py:220] step: 6669        steps_per_second: 1.07        {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
+I0319 14:45:03.415935 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 6669 of 36423
+I0319 14:53:02.156559 140547769902912 keras_utils.py:120] TimeHistory: 2674.74 examples/second between steps 6669 and 7182
+I0319 14:53:02.160710 140547769902912 controller.py:220] step: 7182        steps_per_second: 1.07        {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
+I0319 14:53:02.160865 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7182 of 36423
+I0319 15:01:00.511001 140547769902912 keras_utils.py:120] TimeHistory: 2676.93 examples/second between steps 7182 and 7695
+I0319 15:01:00.515219 140547769902912 controller.py:220] step: 7695        steps_per_second: 1.07        {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
+I0319 15:01:00.517019 140547769902912 controller.py:185] Start evaluation at step: 7695
+step: 5643        evaluation metric: {'test_loss': 0.34546962, 'test_accuracy': 0.51694, 'continue_training': True}
+step: 6156        steps_per_second: 1.05        {'train_loss': 46.72335, 'train_accuracy': 0.5281839}
+step: 6669        steps_per_second: 1.07        {'train_loss': 46.06021, 'train_accuracy': 0.53761417}
+step: 7182        steps_per_second: 1.07        {'train_loss': 45.366295, 'train_accuracy': 0.54672974}
+step: 7695        steps_per_second: 1.07        {'train_loss': 44.782856, 'train_accuracy': 0.5550253}
+:::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:01:01.002238 140547769902912 mlp_log.py:80] :::MLL 1679238061.002 eval_start: {"value": null, "metadata": {"epoch_num": 15, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:01:11.832513 140547769902912 mlp_log.py:80] :::MLL 1679238071.832 eval_stop: {"value": null, "metadata": {"epoch_num": 15, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:01:11.839387 140547769902912 mlp_log.py:80] :::MLL 1679238071.839 eval_accuracy: {"value": 0.5540599822998047, "metadata": {"epoch_num": 15, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:01:11.840405 140547769902912 mlp_log.py:80] :::MLL 1679238071.840 block_stop: {"value": null, "metadata": {"first_epoch_num": 12, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:01:11.841379 140547769902912 mlp_log.py:80] :::MLL 1679238071.841 block_start: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:01:11.851153 140547769902912 controller.py:220] step: 7695        evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
+I0319 15:01:11.851322 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 7695 of 36423
+I0319 15:09:09.903125 140547769902912 keras_utils.py:120] TimeHistory: 2678.59 examples/second between steps 7695 and 8208
+I0319 15:09:09.907292 140547769902912 controller.py:220] step: 8208        steps_per_second: 1.05        {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
+I0319 15:09:09.907462 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8208 of 36423
+I0319 15:17:08.328512 140547769902912 keras_utils.py:120] TimeHistory: 2676.53 examples/second between steps 8208 and 8721
+I0319 15:17:08.332779 140547769902912 controller.py:220] step: 8721        steps_per_second: 1.07        {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
+I0319 15:17:08.332940 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 8721 of 36423
+I0319 15:25:06.558547 140547769902912 keras_utils.py:120] TimeHistory: 2677.62 examples/second between steps 8721 and 9234
+I0319 15:25:06.562764 140547769902912 controller.py:220] step: 9234        steps_per_second: 1.07        {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
+I0319 15:25:06.562925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9234 of 36423
+I0319 15:33:04.438484 140547769902912 keras_utils.py:120] TimeHistory: 2679.59 examples/second between steps 9234 and 9747
+I0319 15:33:04.442654 140547769902912 controller.py:220] step: 9747        steps_per_second: 1.07        {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
+I0319 15:33:04.442804 140547769902912 controller.py:185] Start evaluation at step: 9747
+step: 7695        evaluation metric: {'test_loss': 0.3284506, 'test_accuracy': 0.55406, 'continue_training': True}
+step: 8208        steps_per_second: 1.05        {'train_loss': 44.193314, 'train_accuracy': 0.56363946}
+step: 8721        steps_per_second: 1.07        {'train_loss': 43.65782, 'train_accuracy': 0.5716288}
+step: 9234        steps_per_second: 1.07        {'train_loss': 43.085396, 'train_accuracy': 0.5789591}
+step: 9747        steps_per_second: 1.07        {'train_loss': 42.59366, 'train_accuracy': 0.58631825}
+:::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:33:04.930735 140547769902912 mlp_log.py:80] :::MLL 1679239984.931 eval_start: {"value": null, "metadata": {"epoch_num": 19, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:33:16.094051 140547769902912 mlp_log.py:80] :::MLL 1679239996.094 eval_stop: {"value": null, "metadata": {"epoch_num": 19, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:33:16.100932 140547769902912 mlp_log.py:80] :::MLL 1679239996.101 eval_accuracy: {"value": 0.6158000230789185, "metadata": {"epoch_num": 19, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:33:16.101949 140547769902912 mlp_log.py:80] :::MLL 1679239996.102 block_stop: {"value": null, "metadata": {"first_epoch_num": 16, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:33:16.102918 140547769902912 mlp_log.py:80] :::MLL 1679239996.103 block_start: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 15:33:16.112729 140547769902912 controller.py:220] step: 9747        evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
+I0319 15:33:16.112884 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 9747 of 36423
+I0319 15:41:14.392338 140547769902912 keras_utils.py:120] TimeHistory: 2677.32 examples/second between steps 9747 and 10260
+I0319 15:41:14.396505 140547769902912 controller.py:220] step: 10260        steps_per_second: 1.05        {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
+I0319 15:41:14.396659 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10260 of 36423
+I0319 15:49:11.961558 140547769902912 keras_utils.py:120] TimeHistory: 2681.33 examples/second between steps 10260 and 10773
+I0319 15:49:11.965767 140547769902912 controller.py:220] step: 10773        steps_per_second: 1.07        {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
+I0319 15:49:11.965925 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 10773 of 36423
+I0319 15:57:09.164847 140547769902912 keras_utils.py:120] TimeHistory: 2683.39 examples/second between steps 10773 and 11286
+I0319 15:57:09.168977 140547769902912 controller.py:220] step: 11286        steps_per_second: 1.08        {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
+I0319 15:57:09.169133 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11286 of 36423
+I0319 16:05:06.888276 140547769902912 keras_utils.py:120] TimeHistory: 2680.46 examples/second between steps 11286 and 11799
+I0319 16:05:06.892483 140547769902912 controller.py:220] step: 11799        steps_per_second: 1.07        {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
+I0319 16:05:06.892634 140547769902912 controller.py:185] Start evaluation at step: 11799
+step: 9747        evaluation metric: {'test_loss': 0.29739872, 'test_accuracy': 0.6158, 'continue_training': True}
+step: 10260        steps_per_second: 1.05        {'train_loss': 42.066074, 'train_accuracy': 0.59374607}
+step: 10773        steps_per_second: 1.07        {'train_loss': 41.57502, 'train_accuracy': 0.60048044}
+step: 11286        steps_per_second: 1.08        {'train_loss': 41.104015, 'train_accuracy': 0.6074765}
+step: 11799        steps_per_second: 1.07        {'train_loss': 40.675106, 'train_accuracy': 0.6140507}
+:::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:05:07.376655 140547769902912 mlp_log.py:80] :::MLL 1679241907.377 eval_start: {"value": null, "metadata": {"epoch_num": 23, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:05:18.161060 140547769902912 mlp_log.py:80] :::MLL 1679241918.161 eval_stop: {"value": null, "metadata": {"epoch_num": 23, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:05:18.167979 140547769902912 mlp_log.py:80] :::MLL 1679241918.168 eval_accuracy: {"value": 0.6306399703025818, "metadata": {"epoch_num": 23, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:05:18.168991 140547769902912 mlp_log.py:80] :::MLL 1679241918.169 block_stop: {"value": null, "metadata": {"first_epoch_num": 20, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:05:18.169961 140547769902912 mlp_log.py:80] :::MLL 1679241918.170 block_start: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:05:18.179913 140547769902912 controller.py:220] step: 11799        evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
+I0319 16:05:18.180072 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 11799 of 36423
+I0319 16:13:15.017472 140547769902912 keras_utils.py:120] TimeHistory: 2685.42 examples/second between steps 11799 and 12312
+I0319 16:13:15.021653 140547769902912 controller.py:220] step: 12312        steps_per_second: 1.05        {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
+I0319 16:13:15.021814 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12312 of 36423
+I0319 16:21:11.873815 140547769902912 keras_utils.py:120] TimeHistory: 2685.34 examples/second between steps 12312 and 12825
+I0319 16:21:11.877966 140547769902912 controller.py:220] step: 12825        steps_per_second: 1.08        {'train_loss': 39.75526, 'train_accuracy': 0.627093}
+I0319 16:21:11.878120 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 12825 of 36423
+I0319 16:29:08.757629 140547769902912 keras_utils.py:120] TimeHistory: 2685.19 examples/second between steps 12825 and 13338
+I0319 16:29:08.761925 140547769902912 controller.py:220] step: 13338        steps_per_second: 1.08        {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
+I0319 16:29:08.762086 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13338 of 36423
+I0319 16:37:05.957099 140547769902912 keras_utils.py:120] TimeHistory: 2683.41 examples/second between steps 13338 and 13851
+I0319 16:37:05.961228 140547769902912 controller.py:220] step: 13851        steps_per_second: 1.08        {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
+I0319 16:37:05.961388 140547769902912 controller.py:185] Start evaluation at step: 13851
+step: 11799        evaluation metric: {'test_loss': 0.29088515, 'test_accuracy': 0.63064, 'continue_training': True}
+step: 12312        steps_per_second: 1.05        {'train_loss': 40.224228, 'train_accuracy': 0.6205242}
+step: 12825        steps_per_second: 1.08        {'train_loss': 39.75526, 'train_accuracy': 0.627093}
+step: 13338        steps_per_second: 1.08        {'train_loss': 39.357185, 'train_accuracy': 0.6333713}
+step: 13851        steps_per_second: 1.08        {'train_loss': 38.924423, 'train_accuracy': 0.6391099}
+:::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:37:06.441277 140547769902912 mlp_log.py:80] :::MLL 1679243826.441 eval_start: {"value": null, "metadata": {"epoch_num": 27, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:37:17.448269 140547769902912 mlp_log.py:80] :::MLL 1679243837.448 eval_stop: {"value": null, "metadata": {"epoch_num": 27, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:37:17.455250 140547769902912 mlp_log.py:80] :::MLL 1679243837.455 eval_accuracy: {"value": 0.6637200117111206, "metadata": {"epoch_num": 27, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:37:17.456276 140547769902912 mlp_log.py:80] :::MLL 1679243837.456 block_stop: {"value": null, "metadata": {"first_epoch_num": 24, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:37:17.457254 140547769902912 mlp_log.py:80] :::MLL 1679243837.457 block_start: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 16:37:17.467283 140547769902912 controller.py:220] step: 13851        evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
+I0319 16:37:17.467454 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 13851 of 36423
+I0319 16:45:14.272286 140547769902912 keras_utils.py:120] TimeHistory: 2685.60 examples/second between steps 13851 and 14364
+I0319 16:45:14.276514 140547769902912 controller.py:220] step: 14364        steps_per_second: 1.05        {'train_loss': 38.50588, 'train_accuracy': 0.645977}
+I0319 16:45:14.276674 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14364 of 36423
+I0319 16:53:12.242927 140547769902912 keras_utils.py:120] TimeHistory: 2679.08 examples/second between steps 14364 and 14877
+I0319 16:53:12.247173 140547769902912 controller.py:220] step: 14877        steps_per_second: 1.07        {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
+I0319 16:53:12.247342 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 14877 of 36423
+I0319 17:01:08.925324 140547769902912 keras_utils.py:120] TimeHistory: 2686.32 examples/second between steps 14877 and 15390
+I0319 17:01:08.929522 140547769902912 controller.py:220] step: 15390        steps_per_second: 1.08        {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
+I0319 17:01:08.929681 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15390 of 36423
+I0319 17:09:05.460300 140547769902912 keras_utils.py:120] TimeHistory: 2687.14 examples/second between steps 15390 and 15903
+I0319 17:09:05.464558 140547769902912 controller.py:220] step: 15903        steps_per_second: 1.08        {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
+I0319 17:09:05.464712 140547769902912 controller.py:185] Start evaluation at step: 15903
+step: 13851        evaluation metric: {'test_loss': 0.27420917, 'test_accuracy': 0.66372, 'continue_training': True}
+step: 14364        steps_per_second: 1.05        {'train_loss': 38.50588, 'train_accuracy': 0.645977}
+step: 14877        steps_per_second: 1.07        {'train_loss': 38.042336, 'train_accuracy': 0.65228266}
+step: 15390        steps_per_second: 1.08        {'train_loss': 37.64315, 'train_accuracy': 0.65857184}
+step: 15903        steps_per_second: 1.08        {'train_loss': 37.25062, 'train_accuracy': 0.66452134}
+:::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:09:05.958450 140547769902912 mlp_log.py:80] :::MLL 1679245745.958 eval_start: {"value": null, "metadata": {"epoch_num": 31, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:09:16.709334 140547769902912 mlp_log.py:80] :::MLL 1679245756.709 eval_stop: {"value": null, "metadata": {"epoch_num": 31, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:09:16.716322 140547769902912 mlp_log.py:80] :::MLL 1679245756.716 eval_accuracy: {"value": 0.663860023021698, "metadata": {"epoch_num": 31, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:09:16.717343 140547769902912 mlp_log.py:80] :::MLL 1679245756.717 block_stop: {"value": null, "metadata": {"first_epoch_num": 28, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:09:16.718302 140547769902912 mlp_log.py:80] :::MLL 1679245756.718 block_start: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:09:16.728244 140547769902912 controller.py:220] step: 15903        evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
+I0319 17:09:16.728415 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 15903 of 36423
+I0319 17:17:12.753624 140547769902912 keras_utils.py:120] TimeHistory: 2690.00 examples/second between steps 15903 and 16416
+I0319 17:17:12.757766 140547769902912 controller.py:220] step: 16416        steps_per_second: 1.05        {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
+I0319 17:17:12.757923 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16416 of 36423
+I0319 17:25:08.728839 140547769902912 keras_utils.py:120] TimeHistory: 2690.31 examples/second between steps 16416 and 16929
+I0319 17:25:08.733042 140547769902912 controller.py:220] step: 16929        steps_per_second: 1.08        {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
+I0319 17:25:08.733199 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 16929 of 36423
+I0319 17:33:05.759370 140547769902912 keras_utils.py:120] TimeHistory: 2684.36 examples/second between steps 16929 and 17442
+I0319 17:33:05.763500 140547769902912 controller.py:220] step: 17442        steps_per_second: 1.08        {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
+I0319 17:33:05.763653 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17442 of 36423
+I0319 17:41:02.449225 140547769902912 keras_utils.py:120] TimeHistory: 2686.27 examples/second between steps 17442 and 17955
+I0319 17:41:02.453442 140547769902912 controller.py:220] step: 17955        steps_per_second: 1.08        {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
+I0319 17:41:02.453614 140547769902912 controller.py:185] Start evaluation at step: 17955
+step: 15903        evaluation metric: {'test_loss': 0.2736155, 'test_accuracy': 0.66386, 'continue_training': True}
+step: 16416        steps_per_second: 1.05        {'train_loss': 36.87168, 'train_accuracy': 0.6701756}
+step: 16929        steps_per_second: 1.08        {'train_loss': 36.469055, 'train_accuracy': 0.67674124}
+step: 17442        steps_per_second: 1.08        {'train_loss': 36.071156, 'train_accuracy': 0.6823971}
+step: 17955        steps_per_second: 1.08        {'train_loss': 35.67699, 'train_accuracy': 0.68873394}
+:::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:41:02.938170 140547769902912 mlp_log.py:80] :::MLL 1679247662.938 eval_start: {"value": null, "metadata": {"epoch_num": 35, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:41:13.817046 140547769902912 mlp_log.py:80] :::MLL 1679247673.817 eval_stop: {"value": null, "metadata": {"epoch_num": 35, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:41:13.824302 140547769902912 mlp_log.py:80] :::MLL 1679247673.824 eval_accuracy: {"value": 0.6931399703025818, "metadata": {"epoch_num": 35, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:41:13.825345 140547769902912 mlp_log.py:80] :::MLL 1679247673.825 block_stop: {"value": null, "metadata": {"first_epoch_num": 32, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:41:13.826306 140547769902912 mlp_log.py:80] :::MLL 1679247673.826 block_start: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 17:41:13.836492 140547769902912 controller.py:220] step: 17955        evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
+I0319 17:41:13.836662 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 17955 of 36423
+I0319 17:49:10.447629 140547769902912 keras_utils.py:120] TimeHistory: 2686.69 examples/second between steps 17955 and 18468
+I0319 17:49:10.451847 140547769902912 controller.py:220] step: 18468        steps_per_second: 1.05        {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
+I0319 17:49:10.452003 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18468 of 36423
+I0319 17:57:07.115317 140547769902912 keras_utils.py:120] TimeHistory: 2686.40 examples/second between steps 18468 and 18981
+I0319 17:57:07.119469 140547769902912 controller.py:220] step: 18981        steps_per_second: 1.08        {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
+I0319 17:57:07.119627 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 18981 of 36423
+I0319 18:05:03.520790 140547769902912 keras_utils.py:120] TimeHistory: 2687.88 examples/second between steps 18981 and 19494
+I0319 18:05:03.524950 140547769902912 controller.py:220] step: 19494        steps_per_second: 1.08        {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
+I0319 18:05:03.525108 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 19494 of 36423
+I0319 18:13:00.146009 140547769902912 keras_utils.py:120] TimeHistory: 2686.66 examples/second between steps 19494 and 20007
+I0319 18:13:00.150213 140547769902912 controller.py:220] step: 20007        steps_per_second: 1.08        {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
+I0319 18:13:00.150395 140547769902912 controller.py:185] Start evaluation at step: 20007
+step: 17955        evaluation metric: {'test_loss': 0.25965777, 'test_accuracy': 0.69314, 'continue_training': True}
+step: 18468        steps_per_second: 1.05        {'train_loss': 35.29787, 'train_accuracy': 0.6943679}
+step: 18981        steps_per_second: 1.08        {'train_loss': 34.908035, 'train_accuracy': 0.70074695}
+step: 19494        steps_per_second: 1.08        {'train_loss': 34.57146, 'train_accuracy': 0.7056616}
+step: 20007        steps_per_second: 1.08        {'train_loss': 34.178963, 'train_accuracy': 0.71188754}
+:::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:13:00.638623 140547769902912 mlp_log.py:80] :::MLL 1679249580.639 eval_start: {"value": null, "metadata": {"epoch_num": 39, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:13:11.473854 140547769902912 mlp_log.py:80] :::MLL 1679249591.474 eval_stop: {"value": null, "metadata": {"epoch_num": 39, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:13:11.482290 140547769902912 mlp_log.py:80] :::MLL 1679249591.482 eval_accuracy: {"value": 0.7071400284767151, "metadata": {"epoch_num": 39, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:13:11.483335 140547769902912 mlp_log.py:80] :::MLL 1679249591.483 block_stop: {"value": null, "metadata": {"first_epoch_num": 36, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:13:11.484290 140547769902912 mlp_log.py:80] :::MLL 1679249591.484 block_start: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:13:11.494453 140547769902912 controller.py:220] step: 20007        evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
+I0319 18:13:11.494655 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20007 of 36423
+I0319 18:21:07.807034 140547769902912 keras_utils.py:120] TimeHistory: 2688.38 examples/second between steps 20007 and 20520
+I0319 18:21:07.811231 140547769902912 controller.py:220] step: 20520        steps_per_second: 1.05        {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
+I0319 18:21:07.811421 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 20520 of 36423
+I0319 18:29:04.059954 140547769902912 keras_utils.py:120] TimeHistory: 2688.74 examples/second between steps 20520 and 21033
+I0319 18:29:04.064098 140547769902912 controller.py:220] step: 21033        steps_per_second: 1.08        {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
+I0319 18:29:04.064261 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21033 of 36423
+I0319 18:37:00.501375 140547769902912 keras_utils.py:120] TimeHistory: 2687.68 examples/second between steps 21033 and 21546
+I0319 18:37:00.505643 140547769902912 controller.py:220] step: 21546        steps_per_second: 1.08        {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
+I0319 18:37:00.505825 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 21546 of 36423
+I0319 18:44:57.448547 140547769902912 keras_utils.py:120] TimeHistory: 2684.84 examples/second between steps 21546 and 22059
+I0319 18:44:57.453764 140547769902912 controller.py:220] step: 22059        steps_per_second: 1.08        {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
+I0319 18:44:57.453983 140547769902912 controller.py:185] Start evaluation at step: 22059
+step: 20007        evaluation metric: {'test_loss': 0.25293344, 'test_accuracy': 0.70714, 'continue_training': True}
+step: 20520        steps_per_second: 1.05        {'train_loss': 33.828243, 'train_accuracy': 0.7174528}
+step: 21033        steps_per_second: 1.08        {'train_loss': 33.491177, 'train_accuracy': 0.7229907}
+step: 21546        steps_per_second: 1.08        {'train_loss': 33.12628, 'train_accuracy': 0.72929555}
+step: 22059        steps_per_second: 1.08        {'train_loss': 32.781902, 'train_accuracy': 0.7347241}
+:::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:44:57.962047 140547769902912 mlp_log.py:80] :::MLL 1679251497.962 eval_start: {"value": null, "metadata": {"epoch_num": 43, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:45:08.834538 140547769902912 mlp_log.py:80] :::MLL 1679251508.834 eval_stop: {"value": null, "metadata": {"epoch_num": 43, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:45:08.841803 140547769902912 mlp_log.py:80] :::MLL 1679251508.842 eval_accuracy: {"value": 0.7310600280761719, "metadata": {"epoch_num": 43, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:45:08.842825 140547769902912 mlp_log.py:80] :::MLL 1679251508.843 block_stop: {"value": null, "metadata": {"first_epoch_num": 40, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:45:08.843794 140547769902912 mlp_log.py:80] :::MLL 1679251508.844 block_start: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 18:45:08.853801 140547769902912 controller.py:220] step: 22059        evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
+I0319 18:45:08.853981 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22059 of 36423
+I0319 18:53:05.613945 140547769902912 keras_utils.py:120] TimeHistory: 2685.85 examples/second between steps 22059 and 22572
+I0319 18:53:05.618196 140547769902912 controller.py:220] step: 22572        steps_per_second: 1.05        {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
+I0319 18:53:05.618384 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 22572 of 36423
+I0319 19:01:03.341272 140547769902912 keras_utils.py:120] TimeHistory: 2680.44 examples/second between steps 22572 and 23085
+I0319 19:01:03.345571 140547769902912 controller.py:220] step: 23085        steps_per_second: 1.07        {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
+I0319 19:01:03.345741 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23085 of 36423
+I0319 19:09:00.970286 140547769902912 keras_utils.py:120] TimeHistory: 2680.99 examples/second between steps 23085 and 23598
+I0319 19:09:00.974560 140547769902912 controller.py:220] step: 23598        steps_per_second: 1.07        {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
+I0319 19:09:00.974729 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 23598 of 36423
+I0319 19:16:57.904174 140547769902912 keras_utils.py:120] TimeHistory: 2684.90 examples/second between steps 23598 and 24111
+I0319 19:16:57.908411 140547769902912 controller.py:220] step: 24111        steps_per_second: 1.08        {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
+I0319 19:16:57.908590 140547769902912 controller.py:185] Start evaluation at step: 24111
+step: 22059        evaluation metric: {'test_loss': 0.24191059, 'test_accuracy': 0.73106, 'continue_training': True}
+step: 22572        steps_per_second: 1.05        {'train_loss': 32.501007, 'train_accuracy': 0.7391007}
+step: 23085        steps_per_second: 1.07        {'train_loss': 32.159527, 'train_accuracy': 0.74508685}
+step: 23598        steps_per_second: 1.07        {'train_loss': 31.819632, 'train_accuracy': 0.7504209}
+step: 24111        steps_per_second: 1.08        {'train_loss': 31.538153, 'train_accuracy': 0.75502324}
+:::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:16:58.442478 140547769902912 mlp_log.py:80] :::MLL 1679253418.442 eval_start: {"value": null, "metadata": {"epoch_num": 47, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:17:09.277749 140547769902912 mlp_log.py:80] :::MLL 1679253429.278 eval_stop: {"value": null, "metadata": {"epoch_num": 47, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:17:09.285157 140547769902912 mlp_log.py:80] :::MLL 1679253429.285 eval_accuracy: {"value": 0.7399600148200989, "metadata": {"epoch_num": 47, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:17:09.286182 140547769902912 mlp_log.py:80] :::MLL 1679253429.286 block_stop: {"value": null, "metadata": {"first_epoch_num": 44, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:17:09.287138 140547769902912 mlp_log.py:80] :::MLL 1679253429.287 block_start: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:17:09.297158 140547769902912 controller.py:220] step: 24111        evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
+I0319 19:17:09.297350 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24111 of 36423
+I0319 19:25:05.843054 140547769902912 keras_utils.py:120] TimeHistory: 2687.06 examples/second between steps 24111 and 24624
+I0319 19:25:05.847337 140547769902912 controller.py:220] step: 24624        steps_per_second: 1.05        {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
+I0319 19:25:05.847517 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 24624 of 36423
+I0319 19:33:01.919262 140547769902912 keras_utils.py:120] TimeHistory: 2689.74 examples/second between steps 24624 and 25137
+I0319 19:33:01.923496 140547769902912 controller.py:220] step: 25137        steps_per_second: 1.08        {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
+I0319 19:33:01.923671 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25137 of 36423
+I0319 19:40:58.928619 140547769902912 keras_utils.py:120] TimeHistory: 2684.48 examples/second between steps 25137 and 25650
+I0319 19:40:58.932954 140547769902912 controller.py:220] step: 25650        steps_per_second: 1.08        {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
+I0319 19:40:58.933148 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 25650 of 36423
+I0319 19:48:55.050780 140547769902912 keras_utils.py:120] TimeHistory: 2689.48 examples/second between steps 25650 and 26163
+I0319 19:48:55.054964 140547769902912 controller.py:220] step: 26163        steps_per_second: 1.08        {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
+I0319 19:48:55.055132 140547769902912 controller.py:185] Start evaluation at step: 26163
+step: 24111        evaluation metric: {'test_loss': 0.23783618, 'test_accuracy': 0.73996, 'continue_training': True}
+step: 24624        steps_per_second: 1.05        {'train_loss': 31.25745, 'train_accuracy': 0.7601792}
+step: 25137        steps_per_second: 1.08        {'train_loss': 30.94866, 'train_accuracy': 0.7650783}
+step: 25650        steps_per_second: 1.08        {'train_loss': 30.675001, 'train_accuracy': 0.76950336}
+step: 26163        steps_per_second: 1.08        {'train_loss': 30.428179, 'train_accuracy': 0.7739565}
+:::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:48:55.547407 140547769902912 mlp_log.py:80] :::MLL 1679255335.547 eval_start: {"value": null, "metadata": {"epoch_num": 51, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:49:06.552730 140547769902912 mlp_log.py:80] :::MLL 1679255346.553 eval_stop: {"value": null, "metadata": {"epoch_num": 51, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:49:06.559940 140547769902912 mlp_log.py:80] :::MLL 1679255346.560 eval_accuracy: {"value": 0.7473400235176086, "metadata": {"epoch_num": 51, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:49:06.560959 140547769902912 mlp_log.py:80] :::MLL 1679255346.561 block_stop: {"value": null, "metadata": {"first_epoch_num": 48, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:49:06.561913 140547769902912 mlp_log.py:80] :::MLL 1679255346.562 block_start: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 19:49:06.571880 140547769902912 controller.py:220] step: 26163        evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
+I0319 19:49:06.572060 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26163 of 36423
+I0319 19:57:04.827503 140547769902912 keras_utils.py:120] TimeHistory: 2677.45 examples/second between steps 26163 and 26676
+I0319 19:57:04.831851 140547769902912 controller.py:220] step: 26676        steps_per_second: 1.05        {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
+I0319 19:57:04.832029 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 26676 of 36423
+I0319 20:05:01.335356 140547769902912 keras_utils.py:120] TimeHistory: 2687.31 examples/second between steps 26676 and 27189
+I0319 20:05:01.339524 140547769902912 controller.py:220] step: 27189        steps_per_second: 1.08        {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
+I0319 20:05:01.339692 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27189 of 36423
+I0319 20:12:58.183544 140547769902912 keras_utils.py:120] TimeHistory: 2685.39 examples/second between steps 27189 and 27702
+I0319 20:12:58.187861 140547769902912 controller.py:220] step: 27702        steps_per_second: 1.08        {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
+I0319 20:12:58.188049 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 27702 of 36423
+I0319 20:20:54.531436 140547769902912 keras_utils.py:120] TimeHistory: 2688.23 examples/second between steps 27702 and 28215
+I0319 20:20:54.535721 140547769902912 controller.py:220] step: 28215        steps_per_second: 1.08        {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
+I0319 20:20:54.535894 140547769902912 controller.py:185] Start evaluation at step: 28215
+step: 26163        evaluation metric: {'test_loss': 0.23394844, 'test_accuracy': 0.74734, 'continue_training': True}
+step: 26676        steps_per_second: 1.05        {'train_loss': 30.180855, 'train_accuracy': 0.7779777}
+step: 27189        steps_per_second: 1.08        {'train_loss': 29.918842, 'train_accuracy': 0.7823356}
+step: 27702        steps_per_second: 1.08        {'train_loss': 29.700476, 'train_accuracy': 0.78678167}
+step: 28215        steps_per_second: 1.08        {'train_loss': 29.481922, 'train_accuracy': 0.79037726}
+:::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:20:55.034361 140547769902912 mlp_log.py:80] :::MLL 1679257255.034 eval_start: {"value": null, "metadata": {"epoch_num": 55, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:21:05.511004 140547769902912 mlp_log.py:80] :::MLL 1679257265.511 eval_stop: {"value": null, "metadata": {"epoch_num": 55, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:21:05.518299 140547769902912 mlp_log.py:80] :::MLL 1679257265.518 eval_accuracy: {"value": 0.7565600275993347, "metadata": {"epoch_num": 55, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:21:05.519329 140547769902912 mlp_log.py:80] :::MLL 1679257265.519 block_stop: {"value": null, "metadata": {"first_epoch_num": 52, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:21:05.520274 140547769902912 mlp_log.py:80] :::MLL 1679257265.520 block_start: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 464, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:21:05.530137 140547769902912 controller.py:220] step: 28215        evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
+I0319 20:21:05.530332 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28215 of 36423
+I0319 20:29:02.549445 140547769902912 keras_utils.py:120] TimeHistory: 2684.41 examples/second between steps 28215 and 28728
+I0319 20:29:02.553695 140547769902912 controller.py:220] step: 28728        steps_per_second: 1.05        {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
+I0319 20:29:02.553875 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 28728 of 36423
+I0319 20:36:59.701035 140547769902912 keras_utils.py:120] TimeHistory: 2683.68 examples/second between steps 28728 and 29241
+I0319 20:36:59.705335 140547769902912 controller.py:220] step: 29241        steps_per_second: 1.08        {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
+I0319 20:36:59.705515 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29241 of 36423
+I0319 20:44:56.506052 140547769902912 keras_utils.py:120] TimeHistory: 2685.63 examples/second between steps 29241 and 29754
+I0319 20:44:56.510352 140547769902912 controller.py:220] step: 29754        steps_per_second: 1.08        {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
+I0319 20:44:56.510533 140547769902912 controller.py:251] Entering training loop with 513 steps, at step 29754 of 36423
+I0319 20:52:52.982735 140547769902912 keras_utils.py:120] TimeHistory: 2687.48 examples/second between steps 29754 and 30267
+I0319 20:52:52.987001 140547769902912 controller.py:220] step: 30267        steps_per_second: 1.08        {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
+I0319 20:52:52.987169 140547769902912 controller.py:185] Start evaluation at step: 30267
+step: 28215        evaluation metric: {'test_loss': 0.23033953, 'test_accuracy': 0.75656, 'continue_training': True}
+step: 28728        steps_per_second: 1.05        {'train_loss': 29.281693, 'train_accuracy': 0.7941939}
+step: 29241        steps_per_second: 1.08        {'train_loss': 29.111845, 'train_accuracy': 0.79688674}
+step: 29754        steps_per_second: 1.08        {'train_loss': 28.945818, 'train_accuracy': 0.8003339}
+step: 30267        steps_per_second: 1.08        {'train_loss': 28.79324, 'train_accuracy': 0.80263704}
+:::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:52:53.483794 140547769902912 mlp_log.py:80] :::MLL 1679259173.484 eval_start: {"value": null, "metadata": {"epoch_num": 59, "lineno": 396, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:53:04.441937 140547769902912 mlp_log.py:80] :::MLL 1679259184.442 eval_stop: {"value": null, "metadata": {"epoch_num": 59, "lineno": 434, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:53:04.452280 140547769902912 mlp_log.py:80] :::MLL 1679259184.452 eval_accuracy: {"value": 0.7594199776649475, "metadata": {"epoch_num": 59, "lineno": 443, "file": "/root/resnet50/resnet_runnable.py"}}
+:::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:53:04.453428 140547769902912 mlp_log.py:80] :::MLL 1679259184.453 block_stop: {"value": null, "metadata": {"first_epoch_num": 56, "epoch_count": 4, "lineno": 452, "file": "/root/resnet50/resnet_runnable.py"}}
+I0319 20:53:04.463417 140547769902912 controller.py:220] step: 30267        evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
+step: 30267        evaluation metric: {'test_loss': 0.22827734, 'test_accuracy': 0.75942, 'continue_training': False}
+:::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 20:53:04.464306 140547769902912 mlp_log.py:80] :::MLL 1679259184.464 run_stop: {"value": null, "metadata": {"status": "success", "lineno": 279, "file": "./resnet_ctl_imagenet_main.py"}}
+:::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 20:53:04.465044 140547769902912 mlp_log.py:80] :::MLL 1679259184.465 run_final: {"value": null, "metadata": {"lineno": 281, "file": "./resnet_ctl_imagenet_main.py"}}
+I0319 20:53:04.484413 140547769902912 resnet_ctl_imagenet_main.py:298] Run stats:
+{'eval_loss': 0.22827734, 'eval_acc': 0.75942, 'train_loss': 28.79324, 'train_acc': 0.80263704, 'step_timestamp_log': ['BatchTimestamp<batch_index: 0, timestamp: 1679230834.281045>', 'BatchTimestamp<batch_index: 513, timestamp: 1679231312.765514>', 'BatchTimestamp<batch_index: 1026, timestamp: 1679231790.4766295>', 'BatchTimestamp<batch_index: 1539, timestamp: 1679232268.0622654>', 'BatchTimestamp<batch_index: 2052, timestamp: 1679232778.5840638>', 'BatchTimestamp<batch_index: 2565, timestamp: 1679233256.8333852>', 'BatchTimestamp<batch_index: 3078, timestamp: 1679233735.232925>', 'BatchTimestamp<batch_index: 3591, timestamp: 1679234213.5742714>', 'BatchTimestamp<batch_index: 4104, timestamp: 1679234703.2010417>', 'BatchTimestamp<batch_index: 4617, timestamp: 1679235181.7035873>', 'BatchTimestamp<batch_index: 5130, timestamp: 1679235658.7568266>', 'BatchTimestamp<batch_index: 5643, timestamp: 1679236136.837958>', 'BatchTimestamp<batch_index: 6156, timestamp: 1679236625.7255764>', 'BatchTimestamp<batch_index: 6669, timestamp: 1679237103.411414>', 'BatchTimestamp<batch_index: 7182, timestamp: 1679237582.1563838>', 'BatchTimestamp<batch_index: 7695, timestamp: 1679238060.5108216>', 'BatchTimestamp<batch_index: 8208, timestamp: 1679238549.9029462>', 'BatchTimestamp<batch_index: 8721, timestamp: 1679239028.3283317>', 'BatchTimestamp<batch_index: 9234, timestamp: 1679239506.558369>', 'BatchTimestamp<batch_index: 9747, timestamp: 1679239984.4382937>', 'BatchTimestamp<batch_index: 10260, timestamp: 1679240474.3921533>', 'BatchTimestamp<batch_index: 10773, timestamp: 1679240951.96138>', 'BatchTimestamp<batch_index: 11286, timestamp: 1679241429.1646736>', 'BatchTimestamp<batch_index: 11799, timestamp: 1679241906.888098>', 'BatchTimestamp<batch_index: 12312, timestamp: 1679242395.0172863>', 'BatchTimestamp<batch_index: 12825, timestamp: 1679242871.8736327>', 'BatchTimestamp<batch_index: 13338, timestamp: 1679243348.7574499>', 'BatchTimestamp<batch_index: 13851, timestamp: 
1679243825.9569237>', 'BatchTimestamp<batch_index: 14364, timestamp: 1679244314.2721043>', 'BatchTimestamp<batch_index: 14877, timestamp: 1679244792.2427475>', 'BatchTimestamp<batch_index: 15390, timestamp: 1679245268.9251325>', 'BatchTimestamp<batch_index: 15903, timestamp: 1679245745.4601164>', 'BatchTimestamp<batch_index: 16416, timestamp: 1679246232.7534444>', 'BatchTimestamp<batch_index: 16929, timestamp: 1679246708.728656>', 'BatchTimestamp<batch_index: 17442, timestamp: 1679247185.7591805>', 'BatchTimestamp<batch_index: 17955, timestamp: 1679247662.4490402>', 'BatchTimestamp<batch_index: 18468, timestamp: 1679248150.4474506>', 'BatchTimestamp<batch_index: 18981, timestamp: 1679248627.1151292>', 'BatchTimestamp<batch_index: 19494, timestamp: 1679249103.5206127>', 'BatchTimestamp<batch_index: 20007, timestamp: 1679249580.1458325>', 'BatchTimestamp<batch_index: 20520, timestamp: 1679250067.8068252>', 'BatchTimestamp<batch_index: 21033, timestamp: 1679250544.0597591>', 'BatchTimestamp<batch_index: 21546, timestamp: 1679251020.501157>', 'BatchTimestamp<batch_index: 22059, timestamp: 1679251497.4479887>', 'BatchTimestamp<batch_index: 22572, timestamp: 1679251985.6137266>', 'BatchTimestamp<batch_index: 23085, timestamp: 1679252463.3410485>', 'BatchTimestamp<batch_index: 23598, timestamp: 1679252940.9701052>', 'BatchTimestamp<batch_index: 24111, timestamp: 1679253417.9039862>', 'BatchTimestamp<batch_index: 24624, timestamp: 1679253905.8428304>', 'BatchTimestamp<batch_index: 25137, timestamp: 1679254381.919039>', 'BatchTimestamp<batch_index: 25650, timestamp: 1679254858.9284008>', 'BatchTimestamp<batch_index: 26163, timestamp: 1679255335.0505683>', 'BatchTimestamp<batch_index: 26676, timestamp: 1679255824.8272724>', 'BatchTimestamp<batch_index: 27189, timestamp: 1679256301.335169>', 'BatchTimestamp<batch_index: 27702, timestamp: 1679256778.1833446>', 'BatchTimestamp<batch_index: 28215, timestamp: 1679257254.531205>', 'BatchTimestamp<batch_index: 28728, timestamp: 
1679257742.5492072>', 'BatchTimestamp<batch_index: 29241, timestamp: 1679258219.7008102>', 'BatchTimestamp<batch_index: 29754, timestamp: 1679258696.5058227>', 'BatchTimestamp<batch_index: 30267, timestamp: 1679259172.9825177>'], 'train_finish_time': 1679259184.4644349, 'avg_exp_per_second': 2683.1639506599217}
--- a/requirement.txt
+++ b/requirement.txt
+absl-py
+pandas
+numpy
+tqdm
+git+https://github.com/mlcommons/logging.git@0.7.0
+
--- a/resnet_ctl_imagenet_main.py
+++ b/resnet_ctl_imagenet_main.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl import app
+from absl import flags
+from absl import logging
+import tensorflow as tf
+import hostlist
+import os
+import re 
+import json
+from tf2_common.modeling import performance
+from tf2_common.training import controller
+from tf2_common.utils.flags import core as flags_core
+from tf2_common.utils.logs import logger
+from tf2_common.utils.misc import distribution_utils
+from tf2_common.utils.misc import keras_utils
+from tf2_common.utils.misc import model_helpers
+from tf2_common.utils.mlp_log import mlp_log
+import common
+import imagenet_preprocessing
+import resnet_runnable
+
+# Command-line flags registered by this module itself; the remaining flags
+# come from `common.define_keras_flags()`.
+flags.DEFINE_boolean(
+    name='use_tf_function',
+    default=True,
+    help='Wrap the train and test step inside a tf.function.')
+flags.DEFINE_boolean(
+    name='single_l2_loss_op',
+    default=False,
+    help=('Calculate L2_loss on concatenated weights, instead of using '
+          'Keras per-layer L2 loss.'))
+flags.DEFINE_boolean(
+    name='cache_decoded_image',
+    default=False,
+    help=('Whether or not to cache decoded images in the input pipeline. '
+          'If this flag and `cache` is enabled, then TFExample protos will '
+          'be parsed and then cached which reduces the load on hosts.'))
+flags.DEFINE_boolean(
+    name='enable_device_warmup',
+    default=False,
+    help=('Whether or not to enable device warmup. This includes training '
+          'on dummy data and enabling graph/XLA compilation before '
+          'run_start.'))
+flags.DEFINE_integer(
+    name='device_warmup_steps',
+    default=1,
+    help='The number of steps to apply for device warmup.')
+flags.DEFINE_integer(
+    name='num_replicas',
+    default=32,
+    help='The number of TPU cores to use, for log printout only.')
+
+
+def build_stats(runnable, time_callback):
+  """Gathers the final metric values and timing data into one dictionary.
+
+  Args:
+    runnable: Object exposing `flags_obj` and the `test_loss`,
+      `test_accuracy`, `train_loss` and `train_accuracy` metrics.
+    time_callback: Time tracking callback instance; a falsy value leaves the
+      timing entries out of the result.
+
+  Returns:
+    Dictionary holding whichever of `eval_loss`, `eval_acc`, `train_loss`,
+    `train_acc`, `step_timestamp_log`, `train_finish_time` and
+    `avg_exp_per_second` were available.
+  """
+  stats = {}
+
+  # Both the eval and the train metrics are reported only when evaluation is
+  # enabled; with `skip_eval` set none of them are collected.
+  if not runnable.flags_obj.skip_eval:
+    reported_metrics = (
+        ('eval_loss', runnable.test_loss),
+        ('eval_acc', runnable.test_accuracy),
+        ('train_loss', runnable.train_loss),
+        ('train_acc', runnable.train_accuracy),
+    )
+    for stat_key, metric in reported_metrics:
+      if metric:
+        stats[stat_key] = metric.result().numpy()
+
+  if not time_callback:
+    return stats
+
+  stats['step_timestamp_log'] = time_callback.timestamp_log
+  stats['train_finish_time'] = time_callback.train_finish_time
+  # The throughput figure is only added once an epoch runtime was recorded.
+  if time_callback.epoch_runtime_log:
+    stats['avg_exp_per_second'] = time_callback.average_examples_per_second
+  return stats
+
+
+def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
+  """Returns how many steps the next on-device loop should execute.
+
+  Args:
+    steps_in_current_epoch: Steps already completed in the current epoch.
+    steps_per_epoch: Total number of steps in one epoch.
+    steps_per_loop: Requested number of steps per loop; must be positive.
+
+  Returns:
+    `steps_per_loop`, reduced to the steps left in the epoch when fewer
+    remain. A `steps_per_loop` of 1 is returned unchanged.
+
+  Raises:
+    ValueError: If `steps_per_loop` is zero or negative.
+  """
+  if steps_per_loop <= 0:
+    raise ValueError('steps_per_loop should be positive integer.')
+  if steps_per_loop != 1:
+    # Never run past the end of the current epoch.
+    remaining_steps = steps_per_epoch - steps_in_current_epoch
+    if remaining_steps < steps_per_loop:
+      return remaining_steps
+  return steps_per_loop
+
+
+def run(flags_obj):
+  """Run ResNet ImageNet training and eval loop using custom training loops.
+
+  Emits the MLPerf log events, configures the session, the distribution
+  strategy and the training controller from the parsed flags, runs training
+  (with evaluation unless `skip_eval` is set) and returns the collected stats.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+
+  Raises:
+    ValueError: If fp16 is passed as it is not currently supported.
+      NOTE(review): no such check is visible in this function; presumably it
+      is raised by one of the helpers called here -- confirm.
+
+  Returns:
+    Dictionary of training and eval stats.
+  """
+  # MLPerf submission header events.
+  mlp_log.mlperf_print('cache_clear', True)
+  mlp_log.mlperf_print('init_start', None)
+  mlp_log.mlperf_print('submission_benchmark', 'resnet')
+  mlp_log.mlperf_print('submission_division', 'closed')
+  mlp_log.mlperf_print('submission_org', 'google')
+  # The platform string is 'tpu-v3-<num_replicas>' when a TPU address is set,
+  # otherwise 'gpu-v100-<num_gpus>'.
+  mlp_log.mlperf_print(
+      'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
+      if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
+  mlp_log.mlperf_print('submission_status', 'cloud')
+ 
+  common.print_flags(flags_obj)
+
+  # task_index is only echoed here; the SLURM-based TF_CONFIG setup that
+  # would consume it is disabled (commented out) below.
+  num_index = flags_obj.task_index
+  print('num_index',num_index)
+#  worker = []
+#  nodelist = os.environ["SLURM_JOB_NODELIST"]
+#  nodename = os.environ["SLURMD_NODENAME"]
+#  nodelist = hostlist.expand_hostlist(nodelist) 
+#  print('print nodelist2',nodelist)
+#  num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
+#  port_number =40000
+#  worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
+##  print('print worker_nodes',worker_nodes)
+#  for node in worker_nodes:
+#      for index in range(4):
+#          print('node',node)
+#          worker_sockets = ":".join([node, str(port_number + index )])
+#          worker.append(worker_sockets)
+#  os.environ['TF_CONFIG'] = json.dumps({
+#    'cluster': {
+#        'worker': worker
+#    },
+#    'task': {'type': 'worker', 'index': num_index}
+#  })
+#
+#  
+#  print({
+#    'cluster': {
+#        'worker': worker
+#    },
+#    'task': {'type': 'worker', 'index': num_index}
+#  })
+  # Session-level configuration (eager/XLA) and the mixed-precision policy
+  # are both taken from the flags.
+  keras_utils.set_session_config(
+      enable_eager=flags_obj.enable_eager,
+      enable_xla=flags_obj.enable_xla)
+  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
+
+  # GPU-only tuning, applied when at least one physical GPU is visible.
+  if tf.config.list_physical_devices('GPU'):
+    if flags_obj.tf_gpu_thread_mode:
+      datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
+          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
+          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
+          num_gpus=flags_obj.num_gpus)
+      # An explicitly provided datasets_num_private_threads flag wins over
+      # the value returned by the helper.
+      if not flags_obj.datasets_num_private_threads:
+        flags_obj.datasets_num_private_threads = datasets_num_private_threads
+    common.set_cudnn_batchnorm_mode()
+
+  # TODO(anj-s): Set data_format without using Keras.
+  # Without an explicit flag, use channels_first on CUDA builds and
+  # channels_last otherwise.
+  data_format = flags_obj.data_format
+  if data_format is None:
+    data_format = ('channels_first'
+                   if tf.test.is_built_with_cuda() else 'channels_last')
+  tf.keras.backend.set_image_data_format(data_format)
+  strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_obj.num_gpus,
+      all_reduce_alg=flags_obj.all_reduce_alg,
+      num_packs=flags_obj.num_packs,
+      tpu_address=flags_obj.tpu,
+      tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
+#  strategy = tf.distribute.get_strategy()
+#  print('after distribution number of replicas : {}'.format(
+#              strategy.num_replicas_in_sync))
+
+  mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
+  mlp_log.mlperf_print('train_samples',
+                       imagenet_preprocessing.NUM_IMAGES['train'])
+  mlp_log.mlperf_print('eval_samples',
+                       imagenet_preprocessing.NUM_IMAGES['validation'])
+  # model_bn_span is the global batch size divided by the device count
+  # (num_replicas on TPU, num_gpus otherwise), truncated to an int.
+  # NOTE(review): this division raises ZeroDivisionError if that count is 0.
+  mlp_log.mlperf_print(
+      'model_bn_span',
+      int(flags_obj.batch_size /
+          (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
+
+  per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
+  eval_steps = common.get_num_eval_steps(flags_obj)
+  # A single loop never spans more than one epoch.
+  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
+
+  logging.info(
+      'Training %d epochs, each epoch has %d steps, '
+      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
+      train_epochs * per_epoch_steps, eval_steps)
+
+  time_callback = keras_utils.TimeHistory(
+      flags_obj.batch_size,
+      flags_obj.log_steps,
+      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
+  # The runnable is constructed inside the strategy scope.
+  with distribution_utils.get_strategy_scope(strategy):
+    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
+
+  # Evaluation interval and offset are converted from epochs to steps.
+  # eval_interval is None when skip_eval is set, but eval_offset is then 0,
+  # so the subtraction below is never reached with a None interval.
+  eval_interval = (
+      flags_obj.epochs_between_evals *
+      per_epoch_steps if not flags_obj.skip_eval else None)
+  eval_offset = (
+      flags_obj.eval_offset_epochs *
+      per_epoch_steps if not flags_obj.skip_eval else 0)
+  # A non-zero offset is moved back by one interval; presumably so that the
+  # first evaluation lands on eval_offset_epochs -- confirm against
+  # controller.Controller.
+  if eval_offset != 0:
+    eval_offset -= eval_interval
+  # Checkpoints and summaries, when enabled, happen once per epoch.
+  checkpoint_interval = (
+      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
+  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
+
+  checkpoint_manager = tf.train.CheckpointManager(
+      runnable.checkpoint,
+      directory=flags_obj.model_dir,
+      max_to_keep=10,
+      step_counter=runnable.global_step,
+      checkpoint_interval=checkpoint_interval)
+
+  # Warmup steps are forced to 0 unless device warmup is enabled.
+  device_warmup_steps = (flags_obj.device_warmup_steps
+                         if flags_obj.enable_device_warmup else 0)
+  if flags_obj.enable_device_warmup:
+    logging.info('Warmup for %d steps.', device_warmup_steps)
+
+  resnet_controller = controller.Controller(
+      strategy,
+      runnable.train,
+      runnable.evaluate,
+      runnable.warmup,
+      global_step=runnable.global_step,
+      steps_per_loop=steps_per_loop,
+      train_steps=per_epoch_steps * train_epochs,
+      device_warmup_steps=device_warmup_steps,
+      checkpoint_manager=checkpoint_manager,
+      summary_interval=summary_interval,
+      eval_steps=eval_steps,
+      eval_interval=eval_interval,
+      eval_offset=eval_offset)
+
+  # Warmup runs before 'init_stop' is logged, i.e. inside the init phase.
+  if flags_obj.enable_device_warmup:
+    resnet_controller.warmup()
+
+  mlp_log.mlperf_print('init_stop', None)
+
+  # profile_steps is a comma-separated list of ints; a negative first entry
+  # starts tracing right away via trace_start(-1).
+  profile_steps = flags_obj.profile_steps
+  if profile_steps:
+    profile_steps = [int(i) for i in profile_steps.split(',')]
+    if profile_steps[0] < 0:
+      runnable.trace_start(-1)
+
+  time_callback.on_train_begin()
+  mlp_log.mlperf_print('run_start', None)
+  # The first block starts at epoch 1 and spans eval_offset_epochs when that
+  # is non-zero, otherwise epochs_between_evals.
+  mlp_log.mlperf_print(
+      'block_start',
+      None,
+      metadata={
+          'first_epoch_num':
+              1,
+          'epoch_count':
+              (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
+               else flags_obj.epochs_between_evals)
+      })
+  resnet_controller.train(evaluate=not flags_obj.skip_eval)
+  # 'run_stop' is logged before the time callback is finalized.
+  mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
+  time_callback.on_train_end()
+  mlp_log.mlperf_print('run_final', None)
+
+  stats = build_stats(runnable, time_callback)
+  return stats
+
+
+def define_imagenet_keras_flags():
+  """Registers the flags from `common` and adopts its key flags.
+
+  Calls `common.define_keras_flags()`, then `flags_core.set_defaults()` with
+  no overrides, and finally declares the key flags of the `common` module as
+  key flags of this module.
+
+  NOTE(review): nothing in this file calls this function; the `__main__`
+  block calls `common.define_keras_flags()` directly.
+  """
+  common.define_keras_flags()
+  flags_core.set_defaults()
+  flags.adopt_module_key_flags(common)
+
+
+def main(_):
+  """absl entry point: runs training and logs the resulting stats.
+
+  Args:
+    _: Positional command-line arguments handed over by `app.run`; unused.
+  """
+  # tf.keras.backend.set_floatx('float16')
+  parsed_flags = flags.FLAGS
+  model_helpers.apply_clean(parsed_flags)
+  # The whole run happens inside the benchmark logging context.
+  with logger.benchmark_context(parsed_flags):
+    run_stats = run(parsed_flags)
+  logging.info('Run stats:\n%s', run_stats)
+
+
+# Script entry point: set absl log verbosity to INFO, register the flags
+# defined in `common`, then hand control to absl, which invokes main().
+if __name__ == '__main__':
+  logging.set_verbosity(logging.INFO)
+  common.define_keras_flags()
+  app.run(main)
--- a/resnet_ctl_imagenet_main.py.multinode
+++ b/resnet_ctl_imagenet_main.py.multinode
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl import app
+from absl import flags
+from absl import logging
+import tensorflow as tf
+import hostlist
+import os
+import re 
+import json
+from tf2_common.modeling import performance
+from tf2_common.training import controller
+from tf2_common.utils.flags import core as flags_core
+from tf2_common.utils.logs import logger
+from tf2_common.utils.misc import distribution_utils
+from tf2_common.utils.misc import keras_utils
+from tf2_common.utils.misc import model_helpers
+from tf2_common.utils.mlp_log import mlp_log
+import common
+import imagenet_preprocessing
+import resnet_runnable
+
+# Command-line flags registered by this module itself; the remaining flags
+# come from `common.define_keras_flags()`.
+flags.DEFINE_boolean(
+    name='use_tf_function',
+    default=True,
+    help='Wrap the train and test step inside a tf.function.')
+flags.DEFINE_boolean(
+    name='single_l2_loss_op',
+    default=False,
+    help=('Calculate L2_loss on concatenated weights, instead of using '
+          'Keras per-layer L2 loss.'))
+flags.DEFINE_boolean(
+    name='cache_decoded_image',
+    default=False,
+    help=('Whether or not to cache decoded images in the input pipeline. '
+          'If this flag and `cache` is enabled, then TFExample protos will '
+          'be parsed and then cached which reduces the load on hosts.'))
+flags.DEFINE_boolean(
+    name='enable_device_warmup',
+    default=False,
+    help=('Whether or not to enable device warmup. This includes training '
+          'on dummy data and enabling graph/XLA compilation before '
+          'run_start.'))
+flags.DEFINE_integer(
+    name='device_warmup_steps',
+    default=1,
+    help='The number of steps to apply for device warmup.')
+flags.DEFINE_integer(
+    name='num_replicas',
+    default=32,
+    help='The number of TPU cores to use, for log printout only.')
+
+
+def build_stats(runnable, time_callback):
+  """Gathers the final metric values and timing data into one dictionary.
+
+  Args:
+    runnable: Object exposing `flags_obj` and the `test_loss`,
+      `test_accuracy`, `train_loss` and `train_accuracy` metrics.
+    time_callback: Time tracking callback instance; a falsy value leaves the
+      timing entries out of the result.
+
+  Returns:
+    Dictionary holding whichever of `eval_loss`, `eval_acc`, `train_loss`,
+    `train_acc`, `step_timestamp_log`, `train_finish_time` and
+    `avg_exp_per_second` were available.
+  """
+  stats = {}
+
+  # Both the eval and the train metrics are reported only when evaluation is
+  # enabled; with `skip_eval` set none of them are collected.
+  if not runnable.flags_obj.skip_eval:
+    reported_metrics = (
+        ('eval_loss', runnable.test_loss),
+        ('eval_acc', runnable.test_accuracy),
+        ('train_loss', runnable.train_loss),
+        ('train_acc', runnable.train_accuracy),
+    )
+    for stat_key, metric in reported_metrics:
+      if metric:
+        stats[stat_key] = metric.result().numpy()
+
+  if not time_callback:
+    return stats
+
+  stats['step_timestamp_log'] = time_callback.timestamp_log
+  stats['train_finish_time'] = time_callback.train_finish_time
+  # The throughput figure is only added once an epoch runtime was recorded.
+  if time_callback.epoch_runtime_log:
+    stats['avg_exp_per_second'] = time_callback.average_examples_per_second
+  return stats
+
+
+def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
+  """Returns how many steps the next on-device loop should execute.
+
+  Args:
+    steps_in_current_epoch: Steps already completed in the current epoch.
+    steps_per_epoch: Total number of steps in one epoch.
+    steps_per_loop: Requested number of steps per loop; must be positive.
+
+  Returns:
+    `steps_per_loop`, reduced to the steps left in the epoch when fewer
+    remain. A `steps_per_loop` of 1 is returned unchanged.
+
+  Raises:
+    ValueError: If `steps_per_loop` is zero or negative.
+  """
+  if steps_per_loop <= 0:
+    raise ValueError('steps_per_loop should be positive integer.')
+  if steps_per_loop != 1:
+    # Never run past the end of the current epoch.
+    remaining_steps = steps_per_epoch - steps_in_current_epoch
+    if remaining_steps < steps_per_loop:
+      return remaining_steps
+  return steps_per_loop
+
+
+def _build_slurm_tf_config(task_index, workers_per_node=4, base_port=40000):
+  """Builds the TF_CONFIG dictionary for the nodes of the current SLURM job.
+
+  Every node listed in the `SLURM_JOB_NODELIST` environment variable gets
+  `workers_per_node` worker addresses of the form `<node>:<port>`, using the
+  consecutive ports `base_port`, `base_port + 1`, ...
+
+  Args:
+    task_index: Index of this process in the resulting worker list.
+    workers_per_node: Number of worker addresses generated per node.
+    base_port: Port assigned to the first worker of every node.
+
+  Returns:
+    Dictionary with a `cluster` entry (the worker address list) and a `task`
+    entry (`type` 'worker' and the given index).
+
+  Raises:
+    KeyError: If `SLURM_JOB_NODELIST` is not set in the environment.
+  """
+  nodes = hostlist.expand_hostlist(os.environ['SLURM_JOB_NODELIST'])
+  print('print nodelist2', nodes)
+  workers = [
+      '{}:{}'.format(node, base_port + port_offset)
+      for node in nodes
+      for port_offset in range(workers_per_node)
+  ]
+  return {
+      'cluster': {
+          'worker': workers
+      },
+      'task': {'type': 'worker', 'index': task_index}
+  }
+
+
+def run(flags_obj):
+  """Run ResNet ImageNet training and eval loop using custom training loops.
+
+  Emits the MLPerf log events, exports a TF_CONFIG built from the SLURM node
+  list, configures the session, the distribution strategy and the training
+  controller from the parsed flags, runs training (with evaluation unless
+  `skip_eval` is set) and returns the collected stats.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+
+  Raises:
+    KeyError: If `SLURM_JOB_NODELIST` is not set in the environment.
+    ValueError: If fp16 is passed as it is not currently supported.
+      NOTE(review): no such check is visible in this function; presumably it
+      is raised by one of the helpers called here -- confirm.
+
+  Returns:
+    Dictionary of training and eval stats.
+  """
+  # MLPerf submission header events.
+  mlp_log.mlperf_print('cache_clear', True)
+  mlp_log.mlperf_print('init_start', None)
+  mlp_log.mlperf_print('submission_benchmark', 'resnet')
+  mlp_log.mlperf_print('submission_division', 'closed')
+  mlp_log.mlperf_print('submission_org', 'google')
+  # The platform string is 'tpu-v3-<num_replicas>' when a TPU address is set,
+  # otherwise 'gpu-v100-<num_gpus>'.
+  mlp_log.mlperf_print(
+      'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
+      if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
+  mlp_log.mlperf_print('submission_status', 'cloud')
+
+  common.print_flags(flags_obj)
+
+  # Export the multi-worker cluster description before the distribution
+  # strategy is created further down. The dictionary is built once and used
+  # for both the environment variable and the diagnostic printout.
+  num_index = flags_obj.task_index
+  print('num_index', num_index)
+  tf_config = _build_slurm_tf_config(num_index)
+  os.environ['TF_CONFIG'] = json.dumps(tf_config)
+  print(tf_config)
+
+  # Session-level configuration (eager/XLA) and the mixed-precision policy
+  # are both taken from the flags.
+  keras_utils.set_session_config(
+      enable_eager=flags_obj.enable_eager,
+      enable_xla=flags_obj.enable_xla)
+  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
+
+  # GPU-only tuning, applied when at least one physical GPU is visible.
+  if tf.config.list_physical_devices('GPU'):
+    if flags_obj.tf_gpu_thread_mode:
+      datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
+          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
+          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
+          num_gpus=flags_obj.num_gpus)
+      # An explicitly provided datasets_num_private_threads flag wins over
+      # the value returned by the helper.
+      if not flags_obj.datasets_num_private_threads:
+        flags_obj.datasets_num_private_threads = datasets_num_private_threads
+    common.set_cudnn_batchnorm_mode()
+
+  # TODO(anj-s): Set data_format without using Keras.
+  # Without an explicit flag, use channels_first on CUDA builds and
+  # channels_last otherwise.
+  data_format = flags_obj.data_format
+  if data_format is None:
+    data_format = ('channels_first'
+                   if tf.test.is_built_with_cuda() else 'channels_last')
+  tf.keras.backend.set_image_data_format(data_format)
+  strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_obj.num_gpus,
+      all_reduce_alg=flags_obj.all_reduce_alg,
+      num_packs=flags_obj.num_packs,
+      tpu_address=flags_obj.tpu,
+      tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
+
+  mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
+  mlp_log.mlperf_print('train_samples',
+                       imagenet_preprocessing.NUM_IMAGES['train'])
+  mlp_log.mlperf_print('eval_samples',
+                       imagenet_preprocessing.NUM_IMAGES['validation'])
+  # model_bn_span is the global batch size divided by the device count
+  # (num_replicas on TPU, num_gpus otherwise), truncated to an int.
+  # NOTE(review): this division raises ZeroDivisionError if that count is 0.
+  mlp_log.mlperf_print(
+      'model_bn_span',
+      int(flags_obj.batch_size /
+          (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
+
+  per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
+  eval_steps = common.get_num_eval_steps(flags_obj)
+  # A single loop never spans more than one epoch.
+  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
+
+  logging.info(
+      'Training %d epochs, each epoch has %d steps, '
+      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
+      train_epochs * per_epoch_steps, eval_steps)
+
+  time_callback = keras_utils.TimeHistory(
+      flags_obj.batch_size,
+      flags_obj.log_steps,
+      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
+  # The runnable is constructed inside the strategy scope.
+  with distribution_utils.get_strategy_scope(strategy):
+    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
+
+  # Evaluation interval and offset are converted from epochs to steps.
+  # eval_interval is None when skip_eval is set, but eval_offset is then 0,
+  # so the subtraction below is never reached with a None interval.
+  eval_interval = (
+      flags_obj.epochs_between_evals *
+      per_epoch_steps if not flags_obj.skip_eval else None)
+  eval_offset = (
+      flags_obj.eval_offset_epochs *
+      per_epoch_steps if not flags_obj.skip_eval else 0)
+  # A non-zero offset is moved back by one interval; presumably so that the
+  # first evaluation lands on eval_offset_epochs -- confirm against
+  # controller.Controller.
+  if eval_offset != 0:
+    eval_offset -= eval_interval
+  # Checkpoints and summaries, when enabled, happen once per epoch.
+  checkpoint_interval = (
+      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
+  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
+
+  checkpoint_manager = tf.train.CheckpointManager(
+      runnable.checkpoint,
+      directory=flags_obj.model_dir,
+      max_to_keep=10,
+      step_counter=runnable.global_step,
+      checkpoint_interval=checkpoint_interval)
+
+  # Warmup steps are forced to 0 unless device warmup is enabled.
+  device_warmup_steps = (flags_obj.device_warmup_steps
+                         if flags_obj.enable_device_warmup else 0)
+  if flags_obj.enable_device_warmup:
+    logging.info('Warmup for %d steps.', device_warmup_steps)
+
+  resnet_controller = controller.Controller(
+      strategy,
+      runnable.train,
+      runnable.evaluate,
+      runnable.warmup,
+      global_step=runnable.global_step,
+      steps_per_loop=steps_per_loop,
+      train_steps=per_epoch_steps * train_epochs,
+      device_warmup_steps=device_warmup_steps,
+      checkpoint_manager=checkpoint_manager,
+      summary_interval=summary_interval,
+      eval_steps=eval_steps,
+      eval_interval=eval_interval,
+      eval_offset=eval_offset)
+
+  # Warmup runs before 'init_stop' is logged, i.e. inside the init phase.
+  if flags_obj.enable_device_warmup:
+    resnet_controller.warmup()
+
+  mlp_log.mlperf_print('init_stop', None)
+
+  # profile_steps is a comma-separated list of ints; a negative first entry
+  # starts tracing right away via trace_start(-1).
+  profile_steps = flags_obj.profile_steps
+  if profile_steps:
+    profile_steps = [int(i) for i in profile_steps.split(',')]
+    if profile_steps[0] < 0:
+      runnable.trace_start(-1)
+
+  time_callback.on_train_begin()
+  mlp_log.mlperf_print('run_start', None)
+  # The first block starts at epoch 1 and spans eval_offset_epochs when that
+  # is non-zero, otherwise epochs_between_evals.
+  mlp_log.mlperf_print(
+      'block_start',
+      None,
+      metadata={
+          'first_epoch_num':
+              1,
+          'epoch_count':
+              (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
+               else flags_obj.epochs_between_evals)
+      })
+  resnet_controller.train(evaluate=not flags_obj.skip_eval)
+  # 'run_stop' is logged before the time callback is finalized.
+  mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
+  time_callback.on_train_end()
+  mlp_log.mlperf_print('run_final', None)
+
+  stats = build_stats(runnable, time_callback)
+  return stats
+
+
+def define_imagenet_keras_flags():
+  """Registers the flags from `common` and adopts its key flags.
+
+  Calls `common.define_keras_flags()`, then `flags_core.set_defaults()` with
+  no overrides, and finally declares the key flags of the `common` module as
+  key flags of this module.
+
+  NOTE(review): nothing in this file calls this function; the `__main__`
+  block calls `common.define_keras_flags()` directly.
+  """
+  common.define_keras_flags()
+  flags_core.set_defaults()
+  flags.adopt_module_key_flags(common)
+
+
+def main(_):
+  """absl entry point: runs training and logs the resulting stats.
+
+  Args:
+    _: Positional command-line arguments handed over by `app.run`; unused.
+  """
+  # tf.keras.backend.set_floatx('float16')
+  parsed_flags = flags.FLAGS
+  model_helpers.apply_clean(parsed_flags)
+  # The whole run happens inside the benchmark logging context.
+  with logger.benchmark_context(parsed_flags):
+    run_stats = run(parsed_flags)
+  logging.info('Run stats:\n%s', run_stats)
+
+
+# Script entry point: set absl log verbosity to INFO, register the flags
+# defined in `common`, then hand control to absl, which invokes main().
+if __name__ == '__main__':
+  logging.set_verbosity(logging.INFO)
+  common.define_keras_flags()
+  app.run(main)
--- a/resnet_ctl_imagenet_main.py.onenode
+++ b/resnet_ctl_imagenet_main.py.onenode
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from absl import app
+from absl import flags
+from absl import logging
+import tensorflow as tf
+import hostlist
+import os
+import re 
+import json
+from tf2_common.modeling import performance
+from tf2_common.training import controller
+from tf2_common.utils.flags import core as flags_core
+from tf2_common.utils.logs import logger
+from tf2_common.utils.misc import distribution_utils
+from tf2_common.utils.misc import keras_utils
+from tf2_common.utils.misc import model_helpers
+from tf2_common.utils.mlp_log import mlp_log
+import common
+import imagenet_preprocessing
+import resnet_runnable
+
+# Command-line flags registered by this module itself.
+flags.DEFINE_boolean(
+    name='use_tf_function',
+    default=True,
+    help='Wrap the train and test step inside a tf.function.')
+flags.DEFINE_boolean(
+    name='single_l2_loss_op',
+    default=False,
+    help=('Calculate L2_loss on concatenated weights, instead of using '
+          'Keras per-layer L2 loss.'))
+flags.DEFINE_boolean(
+    name='cache_decoded_image',
+    default=False,
+    help=('Whether or not to cache decoded images in the input pipeline. '
+          'If this flag and `cache` is enabled, then TFExample protos will '
+          'be parsed and then cached which reduces the load on hosts.'))
+flags.DEFINE_boolean(
+    name='enable_device_warmup',
+    default=False,
+    help=('Whether or not to enable device warmup. This includes training '
+          'on dummy data and enabling graph/XLA compilation before '
+          'run_start.'))
+flags.DEFINE_integer(
+    name='device_warmup_steps',
+    default=1,
+    help='The number of steps to apply for device warmup.')
+flags.DEFINE_integer(
+    name='num_replicas',
+    default=32,
+    help='The number of TPU cores to use, for log printout only.')
+
+
+def build_stats(runnable, time_callback):
+  """Gathers the final metric values and timing data into one dictionary.
+
+  Args:
+    runnable: Object exposing `flags_obj` and the `test_loss`,
+      `test_accuracy`, `train_loss` and `train_accuracy` metrics.
+    time_callback: Time tracking callback instance; a falsy value leaves the
+      timing entries out of the result.
+
+  Returns:
+    Dictionary holding whichever of `eval_loss`, `eval_acc`, `train_loss`,
+    `train_acc`, `step_timestamp_log`, `train_finish_time` and
+    `avg_exp_per_second` were available.
+  """
+  stats = {}
+
+  # Both the eval and the train metrics are reported only when evaluation is
+  # enabled; with `skip_eval` set none of them are collected.
+  if not runnable.flags_obj.skip_eval:
+    reported_metrics = (
+        ('eval_loss', runnable.test_loss),
+        ('eval_acc', runnable.test_accuracy),
+        ('train_loss', runnable.train_loss),
+        ('train_acc', runnable.train_accuracy),
+    )
+    for stat_key, metric in reported_metrics:
+      if metric:
+        stats[stat_key] = metric.result().numpy()
+
+  if not time_callback:
+    return stats
+
+  stats['step_timestamp_log'] = time_callback.timestamp_log
+  stats['train_finish_time'] = time_callback.train_finish_time
+  # The throughput figure is only added once an epoch runtime was recorded.
+  if time_callback.epoch_runtime_log:
+    stats['avg_exp_per_second'] = time_callback.average_examples_per_second
+  return stats
+
+
+def _steps_to_run(steps_in_current_epoch, steps_per_epoch, steps_per_loop):
+  """Returns how many steps the next on-device loop should execute.
+
+  Args:
+    steps_in_current_epoch: Steps already taken in the current epoch.
+    steps_per_epoch: Total number of steps in one epoch.
+    steps_per_loop: Requested number of steps per loop; must be positive.
+
+  Returns:
+    `steps_per_loop` itself when it equals 1, otherwise the smaller of
+    `steps_per_loop` and the steps remaining in the current epoch.
+
+  Raises:
+    ValueError: If `steps_per_loop` is zero or negative.
+  """
+  if steps_per_loop <= 0:
+    raise ValueError('steps_per_loop should be positive integer.')
+  if steps_per_loop != 1:
+    # Never run past the end of the current epoch.
+    return min(steps_per_loop, steps_per_epoch - steps_in_current_epoch)
+  return steps_per_loop
+
+
+def run(flags_obj):
+  """Run ResNet ImageNet training and eval loop using custom training loops.
+
+  Emits the MLPerf log entries (`init_start` ... `run_final`), configures the
+  session, mixed precision policy and distribution strategy, builds a
+  `ResnetRunnable` plus a `controller.Controller`, optionally warms up the
+  device, trains (with evaluation unless `skip_eval` is set) and returns the
+  stats gathered by `build_stats`.
+
+  Args:
+    flags_obj: An object containing parsed flag values.
+
+  Raises:
+    ValueError: If fp16 is passed as it is not currently supported.
+      NOTE(review): no explicit raise is visible in this function body;
+      confirm whether a callee still raises this.
+
+  Returns:
+    Dictionary of training and eval stats.
+  """
+  # MLPerf submission header; these entries are logged before any setup work.
+  mlp_log.mlperf_print('cache_clear', True)
+  mlp_log.mlperf_print('init_start', None)
+  mlp_log.mlperf_print('submission_benchmark', 'resnet')
+  mlp_log.mlperf_print('submission_division', 'closed')
+  mlp_log.mlperf_print('submission_org', 'google')
+  # Platform string uses the `num_replicas` flag on TPU and `num_gpus`
+  # otherwise.
+  mlp_log.mlperf_print(
+      'submission_platform', 'tpu-v3-{}'.format(flags_obj.num_replicas)
+      if flags_obj.tpu else 'gpu-v100-{}'.format(flags_obj.num_gpus))
+  mlp_log.mlperf_print('submission_status', 'cloud')
+ 
+  common.print_flags(flags_obj)
+
+  # `num_index` is only printed here; its other uses are in the disabled
+  # block below.
+  num_index = flags_obj.task_index
+  print('num_index',num_index)
+  # Disabled code kept for reference: it would assemble TF_CONFIG for a
+  # multi-worker run from the SLURM node list (4 worker ports per node,
+  # starting at port 40000) using `num_index` as the task index.
+#  worker = []
+#  nodelist = os.environ["SLURM_JOB_NODELIST"]
+#  nodename = os.environ["SLURMD_NODENAME"]
+#  nodelist = hostlist.expand_hostlist(nodelist) 
+#  print('print nodelist2',nodelist)
+#  num_nodes = int(os.getenv("SLURM_JOB_NUM_NODES"))
+#  port_number =40000
+#  worker_nodes = [node for i, node in enumerate(nodelist) if i >= 0 ]
+##  print('print worker_nodes',worker_nodes)
+#  for node in worker_nodes:
+#      for index in range(4):
+#          print('node',node)
+#          worker_sockets = ":".join([node, str(port_number + index )])
+#          worker.append(worker_sockets)
+#  os.environ['TF_CONFIG'] = json.dumps({
+#    'cluster': {
+#        'worker': worker
+#    },
+#    'task': {'type': 'worker', 'index': num_index}
+#  })
+#
+#  
+#  print({
+#    'cluster': {
+#        'worker': worker
+#    },
+#    'task': {'type': 'worker', 'index': num_index}
+#  })
+  keras_utils.set_session_config(
+      enable_eager=flags_obj.enable_eager,
+      enable_xla=flags_obj.enable_xla)
+  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))
+
+  # GPU-only tuning: thread mode/count and the cuDNN batchnorm mode.
+  if tf.config.list_physical_devices('GPU'):
+    if flags_obj.tf_gpu_thread_mode:
+      datasets_num_private_threads = keras_utils.set_gpu_thread_mode_and_count(
+          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
+          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
+          num_gpus=flags_obj.num_gpus)
+      # Only fill in the dataset thread count when the user left it unset.
+      if not flags_obj.datasets_num_private_threads:
+        flags_obj.datasets_num_private_threads = datasets_num_private_threads
+    common.set_cudnn_batchnorm_mode()
+
+  # TODO(anj-s): Set data_format without using Keras.
+  # Without an explicit flag, CUDA builds use channels_first and all others
+  # channels_last.
+  data_format = flags_obj.data_format
+  if data_format is None:
+    data_format = ('channels_first'
+                   if tf.test.is_built_with_cuda() else 'channels_last')
+  tf.keras.backend.set_image_data_format(data_format)
+  strategy = distribution_utils.get_distribution_strategy(
+      distribution_strategy=flags_obj.distribution_strategy,
+      num_gpus=flags_obj.num_gpus,
+      all_reduce_alg=flags_obj.all_reduce_alg,
+      num_packs=flags_obj.num_packs,
+      tpu_address=flags_obj.tpu,
+      tpu_zone=flags_obj.tpu_zone if flags_obj.tpu else None)
+#  strategy = tf.distribute.get_strategy()
+#  print('after distribution number of replicas : {}'.format(
+#              strategy.num_replicas_in_sync))
+
+  mlp_log.mlperf_print('global_batch_size', flags_obj.batch_size)
+  mlp_log.mlperf_print('train_samples',
+                       imagenet_preprocessing.NUM_IMAGES['train'])
+  mlp_log.mlperf_print('eval_samples',
+                       imagenet_preprocessing.NUM_IMAGES['validation'])
+  # Logged as the global batch size divided by the replica count
+  # (`num_replicas` on TPU, `num_gpus` otherwise).
+  # NOTE(review): this divides by `num_gpus` when no TPU is set; confirm that
+  # `num_gpus` is never 0 on this path.
+  mlp_log.mlperf_print(
+      'model_bn_span',
+      int(flags_obj.batch_size /
+          (flags_obj.num_replicas if flags_obj.tpu else flags_obj.num_gpus)))
+
+  per_epoch_steps, train_epochs = common.get_num_train_iterations(flags_obj)
+  eval_steps = common.get_num_eval_steps(flags_obj)
+  # A single on-device loop never spans more than one epoch.
+  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
+
+  logging.info(
+      'Training %d epochs, each epoch has %d steps, '
+      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
+      train_epochs * per_epoch_steps, eval_steps)
+
+  time_callback = keras_utils.TimeHistory(
+      flags_obj.batch_size,
+      flags_obj.log_steps,
+      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
+  # The runnable is constructed inside the strategy scope.
+  with distribution_utils.get_strategy_scope(strategy):
+    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback)
+
+  # Intervals and offsets handed to the controller are expressed in steps.
+  eval_interval = (
+      flags_obj.epochs_between_evals *
+      per_epoch_steps if not flags_obj.skip_eval else None)
+  eval_offset = (
+      flags_obj.eval_offset_epochs *
+      per_epoch_steps if not flags_obj.skip_eval else 0)
+  # A non-zero offset is shifted back by one eval interval.
+  # NOTE(review): presumably so the first eval lands at `eval_offset_epochs`;
+  # confirm against the controller's use of `eval_offset`.
+  if eval_offset != 0:
+    eval_offset -= eval_interval
+  checkpoint_interval = (
+      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
+  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None
+
+  checkpoint_manager = tf.train.CheckpointManager(
+      runnable.checkpoint,
+      directory=flags_obj.model_dir,
+      max_to_keep=10,
+      step_counter=runnable.global_step,
+      checkpoint_interval=checkpoint_interval)
+
+  # Zero warmup steps when device warmup is disabled.
+  device_warmup_steps = (flags_obj.device_warmup_steps
+                         if flags_obj.enable_device_warmup else 0)
+  if flags_obj.enable_device_warmup:
+    logging.info('Warmup for %d steps.', device_warmup_steps)
+
+  resnet_controller = controller.Controller(
+      strategy,
+      runnable.train,
+      runnable.evaluate,
+      runnable.warmup,
+      global_step=runnable.global_step,
+      steps_per_loop=steps_per_loop,
+      train_steps=per_epoch_steps * train_epochs,
+      device_warmup_steps=device_warmup_steps,
+      checkpoint_manager=checkpoint_manager,
+      summary_interval=summary_interval,
+      eval_steps=eval_steps,
+      eval_interval=eval_interval,
+      eval_offset=eval_offset)
+
+  # Warmup runs before `init_stop` is logged.
+  if flags_obj.enable_device_warmup:
+    resnet_controller.warmup()
+
+  mlp_log.mlperf_print('init_stop', None)
+
+  # `profile_steps` is a comma-separated string; a negative first value
+  # starts the trace here, before training begins.
+  profile_steps = flags_obj.profile_steps
+  if profile_steps:
+    profile_steps = [int(i) for i in profile_steps.split(',')]
+    if profile_steps[0] < 0:
+      runnable.trace_start(-1)
+
+  time_callback.on_train_begin()
+  mlp_log.mlperf_print('run_start', None)
+  # The first block covers `eval_offset_epochs` epochs when that flag is
+  # non-zero, otherwise `epochs_between_evals` epochs.
+  mlp_log.mlperf_print(
+      'block_start',
+      None,
+      metadata={
+          'first_epoch_num':
+              1,
+          'epoch_count':
+              (flags_obj.eval_offset_epochs if flags_obj.eval_offset_epochs != 0
+               else flags_obj.epochs_between_evals)
+      })
+  resnet_controller.train(evaluate=not flags_obj.skip_eval)
+  mlp_log.mlperf_print('run_stop', None, metadata={'status': 'success'})
+  time_callback.on_train_end()
+  mlp_log.mlperf_print('run_final', None)
+
+  stats = build_stats(runnable, time_callback)
+  return stats
+
+
+def define_imagenet_keras_flags():
+  """Defines the Keras flags from `common` and adopts them as key flags.
+
+  Calls, in order, `common.define_keras_flags()`, `flags_core.set_defaults()`
+  (with no overrides) and `flags.adopt_module_key_flags(common)`.
+  """
+  common.define_keras_flags()
+  flags_core.set_defaults()
+  flags.adopt_module_key_flags(common)
+
+
+def main(_):
+  """Entry point handed to `app.run`; the positional argument is ignored.
+
+  Applies `model_helpers.apply_clean` to the parsed flags, executes `run`
+  inside `logger.benchmark_context` and logs the returned stats.
+  """
+  parsed_flags = flags.FLAGS
+  model_helpers.apply_clean(parsed_flags)
+  with logger.benchmark_context(parsed_flags):
+    run_stats = run(parsed_flags)
+  logging.info('Run stats:\n%s', run_stats)
+
+
+# Script entry point: raise log verbosity, register the Keras flags from
+# `common`, then hand control to `app.run`, which is given `main`.
+if __name__ == '__main__':
+  logging.set_verbosity(logging.INFO)
+  # Flags are defined before `app.run` is called.
+  common.define_keras_flags()
+  app.run(main)
--- a/resnet_model.py
+++ b/resnet_model.py
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ResNet50 model for Keras.
+
+Adapted from tf.keras.applications.resnet50.ResNet50().
+This is ResNet model version 1.5.
+
+Related papers/blogs:
+- https://arxiv.org/abs/1512.03385
+- https://arxiv.org/pdf/1603.05027v2.pdf
+- http://torch.ch/blog/2016/02/04/resnets.html
+
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+import tensorflow as tf
+
+import imagenet_preprocessing
+from tensorflow.keras import backend
+from tensorflow.keras import initializers
+from tensorflow.keras import layers as tf_python_keras_layers
+from tensorflow.keras import models
+from tensorflow.keras import regularizers
+
+# Passed as `momentum` / `epsilon` to the BatchNormalization layers below.
+BATCH_NORM_DECAY = 0.9
+BATCH_NORM_EPSILON = 1e-5
+
+FLAGS = flags.FLAGS
+
+# Read by `_gen_l2_regularizer` as the L2 coefficient.
+flags.DEFINE_float(
+    name='weight_decay',
+    default=1e-4,
+    help='Weight decay coefficiant for l2 regularization.')
+
+flags.DEFINE_integer(
+    name='num_accumulation_steps',
+    default=8,
+    help='number of steps to accumulate with large batch size.')
+
+# Layer namespace used by the model builders; `change_keras_layer` rebinds it.
+layers = tf_python_keras_layers
+
+
+def change_keras_layer(use_tf_keras_layers=False):
+  """Rebinds the module-level `layers` namespace used to build the model.
+
+  When `use_tf_keras_layers` is true, `layers` becomes `tf.keras.layers`;
+  otherwise it becomes `tf_python_keras_layers`, the alias imported at the
+  top of this module. The switch exists as a temporary measure so benchmark
+  results can be tracked per layer implementation, and is meant to go away
+  once tf.keras.layers is the default.
+
+  TODO(b/146939027): Remove this function when tf v2 batchnorm reaches training
+  speed parity with tf v1 batchnorm.
+
+  Args:
+      use_tf_keras_layers: whether to use tf.keras.layers.
+  """
+  global layers
+  layers = tf.keras.layers if use_tf_keras_layers else tf_python_keras_layers
+
+
+def _gen_l2_regularizer(use_l2_regularizer=True):
+  """Returns an L2 regularizer weighted by `FLAGS.weight_decay`, or None."""
+  if not use_l2_regularizer:
+    return None
+  return regularizers.l2(FLAGS.weight_decay)
+
+
+def identity_block(input_tensor,
+                   kernel_size,
+                   filters,
+                   stage,
+                   block,
+                   use_l2_regularizer=True):
+  """Builds a residual block whose shortcut is the unmodified input.
+
+  Main path: 1x1 conv -> BN -> relu -> `kernel_size` conv ('same' padding)
+  -> BN -> relu -> 1x1 conv -> BN. The result is added to `input_tensor` and
+  passed through a final relu.
+
+  Args:
+    input_tensor: input tensor
+    kernel_size: the kernel size of the middle conv layer on the main path
+    filters: sequence of three integers, the filter counts of the three conv
+      layers on the main path
+    stage: integer, current stage label, used for generating layer names
+    block: 'a','b'..., current block label, used for generating layer names
+    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
+
+  Returns:
+    Output tensor for the block.
+  """
+  n_reduce, n_middle, n_expand = filters
+  bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1
+  name_tag = str(stage) + block + '_branch'
+  conv_name_base = 'res' + name_tag
+  bn_name_base = 'bn' + name_tag
+
+  def _conv(num_filters, size, suffix, **extra):
+    # Bias-free conv; a fresh regularizer is requested for every layer.
+    return layers.Conv2D(
+        num_filters,
+        size,
+        use_bias=False,
+        kernel_initializer='he_normal',
+        kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+        name=conv_name_base + suffix,
+        **extra)
+
+  def _batch_norm(suffix):
+    return layers.BatchNormalization(
+        axis=bn_axis,
+        momentum=BATCH_NORM_DECAY,
+        epsilon=BATCH_NORM_EPSILON,
+        name=bn_name_base + suffix)
+
+  x = _conv(n_reduce, (1, 1), '2a')(input_tensor)
+  x = _batch_norm('2a')(x)
+  x = layers.Activation('relu')(x)
+
+  x = _conv(n_middle, kernel_size, '2b', padding='same')(x)
+  x = _batch_norm('2b')(x)
+  x = layers.Activation('relu')(x)
+
+  x = _conv(n_expand, (1, 1), '2c')(x)
+  x = _batch_norm('2c')(x)
+
+  # Residual connection straight from the block input.
+  x = layers.add([x, input_tensor])
+  x = layers.Activation('relu')(x)
+  return x
+
+
+def conv_block(input_tensor,
+               kernel_size,
+               filters,
+               stage,
+               block,
+               strides=(2, 2),
+               use_l2_regularizer=True):
+  """Builds a residual block whose shortcut goes through a 1x1 conv and BN.
+
+  Main path: 1x1 conv -> BN -> relu -> `kernel_size` conv (strided, 'same'
+  padding) -> BN -> relu -> 1x1 conv -> BN. The shortcut applies a 1x1 conv
+  with the same `strides` followed by BN, so that both branches can be added
+  before the final relu.
+
+  Args:
+    input_tensor: input tensor
+    kernel_size: the kernel size of the middle conv layer on the main path
+    filters: sequence of three integers, the filter counts of the three conv
+      layers on the main path
+    stage: integer, current stage label, used for generating layer names
+    block: 'a','b'..., current block label, used for generating layer names
+    strides: Strides for the second conv layer in the block and for the
+      shortcut conv.
+    use_l2_regularizer: whether to use L2 regularizer on Conv layer.
+
+  Returns:
+    Output tensor for the block.
+  """
+  n_reduce, n_middle, n_expand = filters
+  bn_axis = 3 if backend.image_data_format() == 'channels_last' else 1
+  name_tag = str(stage) + block + '_branch'
+  conv_name_base = 'res' + name_tag
+  bn_name_base = 'bn' + name_tag
+
+  def _conv(num_filters, size, suffix, **extra):
+    # Bias-free conv; a fresh regularizer is requested for every layer.
+    return layers.Conv2D(
+        num_filters,
+        size,
+        use_bias=False,
+        kernel_initializer='he_normal',
+        kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+        name=conv_name_base + suffix,
+        **extra)
+
+  def _batch_norm(suffix):
+    return layers.BatchNormalization(
+        axis=bn_axis,
+        momentum=BATCH_NORM_DECAY,
+        epsilon=BATCH_NORM_EPSILON,
+        name=bn_name_base + suffix)
+
+  x = _conv(n_reduce, (1, 1), '2a')(input_tensor)
+  x = _batch_norm('2a')(x)
+  x = layers.Activation('relu')(x)
+
+  x = _conv(n_middle, kernel_size, '2b', strides=strides, padding='same')(x)
+  x = _batch_norm('2b')(x)
+  x = layers.Activation('relu')(x)
+
+  x = _conv(n_expand, (1, 1), '2c')(x)
+  x = _batch_norm('2c')(x)
+
+  # Projection shortcut, built after the main path.
+  shortcut = _conv(n_expand, (1, 1), '1', strides=strides)(input_tensor)
+  shortcut = _batch_norm('1')(shortcut)
+
+  x = layers.add([x, shortcut])
+  x = layers.Activation('relu')(x)
+  return x
+
+
+def resnet50(num_classes,
+             batch_size=None,
+             use_l2_regularizer=True,
+             rescale_inputs=False):
+  """Instantiates the ResNet50 architecture.
+
+  The network takes 224x224x3 inputs and consists of a 7x7 strided conv stem
+  with max pooling, four stages of one `conv_block` followed by 2, 3, 5 and 2
+  `identity_block`s respectively, a spatial mean, and a softmax classifier.
+
+  Args:
+    num_classes: `int` number of classes for image classification.
+    batch_size: Size of the batches for each step. Accepted for interface
+      compatibility; it is not read by this function.
+    use_l2_regularizer: whether to use L2 regularizer on Conv/Dense layer.
+    rescale_inputs: whether the inputs are in the range [0, 1]; if so they
+      are multiplied by 255 and `imagenet_preprocessing.CHANNEL_MEANS` is
+      subtracted before the first layer.
+
+  Returns:
+      A Keras model instance.
+  """
+  img_input = layers.Input(shape=(224, 224, 3))
+  x = img_input
+  if rescale_inputs:
+    # Hub image modules expect inputs in the range [0, 1]. This rescales these
+    # inputs to the range expected by the trained model.
+    x = layers.Lambda(
+        lambda x: x * 255.0 - backend.constant(
+            imagenet_preprocessing.CHANNEL_MEANS,
+            shape=[1, 1, 3],
+            dtype=x.dtype),
+        name='rescale')(
+            img_input)
+
+  channels_first = backend.image_data_format() == 'channels_first'
+  if channels_first:
+    # The input layer is always NHWC; move channels in front when required.
+    x = layers.Lambda(
+        lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
+        name='transpose')(x)
+  bn_axis = 1 if channels_first else 3
+
+  # Stem.
+  x = layers.ZeroPadding2D(padding=(3, 3), name='conv1_pad')(x)
+  x = layers.Conv2D(
+      64, (7, 7),
+      strides=(2, 2),
+      padding='valid',
+      use_bias=False,
+      kernel_initializer='he_normal',
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name='conv1')(
+          x)
+  x = layers.BatchNormalization(
+      axis=bn_axis,
+      momentum=BATCH_NORM_DECAY,
+      epsilon=BATCH_NORM_EPSILON,
+      name='bn_conv1')(
+          x)
+  x = layers.Activation('relu')(x)
+  x = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(x)
+
+  # (stage, filters, identity block labels, strides of the leading conv_block)
+  stage_specs = (
+      (2, [64, 64, 256], 'bc', (1, 1)),
+      (3, [128, 128, 512], 'bcd', (2, 2)),
+      (4, [256, 256, 1024], 'bcdef', (2, 2)),
+      (5, [512, 512, 2048], 'bc', (2, 2)),
+  )
+  for stage, stage_filters, identity_labels, first_strides in stage_specs:
+    x = conv_block(
+        x,
+        3,
+        stage_filters,
+        stage=stage,
+        block='a',
+        strides=first_strides,
+        use_l2_regularizer=use_l2_regularizer)
+    for label in identity_labels:
+      x = identity_block(
+          x,
+          3,
+          stage_filters,
+          stage=stage,
+          block=label,
+          use_l2_regularizer=use_l2_regularizer)
+
+  # Average over the two spatial axes, whose position depends on data format.
+  rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
+  x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
+  x = layers.Dense(
+      num_classes,
+      kernel_initializer=initializers.RandomNormal(stddev=0.01),
+      kernel_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      bias_regularizer=_gen_l2_regularizer(use_l2_regularizer),
+      name='fc1000')(
+          x)
+  # A softmax that is followed by the model loss cannot be done in float16
+  # due to numeric issues, so dtype=float32 is passed explicitly.
+  x = layers.Activation('softmax', dtype='float32')(x)
+
+  # Create model.
+  return models.Model(img_input, x, name='resnet50')
--- a/resnet_runnable.py
+++ b/resnet_runnable.py
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Runs a ResNet model on the ImageNet dataset using custom training loops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl import flags
+from absl import logging
+
+import tensorflow as tf
+
+from tf2_common.training import standard_runnable
+from tf2_common.training import utils
+from tf2_common.utils.flags import core as flags_core
+from tf2_common.utils.mlp_log import mlp_log
+import common
+import imagenet_preprocessing
+import resnet_model
+
+flags.DEFINE_boolean(
+    name='trace_warmup',
+    default=False,
+    help='Whether or not to programmatically capture an Xprof trace in the '
+    'warmup loop.')
+
+
+class _UnwrapPreventer(object):
+  """Opaque holder that keeps DistributionStrategy from unwrapping a value.
+
+  When moving from a cross-replica context into a replica context through
+  `call_for_each_replica`, DistributionStrategy normally unwraps the values
+  it is given. Instances of this class are not unwrapped, so placing a value
+  in the `value` slot carries it through untouched.
+
+  TODO(reedwm): Find/implement a better way of preventing values from being
+  unwrapped by DistributionStrategy
+  """
+
+  # Single slot; instances carry no per-instance `__dict__`.
+  __slots__ = ['value']
+
+  def __init__(self, value):
+    # Stored as-is; no copy or validation is performed.
+    self.value = value
+
+
+class ResnetRunnable(standard_runnable.StandardRunnableWithWarmup):
+  """Implements the training and evaluation APIs for Resnet model."""
+
+  def __init__(self, flags_obj, time_callback):
+    """Builds the model, optimizer, metrics and bookkeeping for training.
+
+    Args:
+      flags_obj: Parsed flag values; read for batch size, dtype, optimizer,
+        label smoothing, profiling and gradient-accumulation settings.
+      time_callback: Time tracking callback stored on the instance.
+
+    Raises:
+      ValueError: If the global batch size is not divisible by the number of
+        replicas, or if `fp16_implementation` is 'graph_rewrite' while
+        `use_tf_function` is false.
+    """
+    standard_runnable.StandardRunnableWithWarmup.__init__(
+        self,
+        flags_obj.use_tf_while_loop,
+        flags_obj.use_tf_function)
+
+    # Uses whichever strategy is current at construction time.
+    self.strategy = tf.distribute.get_strategy()
+    self.flags_obj = flags_obj
+    self.dtype = flags_core.get_tf_dtype(flags_obj)
+    self.time_callback = time_callback
+
+    # Input pipeline related
+    batch_size = flags_obj.batch_size
+    if batch_size % self.strategy.num_replicas_in_sync != 0:
+      raise ValueError(
+          'Batch size must be divisible by number of replicas : {}'.format(
+              self.strategy.num_replicas_in_sync))
+
+    steps_per_epoch, train_epochs = common.get_num_train_iterations(flags_obj)
+    # When more than one epoch is computed, the raw flag value is used instead.
+    # NOTE(review): the reason for overriding the computed value is not
+    # visible here; confirm against `common.get_num_train_iterations`.
+    if train_epochs > 1:
+      train_epochs = flags_obj.train_epochs
+
+    # As auto rebatching is not supported in
+    # `experimental_distribute_datasets_from_function()` API, which is
+    # required when cloning dataset to multiple workers in eager mode,
+    # we use per-replica batch size.
+    self.batch_size = int(batch_size / self.strategy.num_replicas_in_sync)
+
+    # The synthetic input function is always built; `build_synthetic_dataset`
+    # uses it regardless of `use_synthetic_data`.
+    self.synthetic_input_fn = common.get_synth_input_fn(
+        height=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
+        width=imagenet_preprocessing.DEFAULT_IMAGE_SIZE,
+        num_channels=imagenet_preprocessing.NUM_CHANNELS,
+        num_classes=self.flags_obj.num_classes,
+        dtype=self.dtype,
+        drop_remainder=True)
+
+    if self.flags_obj.use_synthetic_data:
+      self.input_fn = self.synthetic_input_fn
+    else:
+      self.input_fn = imagenet_preprocessing.input_fn
+
+    resnet_model.change_keras_layer(flags_obj.use_tf_keras_layers)
+    # Per-layer Keras L2 regularizers are disabled when `single_l2_loss_op`
+    # is set.
+    self.model = resnet_model.resnet50(
+        num_classes=self.flags_obj.num_classes,
+        batch_size=flags_obj.batch_size,
+        use_l2_regularizer=not flags_obj.single_l2_loss_op)
+
+    self.use_lars_optimizer = False
+    self.num_accumulation_steps = self.flags_obj.num_accumulation_steps
+    if self.flags_obj.optimizer == 'LARS':
+      self.use_lars_optimizer = True
+    # The second value returned by `get_optimizer` is discarded.
+    self.optimizer, _ = common.get_optimizer(
+        flags_obj=flags_obj,
+        steps_per_epoch=steps_per_epoch,
+        train_steps=steps_per_epoch * train_epochs)
+    # Make sure iterations variable is created inside scope.
+    self.global_step = self.optimizer.iterations
+
+    if self.dtype == tf.float16:
+      print("enter fp16 computing")
+      # Fixed (non-dynamic) loss scaling; the scale defaults to 128 for fp16.
+      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
+      self.optimizer = (
+          tf.keras.mixed_precision.LossScaleOptimizer(
+              self.optimizer, dynamic=False, initial_scale=loss_scale))
+    elif flags_obj.fp16_implementation == 'graph_rewrite':
+      # `dtype` is still float32 in this case. We built the graph in float32
+      # and let the graph rewrite change parts of it float16.
+      if not flags_obj.use_tf_function:
+        raise ValueError('--fp16_implementation=graph_rewrite requires '
+                         '--use_tf_function to be true')
+      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
+      self.optimizer = (
+          tf.train.experimental.enable_mixed_precision_graph_rewrite(
+              self.optimizer, loss_scale))
+
+    # One-hot labels (and categorical metrics) are used only when label
+    # smoothing is enabled with a positive value.
+    self.one_hot = False
+    self.label_smoothing = flags_obj.label_smoothing
+    if self.label_smoothing and self.label_smoothing > 0:
+      self.one_hot = True
+
+    # Train metrics and the test loss are optional; they stay None unless
+    # `report_accuracy_metrics` is set.
+    if flags_obj.report_accuracy_metrics:
+      self.train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
+      if self.one_hot:
+        self.train_accuracy = tf.keras.metrics.CategoricalAccuracy(
+            'train_accuracy', dtype=tf.float32)
+      else:
+        self.train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+            'train_accuracy', dtype=tf.float32)
+      self.test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
+    else:
+      self.train_loss = None
+      self.train_accuracy = None
+      self.test_loss = None
+
+    # The test accuracy metric is always created.
+    if self.one_hot:
+      self.test_accuracy = tf.keras.metrics.CategoricalAccuracy(
+          'test_accuracy', dtype=tf.float32)
+    else:
+      self.test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
+          'test_accuracy', dtype=tf.float32)
+    # self.test_corrects = tf.keras.metrics.Sum(
+    #     'test_corrects', dtype=tf.float32)
+    self.num_eval_steps = common.get_num_eval_steps(flags_obj)
+
+    self.checkpoint = tf.train.Checkpoint(
+        model=self.model, optimizer=self.optimizer)
+
+    # Handling epochs.
+    self.epoch_steps = steps_per_epoch
+    self.epoch_helper = utils.EpochHelper(steps_per_epoch, self.global_step)
+
+    self.steps_per_loop = flags_obj.steps_per_loop
+    # `profile_steps` is a comma-separated string: the first value is the
+    # trace start step and the second the trace end step. A negative start is
+    # stored as None here.
+    # NOTE(review): a single value without a comma would make
+    # `profile_steps[1]` raise IndexError; confirm that the flag is validated
+    # elsewhere.
+    profile_steps = flags_obj.profile_steps
+    if profile_steps:
+      profile_steps = [int(i) for i in profile_steps.split(',')]
+      self.trace_start_step = profile_steps[0] if profile_steps[0] >= 0 else None
+      self.trace_end_step = profile_steps[1]
+    else:
+      self.trace_start_step = None
+      self.trace_end_step = None
+
+    self.epochs_between_evals = flags_obj.epochs_between_evals
+    self.training_vars = self.model.trainable_variables
+    self.accum_grads = []
+    self.accum_grads_dtype = tf.float32
+
+    # With gradient accumulation enabled, create one zero-initialised,
+    # non-trainable float32 accumulator per trainable variable (ON_READ
+    # synchronization, SUM aggregation).
+    if self.num_accumulation_steps > 1:
+      for var in self.training_vars:
+        self.accum_grads.append(self.optimizer.add_weight(
+            name=var.name + '_accum',
+            shape=var.shape,
+            dtype=self.accum_grads_dtype,
+            initializer='zeros',
+            trainable=False,
+            synchronization=tf.VariableSynchronization.ON_READ,
+            aggregation=tf.VariableAggregation.SUM))
+
+  def build_train_dataset(self):
+    """Returns the distributed training dataset built from `self.input_fn`.
+
+    Uses the per-replica batch size and the training-specific remainder,
+    cache and prefetch flags.
+    """
+    cfg = self.flags_obj
+    dataset_kwargs = dict(
+        is_training=True,
+        data_dir=cfg.data_dir,
+        batch_size=self.batch_size,
+        datasets_num_private_threads=cfg.datasets_num_private_threads,
+        dtype=self.dtype,
+        drop_remainder=cfg.drop_train_remainder,
+        tf_data_experimental_slack=cfg.tf_data_experimental_slack,
+        dataset_cache=cfg.training_dataset_cache,
+        prefetch_batchs=cfg.training_prefetch_batchs)
+    return utils.make_distributed_dataset(
+        self.strategy, self.input_fn, **dataset_kwargs)
+
+  def build_eval_dataset(self):
+    """Returns the distributed evaluation dataset built from `self.input_fn`.
+
+    Uses the per-replica batch size and the eval-specific remainder, cache
+    and prefetch flags.
+    """
+    cfg = self.flags_obj
+    dataset_kwargs = dict(
+        is_training=False,
+        data_dir=cfg.data_dir,
+        batch_size=self.batch_size,
+        datasets_num_private_threads=cfg.datasets_num_private_threads,
+        dtype=self.dtype,
+        drop_remainder=cfg.drop_eval_remainder,
+        tf_data_experimental_slack=cfg.tf_data_experimental_slack,
+        dataset_cache=cfg.eval_dataset_cache,
+        prefetch_batchs=cfg.eval_prefetch_batchs)
+    return utils.make_distributed_dataset(
+        self.strategy, self.input_fn, **dataset_kwargs)
+
+  def build_synthetic_dataset(self):
+    """Returns a distributed dataset built from `self.synthetic_input_fn`.
+
+    Configured like the training dataset (training remainder, cache and
+    prefetch flags), but always fed by the synthetic input function.
+    """
+    cfg = self.flags_obj
+    dataset_kwargs = dict(
+        is_training=True,
+        data_dir=cfg.data_dir,
+        batch_size=self.batch_size,
+        datasets_num_private_threads=cfg.datasets_num_private_threads,
+        dtype=self.dtype,
+        drop_remainder=cfg.drop_train_remainder,
+        tf_data_experimental_slack=cfg.tf_data_experimental_slack,
+        dataset_cache=cfg.training_dataset_cache,
+        prefetch_batchs=cfg.training_prefetch_batchs)
+    return utils.make_distributed_dataset(
+        self.strategy, self.synthetic_input_fn, **dataset_kwargs)
+
+  def train_loop_begin(self):
+    """Prepares metrics, tracing and timing before a training loop runs.
+
+    Resets the training loss/accuracy metrics when they exist, calls
+    `self._epoch_begin()`, starts a trace if the configured start step falls
+    inside the upcoming loop `[global_step, global_step + steps_per_loop)`,
+    and notifies the time callback of the batch about to begin.
+    """
+    # Reset all metrics
+    if self.train_loss:
+      self.train_loss.reset_states()
+    if self.train_accuracy:
+      self.train_accuracy.reset_states()
+
+    self._epoch_begin()
+    # Compare against None rather than relying on truthiness: step 0 is a
+    # valid trace start, and a truthiness test would silently skip it.
+    if self.trace_start_step is not None:
+      global_step = self.global_step.numpy()
+      next_global_step = global_step + self.steps_per_loop
+      if global_step <= self.trace_start_step < next_global_step:
+        self.trace_start(global_step)
+
+    self.time_callback.on_batch_begin(self.epoch_helper.batch_index)
+
+  def train_step(self, iterator):
+    """See base class."""
+
+    @tf.function(experimental_compile=False)
+    def local_step(images, labels):
+      """Local computation of a step."""
+
+      with tf.GradientTape() as tape:
+        logits = self.model(images, training=True)
+
+        if self.one_hot:
+          prediction_loss = tf.keras.losses.categorical_crossentropy(
+              labels, logits, label_smoothing=self.label_smoothing)
+        else:
+          prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
+              labels, logits)
+        loss = tf.reduce_sum(prediction_loss) * (
+            1.0 / self.flags_obj.batch_size)
+
+        # Save ~3 seconds per epoch on GPU when skipping
+        # L2 loss computation; can only skip when using LARS
+        # Details in decription of cl/308018913
+        if not self.use_lars_optimizer:
+          num_replicas = self.strategy.num_replicas_in_sync
+
+          if self.flags_obj.single_l2_loss_op:
+            l2_loss = self.flags_obj.weight_decay * 2 * tf.add_n([
+                tf.nn.l2_loss(v)
+                for v in self.model.trainable_variables
+                if 'bn' not in v.name
+            ])
+
+            loss += (l2_loss / num_replicas)
+          else:
+            loss += (tf.reduce_sum(self.model.losses) / num_replicas)
+
+        # Scale the loss
+        if self.flags_obj.dtype == 'fp16':
+          loss = self.optimizer.get_scaled_loss(loss)
+
+      grads = tape.gradient(loss, self.model.trainable_variables)
+
+      # Unscale the grads
+      if self.flags_obj.dtype == 'fp16':
+        grads = self.optimizer.get_unscaled_gradients(grads)
+      
+      return logits, loss, grads
+
+    def _maybe_apply_grads_and_clear(distribution):
+      def _apply_grads_and_clear_for_each_replica():
+        local_replica_id = tf.get_static_value(
+            self.strategy.extended._get_local_replica_id(
+                tf.distribute.get_replica_context().replica_id_in_sync_group))
+        replica_accum_grads = []
+        for accum_grad, var in zip(self.accum_grads, self.training_vars):
+          local_accum_grad = self.strategy.experimental_local_results(
+              accum_grad)
+          replica_accum_grad = local_accum_grad[local_replica_id]
+          replica_accum_grad = tf.cast(replica_accum_grad, var.dtype)
+          replica_accum_grads.append(replica_accum_grad)
+
+        self.optimizer.apply_gradients(
+            zip(replica_accum_grads, self.training_vars))
+        for accum_grad in self.accum_grads:
+          accum_grad.assign(tf.zeros_like(accum_grad,
+                                          dtype=self.accum_grads_dtype),
+                            read_value=False)
+      def _apply_grads_and_clear():
+        distribution.extended.call_for_each_replica(
+            _apply_grads_and_clear_for_each_replica,
+            args=())
+        return self.optimizer.iterations.assign_add(0, read_value=False)
+
+      def _advance_iteration():
+        return self.optimizer.iterations.assign_add(1, read_value=False)
+
+      tf.cond(
+          tf.equal(self.optimizer.iterations % self.num_accumulation_steps,
+                   self.num_accumulation_steps - 1),
+          _apply_grads_and_clear,
+          _advance_iteration)
+
+    def step_fn(inputs):
+      """Function to run on the device."""
+      images, labels = inputs
+      logits, loss, grads = local_step(images, labels)
+
+      if self.num_accumulation_steps > 1:
+        for grad, accum_grad in zip(grads, self.accum_grads):
+          accum_grad.assign_add(tf.cast(grad, self.accum_grads_dtype),
+                                read_value=False)
+        tf.distribute.get_replica_context().merge_call(
+            _maybe_apply_grads_and_clear,
+            args=())
+      else:
+        self.optimizer.apply_gradients(zip(grads, self.training_vars))
+
+      if self.train_loss:
+        self.train_loss.update_state(loss)
+      if self.train_accuracy:
+        self.train_accuracy.update_state(labels, logits)
+
+    self.strategy.run(step_fn, args=(next(iterator),))
+
+  def train_loop_end(self):
+    """See base class."""
+    metrics = {}
+    if self.train_loss:
+      metrics['train_loss'] = self.train_loss.result()
+    if self.train_accuracy:
+      metrics['train_accuracy'] = self.train_accuracy.result()
+
+    self.time_callback.on_batch_end(self.epoch_helper.batch_index - 1)
+
+    if self.trace_end_step:
+      global_step = self.global_step.numpy()
+      next_global_step = global_step + self.steps_per_loop
+      if (global_step <= self.trace_end_step and
+          self.trace_end_step < next_global_step):
+        self.trace_end(global_step)
+
+    self._epoch_end()
+    return metrics
+
+  def eval_begin(self):
+    """See base class."""
+    if self.test_loss:
+      self.test_loss.reset_states()
+    if self.test_accuracy:
+      self.test_accuracy.reset_states()
+    # self.test_corrects.reset_states()
+
+    epoch_num = int(self.epoch_helper.current_epoch)
+    mlp_log.mlperf_print('eval_start', None,
+                         metadata={'epoch_num': epoch_num + 1})
+
+  def eval_step(self, iterator):
+    """See base class."""
+
+    def step_fn(inputs):
+      """Function to run on the device."""
+      images, labels = inputs
+      logits = self.model(images, training=False)
+
+      if self.test_loss:
+        if self.one_hot:
+          loss = tf.keras.losses.categorical_crossentropy(
+              labels, logits, label_smoothing=self.label_smoothing)
+        else:
+          loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
+        loss = tf.reduce_sum(loss) * (1.0 / self.flags_obj.batch_size)
+        self.test_loss.update_state(loss)
+
+      if self.test_accuracy:
+        self.test_accuracy.update_state(labels, logits)
+        # tf.print('labels.shape: ', labels.shape,
+        #          ', logits.shape: ', logits.shape,
+        #          ', result: ', self.test_accuracy.result())
+      # self.test_corrects.update_state(
+      #     tf.cast(
+      #         tf.reduce_sum(
+      #             tf.cast(
+      #                 tf.equal(
+      #                     tf.cast(tf.argmax(logits, axis=1), labels.dtype),
+      #                     labels), tf.int32)), tf.float32))
+
+    self.strategy.run(step_fn, args=(next(iterator),))
+
+  def eval_end(self):
+    """See base class."""
+    epoch_num = int(self.epoch_helper.current_epoch)
+    mlp_log.mlperf_print('eval_stop', None,
+                         metadata={'epoch_num': epoch_num + 1})
+
+    eval_accuracy = float(self.test_accuracy.result())
+    # eval_accuracy = float(self.test_corrects.result()
+    #                      ) / imagenet_preprocessing.NUM_IMAGES['validation']
+    # eval_accuracy = float(self.test_accuracy.result()) * \
+    #     self.flags_obj.batch_size * self.num_eval_steps / \
+    #     imagenet_preprocessing.NUM_IMAGES['validation']
+    mlp_log.mlperf_print(
+        'eval_accuracy', eval_accuracy, metadata={'epoch_num': epoch_num + 1})
+
+    first_epoch_num = max(epoch_num - self.epochs_between_evals + 1, 0)
+    epoch_count = self.epochs_between_evals
+    if first_epoch_num == 0:
+      epoch_count = self.flags_obj.eval_offset_epochs
+      if epoch_count == 0:
+        epoch_count = self.flags_obj.epochs_between_evals
+    mlp_log.mlperf_print(
+        'block_stop',
+        None,
+        metadata={
+            'first_epoch_num': first_epoch_num + 1,
+            'epoch_count': epoch_count
+        })
+
+    continue_training = True
+    if eval_accuracy >= self.flags_obj.target_accuracy:
+      continue_training = False
+    else:
+      mlp_log.mlperf_print(
+          'block_start',
+          None,
+          metadata={
+              'first_epoch_num': epoch_num + 2,
+              'epoch_count': self.epochs_between_evals
+          })
+
+    results = {}
+    if self.test_loss:
+      results['test_loss'] = self.test_loss.result()
+    if self.test_accuracy:
+      results['test_accuracy'] = self.test_accuracy.result()
+    results['continue_training'] = continue_training
+    return results
+
+  def warmup_loop_begin(self):
+    """See base class."""
+    if self.flags_obj.trace_warmup:
+      self.trace_start(-3)
+    logging.info('Entering the warmup loop.')
+
+  def warmup_loop_end(self):
+    """See base class."""
+    if self.flags_obj.trace_warmup:
+      self.trace_end(-2)
+    # Reset the state
+    self.model.reset_states()
+    tf.keras.backend.set_value(self.optimizer.iterations, 0)
+    for accum_grad in self.accum_grads:
+      accum_grad.assign(tf.zeros_like(accum_grad,
+                                      dtype=self.accum_grads_dtype),
+                        read_value=False)
+    logging.info('Exiting the warmup loop.')
+
+  def _epoch_begin(self):
+    if self.epoch_helper.epoch_begin():
+      self.time_callback.on_epoch_begin(self.epoch_helper.current_epoch)
+
+  def _epoch_end(self):
+    # mlp_log.mlperf_print('epoch_stop', None)
+    if self.epoch_helper.epoch_end():
+      self.time_callback.on_epoch_end(self.epoch_helper.current_epoch)
+
+  def trace_start(self, global_step):
+    logging.info('Starting tracing at step %d.', global_step)
+    tf.profiler.experimental.start(self.flags_obj.model_dir)
+
+  def trace_end(self, global_step):
+    logging.info('Ending trace at step %d', global_step)
+    tf.profiler.experimental.stop()
--- a/run_debug.sh
+++ b/run_debug.sh
+XLA_FLAGS="--xla_gpu_cuda_data_dir=/public/software/compiler/rocm/dtk-21.10.1/amdgcn/bitcode/ --xla_dump_hlo_pass_re=.* --xla_dump_hlo_as_html --xla_dump_to=./tmp" TF_DUMP_GRAPH_PREFIX="./tf_graph" hipprof --hip-trace python3 ./resnet_ctl_imagenet_main.py \
+--base_learning_rate=10.0 \
+--batch_size=32 \
+--nocache_decoded_image \
+--data_dir=/public/software/apps/DeepLearning/Data/ImageNet-tensorflow \
+--device_warmup_steps=1 \
+--dtype=fp32 \
+--noenable_checkpoint_and_export \
+--noenable_device_warmup \
+--enable_eager \
+--epochs_between_evals=4 \
+--noeval_dataset_cache \
+--eval_offset_epochs=2 \
+--label_smoothing=0.1 \
+--lars_epsilon=0 \
+--log_steps=125 \
+--lr_schedule=polynomial \
+--optimizer=LARS \
+--noreport_accuracy_metrics \
+--single_l2_loss_op \
+--steps_per_loop=25 \
+--train_epochs=1 \
+--notraining_dataset_cache \
+--notrace_warmup \
+--nouse_synthetic_data \
+--use_tf_function \
+--verbosity=0 \
+--warmup_epochs=5 \
+--weight_decay=0.0002 \
+--target_accuracy=0.759 \
+--momentum=0.9 \
+--num_replicas=64 \
+--num_accumulation_steps=2 \
+--num_classes=1000 \
+--noskip_eval
+
--- a/tf2_common/modeling/__pycache__/performance.cpython-36.pyc
+++ b/tf2_common/modeling/__pycache__/performance.cpython-36.pyc
--- a/tf2_common/modeling/__pycache__/performance.cpython-38.pyc
+++ b/tf2_common/modeling/__pycache__/performance.cpython-38.pyc
--- a/tf2_common/modeling/performance.py
+++ b/tf2_common/modeling/performance.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functions and classes related to training performance."""
+
+import tensorflow as tf
+
+
+def configure_optimizer(optimizer,
+                        use_float16=False,
+                        use_graph_rewrite=False,
+                        loss_scale="dynamic"):
+  """Configures optimizer object with performance options."""
+  if use_float16:
+    # Wraps optimizer with a LossScaleOptimizer. This is done automatically
+    # in compile() with the "mixed_float16" policy, but since we do not call
+    # compile(), we must wrap the optimizer manually.
+    optimizer = (
+        tf.keras.mixed_precision.experimental.LossScaleOptimizer(
+            optimizer, loss_scale=loss_scale))
+  if use_graph_rewrite:
+    # Note: the model dtype must be 'float32', which will ensure
+    # tf.ckeras.mixed_precision and
+    # tf.train.experimental.enable_mixed_precision_graph_rewrite do not double
+    # up.
+    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
+        optimizer)
+  return optimizer
+
+
+def set_mixed_precision_policy(dtype, loss_scale=None):
+  """Sets mix precision policy."""
+  if dtype == tf.float16:
+    print("enter the tf.float16 set policy")
+    policy = tf.keras.mixed_precision.experimental.Policy(
+        'mixed_float16', loss_scale=loss_scale)
+    tf.keras.mixed_precision.experimental.set_policy(policy)
+    print('Compute dtype: %s' % policy.compute_dtype)
+    print('Variable dtype: %s' % policy.variable_dtype)
+#    tf.keras.mixed_precision.experimental.set_policy('float16')
+  elif dtype == tf.bfloat16:
+    policy = tf.keras.mixed_precision.experimental.Policy(
+        'mixed_bfloat16')
+    tf.keras.mixed_precision.experimental.set_policy(policy)
+  elif dtype == tf.float32:
+    tf.keras.mixed_precision.experimental.set_policy('float32')
+  else:
+    raise ValueError("Unexpected dtype: %s" % dtype)
--- a/tf2_common/training/__pycache__/controller.cpython-36.pyc
+++ b/tf2_common/training/__pycache__/controller.cpython-36.pyc
--- a/tf2_common/training/__pycache__/controller.cpython-38.pyc
+++ b/tf2_common/training/__pycache__/controller.cpython-38.pyc
--- a/tf2_common/training/__pycache__/runnable.cpython-36.pyc
+++ b/tf2_common/training/__pycache__/runnable.cpython-36.pyc
--- a/tf2_common/training/__pycache__/runnable.cpython-38.pyc
+++ b/tf2_common/training/__pycache__/runnable.cpython-38.pyc