"mmdet3d/vscode:/vscode.git/clone" did not exist on "fc301b98fa634ff83dcc5ce31cc743bf4ee41eda"
Commit a87bb185 authored by Ruomei Yan's avatar Ruomei Yan
Browse files

Address comments from Aug 7

parent 20101930
@@ -905,7 +905,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-FLAGS.batch_size = 128 * 8
+FLAGS.batch_size = 128 * 8  # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_amp(self):
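These benchmark methods compute the global batch size as the per-GPU batch size times the replica count under MirroredStrategy; the new `# 8 GPUs` comments make the hard-coded multiplier explicit. A minimal sketch of the same convention (the 128 per-GPU figure is from this file; everything else is illustrative):

  import tensorflow as tf

  strategy = tf.distribute.MirroredStrategy()  # one replica per visible GPU
  per_gpu_batch_size = 128
  # Equivalent to the hard-coded `128 * 8` above when 8 GPUs are visible.
  global_batch_size = per_gpu_batch_size * strategy.num_replicas_in_sync
  print('global batch size:', global_batch_size)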
@@ -918,7 +918,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.fp16_implementation = 'graph_rewrite'
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_tweaked(self):
@@ -929,7 +929,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
-FLAGS.batch_size = 128 * 8
+FLAGS.batch_size = 128 * 8  # 8 GPUs
FLAGS.datasets_num_private_threads = 14
self._run_and_report_benchmark()
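`datasets_num_private_threads` gives the tf.data input pipeline its own thread pool so host-side preprocessing does not contend with the training loop for threads. A sketch of the underlying tf.data option (the option name is from TF 2.x-era `tf.data.Options`; the dataset itself is illustrative):

  import tensorflow as tf

  options = tf.data.Options()
  # Reserve a private pool of 14 threads for this input pipeline,
  # mirroring FLAGS.datasets_num_private_threads = 14 above.
  options.experimental_threading.private_threadpool_size = 14
  dataset = tf.data.Dataset.range(1024).batch(32).with_options(options)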
@@ -942,7 +942,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
-FLAGS.batch_size = 128 * 8
+FLAGS.batch_size = 128 * 8  # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_amp(self):
@@ -956,7 +956,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_amp')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_tweaked(self):
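`enable_xla` turns on XLA JIT compilation, and the `_amp` variants use the fp16 graph-rewrite path set by `fp16_implementation = 'graph_rewrite'`. A sketch of the TF 2.x-era APIs these flags roughly map to (the optimizer is illustrative; `enable_mixed_precision_graph_rewrite` was later deprecated in favor of Keras mixed-precision policies):

  import tensorflow as tf

  tf.config.optimizer.set_jit(True)  # rough equivalent of FLAGS.enable_xla

  optimizer = tf.keras.optimizers.SGD(0.1, momentum=0.9)
  # fp16 "graph_rewrite" implementation: wraps the optimizer and rewrites
  # the graph to compute in float16 with automatic loss scaling.
  optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
      optimizer)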
@@ -982,7 +982,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
self._run_and_report_benchmark()
def benchmark_8_gpu_fp16_tweaked(self):
@@ -994,7 +994,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_tweaked')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 40
self._run_and_report_benchmark()
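`tf_gpu_thread_mode = 'gpu_private'` corresponds to the TF_GPU_THREAD_MODE environment variable, which gives each GPU its own dedicated host threads instead of sharing a global pool. A sketch (the thread count is illustrative; both variables must be set before TensorFlow initializes its devices):

  import os

  # Must be set before TensorFlow creates its GPU devices.
  os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
  os.environ['TF_GPU_THREAD_COUNT'] = '2'  # dedicated threads per GPU

  import tensorflow as tf  # imported only after the environment is configured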
@@ -1009,7 +1009,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_8_gpu_fp16_dynamic_tweaked')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 40
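`loss_scale = 'dynamic'` lets the loss scale grow and shrink during fp16 training instead of staying at a fixed power of two. A sketch using the TF 2.3-era experimental wrapper (newer TF exposes this as `tf.keras.mixed_precision.LossScaleOptimizer`; the optimizer is illustrative):

  import tensorflow as tf

  optimizer = tf.keras.optimizers.SGD(0.1, momentum=0.9)
  # Dynamic loss scaling: the scale is raised after a run of finite steps
  # and halved whenever gradients overflow to inf/NaN.
  optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
      optimizer, loss_scale='dynamic')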
@@ -1025,7 +1025,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
self._run_and_report_benchmark()
def benchmark_xla_8_gpu_fp16_tweaked(self):
@@ -1038,7 +1038,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16_tweaked')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
self._run_and_report_benchmark()
@@ -1074,7 +1074,7 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir(
'benchmark_xla_8_gpu_fp16_dynamic_tweaked')
-FLAGS.batch_size = 256 * 8
+FLAGS.batch_size = 256 * 8  # 8 GPUs
FLAGS.loss_scale = 'dynamic'
FLAGS.tf_gpu_thread_mode = 'gpu_private'
FLAGS.datasets_num_private_threads = 48
......
@@ -26,8 +26,6 @@ from absl import flags
from absl import logging
import tensorflow as tf
-import tensorflow_model_optimization as tfmot
from official.modeling import performance
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
@@ -40,34 +38,28 @@ from official.vision.image_classification.resnet import resnet_model
def cluster_last_three_conv2d_layers(model):
+  import tensorflow_model_optimization as tfmot
  last_three_conv2d_layers = [
      layer for layer in model.layers
-      if isinstance(layer, tf.keras.layers.Conv2D) and
-      not isinstance(layer, tf.keras.layers.DepthwiseConv2D)
-  ]
-  last_three_conv2d_layers = last_three_conv2d_layers[-3:]
+      if isinstance(layer, tf.keras.layers.Conv2D)
+  ][-3:]
  cluster_weights = tfmot.clustering.keras.cluster_weights
  CentroidInitialization = tfmot.clustering.keras.CentroidInitialization
-  clustering_params1 = {
-      'number_of_clusters': 256,
-      'cluster_centroids_init': CentroidInitialization.LINEAR
-  }
-  clustering_params2 = {
-      'number_of_clusters': 32,
-      'cluster_centroids_init': CentroidInitialization.LINEAR
-  }

  def cluster_fn(layer):
    if layer not in last_three_conv2d_layers:
      return layer
-    if layer == last_three_conv2d_layers[0] or layer == last_three_conv2d_layers[1]:
-      clustered = cluster_weights(layer, **clustering_params1)
-      print("Clustered {} with {} clusters".format(layer.name, clustering_params1['number_of_clusters']))
+    if layer == last_three_conv2d_layers[0] or \
+        layer == last_three_conv2d_layers[1]:
+      clustered = cluster_weights(layer, number_of_clusters=256,
+                                  cluster_centroids_init=CentroidInitialization.LINEAR)
+      print("Clustered {} with 256 clusters".format(layer.name))
    else:
-      clustered = cluster_weights(layer, **clustering_params2)
-      print("Clustered {} with {} clusters".format(layer.name, clustering_params2['number_of_clusters']))
+      clustered = cluster_weights(layer, number_of_clusters=32,
+                                  cluster_centroids_init=CentroidInitialization.LINEAR)
+      print("Clustered {} with 32 clusters".format(layer.name))
    return clustered

  return tf.keras.models.clone_model(model, clone_function=cluster_fn)
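Usage sketch for the helper above: `clone_model` returns a new model whose last three Conv2D layers are wrapped in tfmot clustering wrappers (256 clusters for the two earlier layers, 32 for the final one), and the clustered clone must be recompiled before fine-tuning. The MobileNet instantiation and compile arguments here are illustrative, not from this commit:

  import tensorflow as tf

  model = tf.keras.applications.mobilenet.MobileNet(weights=None)
  clustered_model = cluster_last_three_conv2d_layers(model)
  # Clustering wrappers add bookkeeping variables, so the clone must be
  # recompiled before calling fit().
  clustered_model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=tf.keras.optimizers.SGD(learning_rate=1e-5, momentum=0.9),
      metrics=['sparse_categorical_accuracy'])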
@@ -151,7 +143,8 @@ def run(flags_obj):
# This use_keras_image_data_format flag indicates whether the image
# preprocessor output format should be the same as the Keras backend image
# data format, or just channels-last.
-use_keras_image_data_format = (flags_obj.model == 'mobilenet' or 'mobilenet_pretrained')
+use_keras_image_data_format = (
+    flags_obj.model in ('mobilenet', 'mobilenet_pretrained'))
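Note the membership test: the superficially similar `flags_obj.model == 'mobilenet' or 'mobilenet_pretrained'` is always truthy, because a non-empty string literal is true on its own and short-circuits the `or`. A minimal, self-contained demonstration of the pitfall:

  model_name = 'resnet50_v1.5'
  buggy = (model_name == 'mobilenet' or 'mobilenet_pretrained')
  fixed = model_name in ('mobilenet', 'mobilenet_pretrained')
  print(bool(buggy))  # True -- the bare string makes the `or` truthy
  print(fixed)        # False -- what the comparison actually intended

The same pattern appears in the optimizer and model-selection branches below and is corrected the same way.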
train_input_dataset = input_fn(
is_training=True,
data_dir=flags_obj.data_dir,
@@ -183,15 +176,17 @@ def run(flags_obj):
boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
multipliers=list(p[0] for p in common.LR_SCHEDULE),
compute_lr_on_cpu=True)
-steps_per_epoch = (imagenet_preprocessing.NUM_IMAGES['train'] //
-                   flags_obj.batch_size)
+steps_per_epoch = (
+    imagenet_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
with strategy_scope:
if flags_obj.optimizer == 'resnet50_default':
optimizer = common.get_optimizer(lr_schedule)
-elif flags_obj.optimizer == 'mobilenet_default':
+elif flags_obj.optimizer in ('mobilenet_default', 'mobilenet_fine_tune'):
initial_learning_rate = \
flags_obj.initial_learning_rate_per_sample * flags_obj.batch_size
+if flags_obj.optimizer == 'mobilenet_fine_tune':
+  initial_learning_rate = 1e-5
optimizer = tf.keras.optimizers.SGD(
learning_rate=tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate,
@@ -199,8 +194,6 @@ def run(flags_obj):
decay_rate=flags_obj.lr_decay_factor,
staircase=True),
momentum=0.9)
-elif flags_obj.optimizer == 'mobilenet_fine_tune':
-  optimizer = tf.keras.optimizers.SGD(learning_rate=1e-5, momentum=0.9)
if flags_obj.fp16_implementation == 'graph_rewrite':
# Note: when flags_obj.fp16_implementation == "graph_rewrite", dtype as
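Both MobileNet optimizer branches now share one SGD optimizer whose learning rate follows a staircase exponential decay; fine-tuning only overrides the initial rate to 1e-5. A self-contained sketch of the schedule being built (all constants illustrative):

  import tensorflow as tf

  initial_learning_rate = 0.045  # e.g. per-sample rate * global batch size
  lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
      initial_learning_rate,
      decay_steps=2500,   # decay epochs * steps_per_epoch in the script
      decay_rate=0.94,    # FLAGS.lr_decay_factor
      staircase=True)     # step-wise rather than continuous decay
  optimizer = tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.9)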
@@ -217,24 +210,25 @@ def run(flags_obj):
elif flags_obj.model == 'resnet50_v1.5':
model = resnet_model.resnet50(
num_classes=imagenet_preprocessing.NUM_CLASSES)
-elif flags_obj.model == 'mobilenet':
+elif flags_obj.model in ('mobilenet', 'mobilenet_pretrained'):
# TODO(kimjaehong): Remove layers attribute when minimum TF version
# support 2.0 layers by default.
+if flags_obj.model == 'mobilenet_pretrained':
+  classes_labels = 1000
+  initial_weights = 'imagenet'
+else:
+  classes_labels = imagenet_preprocessing.NUM_CLASSES
+  initial_weights = None
model = tf.keras.applications.mobilenet.MobileNet(
-    weights=None,
-    classes=imagenet_preprocessing.NUM_CLASSES,
-    layers=tf.keras.layers)
-elif flags_obj.model == 'mobilenet_pretrained':
-  model = tf.keras.applications.mobilenet.MobileNet(
-      dropout=1e-7,
-      weights='imagenet',
-      classes=1000,
+    weights=initial_weights,
+    classes=classes_labels,
layers=tf.keras.layers)
if flags_obj.pretrained_filepath:
model.load_weights(flags_obj.pretrained_filepath)
if flags_obj.pruning_method == 'polynomial_decay':
+import tensorflow_model_optimization as tfmot
if dtype != tf.float32:
raise NotImplementedError(
'Pruning is currently only supported on dtype=tf.float32.')
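For reference, `polynomial_decay` pruning with tfmot looks roughly like this; the layer sizes and schedule constants are illustrative, not this script's:

  import numpy as np
  import tensorflow as tf
  import tensorflow_model_optimization as tfmot

  pruning_params = {
      'pruning_schedule': tfmot.sparsity.keras.PolynomialDecay(
          initial_sparsity=0.0, final_sparsity=0.5,
          begin_step=0, end_step=1000),
  }
  model = tf.keras.Sequential([tf.keras.layers.Dense(10, input_shape=(20,))])
  pruned_model = tfmot.sparsity.keras.prune_low_magnitude(
      model, **pruning_params)
  pruned_model.compile(optimizer='sgd', loss='mse')
  # The pruning step counter is advanced by a mandatory callback.
  pruned_model.fit(np.random.rand(64, 20), np.random.rand(64, 10),
                   callbacks=[tfmot.sparsity.keras.UpdatePruningStep()],
                   verbose=0)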
@@ -252,7 +246,9 @@ def run(flags_obj):
raise NotImplementedError('Only polynomial_decay is currently supported.')
if flags_obj.clustering_method == 'selective_clustering':
-if dtype != tf.float32 or flags_obj.fp16_implementation == 'graph_rewrite':
+import tensorflow_model_optimization as tfmot
+if dtype != tf.float32 or \
+    flags_obj.fp16_implementation == 'graph_rewrite':
raise NotImplementedError(
'Clustering is currently only supported on dtype=tf.float32.')
model = cluster_last_three_conv2d_layers(model)
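One step this diff does not show: before exporting, the clustering wrappers are normally removed so the saved model contains plain layers whose weights keep their clustered values. A sketch, assuming `clustered_model` was produced by the helper above (the filename is illustrative):

  import tensorflow_model_optimization as tfmot

  final_model = tfmot.clustering.keras.strip_clustering(clustered_model)
  final_model.save('clustered_mobilenet.h5')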
@@ -260,11 +256,12 @@ def run(flags_obj):
raise NotImplementedError(
'Only selective_clustering is implemented.')
-model.compile(loss='sparse_categorical_crossentropy',
-              optimizer=optimizer,
-              metrics=(['sparse_categorical_accuracy']
-                       if flags_obj.report_accuracy_metrics else None),
-              run_eagerly=flags_obj.run_eagerly)
+model.compile(
+    loss='sparse_categorical_crossentropy',
+    optimizer=optimizer,
+    metrics=(['sparse_categorical_accuracy']
+             if flags_obj.report_accuracy_metrics else None),
+    run_eagerly=flags_obj.run_eagerly)
train_epochs = flags_obj.train_epochs
@@ -278,8 +275,8 @@ def run(flags_obj):
steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
train_epochs = 1
-num_eval_steps = (imagenet_preprocessing.NUM_IMAGES['validation'] //
-                  flags_obj.batch_size)
+num_eval_steps = (
+    imagenet_preprocessing.NUM_IMAGES['validation'] // flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
@@ -336,9 +333,10 @@ def run(flags_obj):
def define_imagenet_keras_flags():
-common.define_keras_flags(model=True,
-                          optimizer=True,
-                          pretrained_filepath=True)
+common.define_keras_flags(
+    model=True,
+    optimizer=True,
+    pretrained_filepath=True)
common.define_pruning_flags()
common.define_clustering_flags()
flags_core.set_defaults()
@@ -354,4 +352,4 @@ def main(_):
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_imagenet_keras_flags()
-app.run(main)
\ No newline at end of file
+app.run(main)
@@ -250,14 +250,14 @@ def define_keras_flags(dynamic_loss_scale=True,
default=False,
help='Whether to enable Tensorboard callback.')
flags.DEFINE_string(
-    name='profile_steps',
-    default=None,
-    help='Save profiling data to model dir at given range of global steps. The '
-    'value must be a comma separated pair of positive integers, specifying '
-    'the first and last step to profile. For example, "--profile_steps=2,4" '
-    'triggers the profiler to process 3 steps, starting from the 2nd step. '
-    'Note that profiler has a non-trivial performance overhead, and the '
-    'output file can be gigantic if profiling many steps.')
+    name='profile_steps', default=None,
+    help='Save profiling data to model dir at given range of global steps. '
+    'The value must be a comma separated pair of positive integers, '
+    'specifying the first and last step to profile. For example, '
+    '"--profile_steps=2,4" triggers the profiler to process 3 steps, '
+    'starting from the 2nd step. Note that the profiler has a non-trivial '
+    'performance overhead, and the output file can be gigantic if '
+    'profiling many steps.')
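The help text above defines the `start,stop` format, but the parsing happens elsewhere in the codebase. A sketch of a validator consistent with the documented contract (`parse_profile_steps` is a hypothetical helper, not this module's):

  def parse_profile_steps(value):
    """Parses '2,4' into (start, stop); both must be positive and ordered."""
    parts = value.split(',')
    if len(parts) != 2:
      raise ValueError('profile_steps must be two comma separated integers')
    start, stop = int(parts[0]), int(parts[1])
    if start < 1 or stop < start:
      raise ValueError('profile_steps requires 1 <= start <= stop')
    return start, stop

  assert parse_profile_steps('2,4') == (2, 4)  # profiles steps 2, 3 and 4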
flags.DEFINE_integer(
name='train_steps',
default=None,
......