"vscode:/vscode.git/clone" did not exist on "368ad0d38ca8bc90437e2792dd1a7e6dcf50e0b7"
Unverified commit af47736d authored by Igor, committed by GitHub

Add benchmarks with the --cloning flag to ResNet and NCF. (#6675)

* Add benchmarks with the --cloning flag to ResNet and NCF.

* Renamed cloning to clone_model_in_keras_dist_strat. Dropped a few tests that aren't essential.

* Fixed up the formatting after renaming the flag to a much longer name. Thanks, lint.

* Fixed the lint error in ncf_common.py
parent d087c89b
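
At a high level, the change threads one boolean from each model's flag definitions into Keras compile(), where the experimental cloning argument chooses between the default path that clones the model under a distribution strategy and the experimental no-cloning path. Below is a minimal, self-contained sketch of the code path these diffs exercise; it assumes a contemporaneous tf-nightly build in which tf.keras.Model.compile still accepts the experimental cloning keyword (as the diffs themselves do), and the toy model and data are illustrative only.

import numpy as np
import tensorflow as tf

# Mirror the benchmarks' default distribution strategy with an explicit one.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = tf.keras.Sequential(
      [tf.keras.layers.Dense(2, input_shape=(4,), activation='softmax')])
  model.compile(optimizer='sgd',
                loss='sparse_categorical_crossentropy',
                metrics=['sparse_categorical_accuracy'],
                cloning=False)  # experimental no-cloning code path

# Toy data, only to show that the compiled model trains on this path.
features = np.random.rand(32, 4).astype('float32')
labels = np.random.randint(2, size=(32,))
model.fit(features, labels, epochs=1, batch_size=8)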
@@ -104,6 +104,8 @@ def parse_flags(flags_obj):
       "epsilon": flags_obj.epsilon,
       "match_mlperf": flags_obj.ml_perf,
       "use_xla_for_gpu": flags_obj.use_xla_for_gpu,
+      "clone_model_in_keras_dist_strat":
+          flags_obj.clone_model_in_keras_dist_strat,
       "epochs_between_evals": FLAGS.epochs_between_evals,
       "turn_off_distribution_strategy": FLAGS.turn_off_distribution_strategy,
   }
@@ -312,6 +314,13 @@ def define_ncf_flags():
   def xla_validator(flag_dict):
     return not flag_dict["use_xla_for_gpu"] or not flag_dict["tpu"]
 
+  flags.DEFINE_bool(
+      name="clone_model_in_keras_dist_strat",
+      default=True,
+      help=flags_core.help_wrap(
+          'If False, then the experimental code path is used that doesn\'t '
+          "clone models for distribution."))
+
 
 def convert_to_softmax_logits(logits):
   '''Convert the logits returned by the base model to softmax logits.
......
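
For readers unfamiliar with the flag plumbing above: flags.DEFINE_bool registers the flag with absl, and parse_flags copies its value into the params dict that run_ncf later consumes. A self-contained sketch of that flow (plain help text stands in for the repo's flags_core.help_wrap helper; parse_flags_sketch is a hypothetical stand-in for parse_flags):

from absl import flags

flags.DEFINE_bool(
    name='clone_model_in_keras_dist_strat', default=True,
    help="If False, then the experimental code path is used that doesn't "
         'clone models for distribution.')

FLAGS = flags.FLAGS
FLAGS(['ncf'])  # parse with defaults, the way a test harness would


def parse_flags_sketch(flags_obj):
  # Copy the flag value into the params dict consumed by the model runner.
  return {
      'clone_model_in_keras_dist_strat':
          flags_obj.clone_model_in_keras_dist_strat,
  }


assert parse_flags_sketch(FLAGS)['clone_model_in_keras_dist_strat'] is True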
@@ -113,11 +113,22 @@ class KerasNCFRealData(KerasNCFBenchmarkBase):
     self._setup()
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cloning(self):
+    self._setup()
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_2_gpus(self):
     self._setup()
     FLAGS.num_gpus = 2
     self._run_and_report_benchmark()
 
+  def benchmark_2_gpus_no_cloning(self):
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
 
 class KerasNCFSyntheticData(KerasNCFBenchmarkBase):
   """Benchmark NCF model using synthetic data."""
@@ -155,7 +166,18 @@ class KerasNCFSyntheticData(KerasNCFBenchmarkBase):
     self._setup()
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cloning(self):
+    self._setup()
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_2_gpus(self):
     self._setup()
     FLAGS.num_gpus = 2
     self._run_and_report_benchmark()
+
+  def benchmark_2_gpus_no_cloning(self):
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
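
All of the benchmark methods above follow one pattern: _setup() resets global state, the method overrides a few module-level absl FLAGS, and a shared runner does the actual work, so every no-cloning variant is just the same benchmark with one flag flipped. A hypothetical, stripped-down sketch of that pattern (BenchmarkSketch and its print body are stand-ins, not the repo's KerasNCFBenchmarkBase):

from absl import flags

flags.DEFINE_integer('num_gpus', 1, 'Number of GPUs to benchmark with.')
flags.DEFINE_bool('clone_model_in_keras_dist_strat', True,
                  'If False, use the experimental no-cloning code path.')
FLAGS = flags.FLAGS
FLAGS(['benchmark'])  # parse defaults so FLAGS attributes can be assigned


class BenchmarkSketch(object):
  """Hypothetical stand-in for the benchmark base classes in this diff."""

  def _setup(self):
    # Restore defaults so one benchmark method cannot leak into another.
    FLAGS.num_gpus = 1
    FLAGS.clone_model_in_keras_dist_strat = True

  def _run_and_report_benchmark(self):
    # The real classes build, train, and report wall time and accuracy here.
    print('num_gpus=%d, cloning=%s' %
          (FLAGS.num_gpus, FLAGS.clone_model_in_keras_dist_strat))

  def benchmark_2_gpus_no_cloning(self):
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.clone_model_in_keras_dist_strat = False
    self._run_and_report_benchmark()


BenchmarkSketch().benchmark_2_gpus_no_cloning()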
@@ -238,7 +238,8 @@ def run_ncf(_):
   keras_model.compile(
       loss=_keras_loss,
       metrics=[_get_metric_fn(params)],
-      optimizer=optimizer)
+      optimizer=optimizer,
+      cloning=params["clone_model_in_keras_dist_strat"])
 
   history = keras_model.fit(train_input_dataset,
                             epochs=FLAGS.train_epochs,
......
@@ -92,6 +92,19 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.enable_eager = True
     self._run_and_report_benchmark()
 
+  def benchmark_2_gpu_no_cloning(self):
+    """Test keras based model with eager, distributed no-cloning."""
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 128
+    FLAGS.train_epochs = 182
+    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu_no_cloning')
+    FLAGS.dtype = 'fp32'
+    FLAGS.clone_model_in_keras_dist_strat = False
+    FLAGS.enable_eager = True
+    self._run_and_report_benchmark()
+
   def benchmark_graph_2_gpu(self):
     """Test keras based model with Keras fit and distribution strategies."""
     self._setup()
@@ -198,6 +211,16 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 128 * 2  # 2 GPUs
     self._run_and_report_benchmark()
 
+  def benchmark_2_gpu_no_cloning(self):
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu_no_cloning')
+    FLAGS.batch_size = 128 * 2  # 2 GPUs
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_graph_2_gpu(self):
     self._setup()
     FLAGS.num_gpus = 2
......
@@ -291,6 +291,9 @@ def define_keras_flags():
       'help improve performance using EagerIterator and function. The codepath '
       'when enabling this feature is experimental and will be removed once the '
       'corresponding performance features are fully supported in TensorFlow.')
+  flags.DEFINE_boolean(name='clone_model_in_keras_dist_strat', default=True,
+                       help='If False, then the experimental code path is used'
+                            ' that doesn\'t clone models for distribution.')
 
 
 def get_synth_input_fn(height, width, num_channels, num_classes,
......
@@ -201,6 +201,18 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 128
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cloning(self):
+    """Test Keras model with 1 GPU and no-cloning."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_cloning')
+    FLAGS.batch_size = 128
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_xla_1_gpu(self):
     """Test Keras model with XLA and 1 GPU."""
     self._setup()
@@ -356,6 +368,18 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 128 * 8  # 8 GPUs
     self._run_and_report_benchmark()
 
+  def benchmark_8_gpu_no_cloning(self):
+    """Test Keras model with 8 GPUs and no-cloning."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_no_cloning')
+    FLAGS.clone_model_in_keras_dist_strat = False
+    FLAGS.batch_size = 128 * 8  # 8 GPUs
+    self._run_and_report_benchmark()
+
   def benchmark_8_gpu_tweaked(self):
     """Test Keras model with manual config tuning and 8 GPUs."""
     self._setup()
......
@@ -197,7 +197,8 @@ def run(flags_obj):
   model.compile(loss='sparse_categorical_crossentropy',
                 optimizer=optimizer,
-                metrics=['sparse_categorical_accuracy'])
+                metrics=['sparse_categorical_accuracy'],
+                cloning=flags_obj.clone_model_in_keras_dist_strat)
 
   callbacks = keras_common.get_callbacks(
       learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])
......
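
Since clone_model_in_keras_dist_strat is an absl boolean flag, absl also generates a negated command-line form, so the experimental path can be selected either as --clone_model_in_keras_dist_strat=false or as --noclone_model_in_keras_dist_strat. A hypothetical minimal driver showing that command-line surface (driver.py is illustrative; it is not a file touched by this change):

from absl import app, flags

flags.DEFINE_bool('clone_model_in_keras_dist_strat', True,
                  'If False, use the experimental no-cloning code path.')


def main(_):
  # Example invocation: python driver.py --noclone_model_in_keras_dist_strat
  print('cloning enabled:', flags.FLAGS.clone_model_in_keras_dist_strat)


if __name__ == '__main__':
  app.run(main)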