Unverified Commit 28863de1 authored by Yanhui Liang, committed by GitHub

Add distribution strategy to keras benchmark (#5188)

* Add distribution strategy to keras benchmark

* Fix comments

* Fix lints
parent 6a0dda1f
@@ -19,10 +19,19 @@ Synthetic dataset is used for the benchmark.
 Two custom callbacks are provided for model benchmarking: ExamplesPerSecondCallback and LoggingMetricCallback. For each callback, `epoch_based` and `batch_based` options are available to set the benchmark level. Check [model_callbacks.py](model_callbacks.py) for more details.
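For a concrete picture of what a `batch_based` callback involves, here is a minimal illustrative sketch; the class name and details below are assumptions for exposition, and the real implementations live in [model_callbacks.py](model_callbacks.py):
```python
# Illustrative sketch only; see model_callbacks.py for the real code.
import time
import tensorflow as tf


class ExamplesPerSecondSketch(tf.keras.callbacks.Callback):
  """Logs average examples/sec every N batches (batch_based level)."""

  def __init__(self, batch_size, log_every_n_batches=100):
    super(ExamplesPerSecondSketch, self).__init__()
    self._batch_size = batch_size
    self._log_every_n = log_every_n_batches
    self._train_start = None
    self._batches_seen = 0

  def on_train_begin(self, logs=None):
    self._train_start = time.time()

  def on_batch_end(self, batch, logs=None):
    self._batches_seen += 1
    if self._batches_seen % self._log_every_n == 0:
      elapsed = time.time() - self._train_start
      tf.logging.info(
          "examples/sec (avg): %.1f",
          self._batches_seen * self._batch_size / elapsed)
```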
 ## Running Code
-To benchmark a model, use `--model` to specify the model name, and issue the following command:
+To benchmark a model, use `--model` to specify the model name. To run the benchmark with eager execution, issue the following command:
 ```
-python benchmark_main.py --model=resnet
+python benchmark_main.py --model resnet50 --eager
 ```
+Note that if eager execution is enabled, only one GPU is utilized even if multiple GPUs are provided and multi_gpu_model is used.
+To use a distribution strategy in the benchmark, run the following:
+```
+python benchmark_main.py --model resnet50 --dist_strat
+```
+Only one of the `--eager` and `--dist_strat` arguments can be set, as DistributionStrategy does not currently support eager execution.
 Arguments:
 * `--model`: The model to benchmark. Valid model names are the keys of `MODELS` in [benchmark_main.py](benchmark_main.py).
 * `--callbacks`: A list of callbacks to run during the benchmark (see the example invocation below).
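Putting the documented flags together, a typical invocation might look like the following; the GPU count here is only an example, and defaults are as defined in [benchmark_main.py](benchmark_main.py):
```
python benchmark_main.py --model resnet50 --dist_strat --num_gpus 2 \
    --callbacks ExamplesPerSecondCallback,LoggingMetricCallback
```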
......
@@ -28,6 +28,7 @@ from official.keras_application_models import dataset
 from official.keras_application_models import model_callbacks
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
+from official.utils.misc import distribution_utils

 # Define a dictionary that maps model names to their model classes inside Keras
 MODELS = {
@@ -41,9 +42,8 @@ MODELS = {
     "densenet121": tf.keras.applications.DenseNet121,
     "densenet169": tf.keras.applications.DenseNet169,
     "densenet201": tf.keras.applications.DenseNet201,
-    # TODO(b/80431378)
-    # "nasnetlarge": tf.keras.applications.NASNetLarge,
-    # "nasnetmobile": tf.keras.applications.NASNetMobile,
+    "nasnetlarge": tf.keras.applications.NASNetLarge,
+    "nasnetmobile": tf.keras.applications.NASNetMobile,
 }
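For context, the benchmark presumably builds the selected model by looking up its class in `MODELS` and instantiating it. A minimal sketch of that pattern follows; the `weights`/`classes` arguments are assumptions for illustration, not taken from benchmark_main.py:
```python
import tensorflow as tf

# Hypothetical one-entry lookup mirroring the MODELS dictionary above.
MODELS = {"resnet50": tf.keras.applications.ResNet50}


def build_model(model_name, num_classes=1000):
  # Instantiate the selected Keras application with randomly
  # initialized weights (no pretrained checkpoint download).
  model_class = MODELS[model_name]
  return model_class(weights=None, classes=num_classes)


model = build_model("resnet50")
```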
@@ -76,28 +76,39 @@ def run_keras_model_benchmark(_):
   else:
     raise ValueError("Only synthetic dataset is supported!")

-  # If run with multiple GPUs
-  num_gpus = flags_core.get_num_gpus(FLAGS)
-  if num_gpus > 1:
+  num_gpus = flags_core.get_num_gpus(FLAGS)
+  distribution = None
+  # Use distribution strategy
+  if FLAGS.dist_strat:
+    distribution = distribution_utils.get_distribution_strategy(
+        num_gpus=num_gpus)
+  elif num_gpus > 1:
+    # Run with multi_gpu_model
     # If eager execution is enabled, only one GPU is utilized even if multiple
     # GPUs are provided.
     if FLAGS.eager:
       tf.logging.warning(
           "{} GPUs are provided, but only one GPU is utilized as "
           "eager execution is enabled.".format(num_gpus))
     model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

+  # Adam and some other optimizers don't work well with distribution
+  # strategy (b/113076709), so use GradientDescentOptimizer here.
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
   model.compile(loss="categorical_crossentropy",
-                optimizer=tf.train.AdamOptimizer(),
-                metrics=["accuracy"])
+                optimizer=optimizer,
+                metrics=["accuracy"],
+                distribute=distribution)

   # Create benchmark logger for benchmark logging
   run_params = {
       "batch_size": FLAGS.batch_size,
       "synthetic_data": FLAGS.use_synthetic_data,
       "train_epochs": FLAGS.train_epochs,
-      "num_train_images": FLAGS.num_images,
-      "num_eval_images": FLAGS.num_images,
+      "num_train_images": FLAGS.num_train_images,
+      "num_eval_images": FLAGS.num_eval_images,
   }

   benchmark_logger = logger.get_benchmark_logger()
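The helper `distribution_utils.get_distribution_strategy` comes from `official/utils/misc`. As a hedged sketch of the usual TF 1.x pattern (the actual helper may differ in body and signature), it maps the GPU count to a strategy roughly like this:
```python
import tensorflow as tf


def get_distribution_strategy_sketch(num_gpus):
  """Rough approximation: choose a strategy based on the GPU count."""
  if num_gpus == 0:
    # No GPUs: place everything on the CPU.
    return tf.contrib.distribute.OneDeviceStrategy("/device:CPU:0")
  if num_gpus == 1:
    # One GPU: single-device strategy, no replication.
    return tf.contrib.distribute.OneDeviceStrategy("/device:GPU:0")
  # Multiple GPUs: mirror variables across devices.
  return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
```
The resulting strategy is handed to Keras through the `distribute` argument of `model.compile`, which was the TF 1.x integration point for DistributionStrategy.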
@@ -118,8 +129,8 @@ def run_keras_model_benchmark(_):
       epochs=FLAGS.train_epochs,
       callbacks=callbacks,
       validation_data=val_dataset,
-      steps_per_epoch=int(np.ceil(FLAGS.num_images / FLAGS.batch_size)),
-      validation_steps=int(np.ceil(FLAGS.num_images / FLAGS.batch_size))
+      steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)),
+      validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))
   )

   tf.logging.info("Logging the evaluation results...")
@@ -128,7 +139,7 @@ def run_keras_model_benchmark(_):
         "accuracy": history.history["val_acc"][epoch],
         "loss": history.history["val_loss"][epoch],
         tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
-            FLAGS.num_images / FLAGS.batch_size)
+            FLAGS.num_eval_images / FLAGS.batch_size)
     }
     benchmark_logger.log_evaluation_result(eval_results)
@@ -157,10 +168,16 @@ def define_keras_benchmark_flags():
           "Model to be benchmarked."))

   flags.DEFINE_integer(
-      name="num_images", default=1000,
+      name="num_train_images", default=1000,
       help=flags_core.help_wrap(
-          "The number of synthetic images for training and evaluation. The "
-          "default value is 1000."))
+          "The number of synthetic images for training. The default value is "
+          "1000."))
+
+  flags.DEFINE_integer(
+      name="num_eval_images", default=50,
+      help=flags_core.help_wrap(
+          "The number of synthetic images for evaluation. The default value "
+          "is 50."))

   flags.DEFINE_boolean(
       name="eager", default=False, help=flags_core.help_wrap(
@@ -168,6 +185,12 @@ def define_keras_benchmark_flags():
           "only one GPU is utilized even if multiple GPUs are provided and "
           "multi_gpu_model is used."))

+  flags.DEFINE_boolean(
+      name="dist_strat", default=False, help=flags_core.help_wrap(
+          "To enable distribution strategy for model training and evaluation. "
+          "The number of GPUs used by the distribution strategy can be set "
+          "with the --num_gpus argument."))
+
   flags.DEFINE_list(
       name="callbacks",
       default=["ExamplesPerSecondCallback", "LoggingMetricCallback"],
@@ -176,6 +199,15 @@ def define_keras_benchmark_flags():
           "callbacks. For example: `--callbacks ExamplesPerSecondCallback,"
           "LoggingMetricCallback`"))

+  @flags.multi_flags_validator(
+      ["eager", "dist_strat"],
+      message="Both --eager and --dist_strat were set. Only one can be "
+              "set, as DistributionStrategy is not currently supported in "
+              "eager execution.")
+  # pylint: disable=unused-variable
+  def _check_eager_dist_strat(flag_dict):
+    return not (flag_dict["eager"] and flag_dict["dist_strat"])


 def main(_):
   with logger.benchmark_context(FLAGS):
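The validator above simply returns `False` when both flags are set, which makes absl flag parsing fail with the given message. A self-contained toy showing the same pattern (this standalone script is illustrative, not part of the benchmark):
```python
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_boolean("eager", False, "Enable eager execution.")
flags.DEFINE_boolean("dist_strat", False, "Enable a distribution strategy.")


@flags.multi_flags_validator(
    ["eager", "dist_strat"],
    message="Only one of --eager and --dist_strat can be set.")
def _check_flags(flag_dict):
  # Returning False makes parsing raise flags.IllegalFlagValueError.
  return not (flag_dict["eager"] and flag_dict["dist_strat"])


def main(_):
  print("eager=%s, dist_strat=%s" % (FLAGS.eager, FLAGS.dist_strat))


if __name__ == "__main__":
  app.run(main)
```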
......