Unverified Commit 28863de1 authored by Yanhui Liang, committed by GitHub

Add distribution strategy to keras benchmark (#5188)

* Add distribution strategy to keras benchmark

* Fix comments

* Fix lints
parent 6a0dda1f
@@ -19,10 +19,19 @@ Synthetic dataset is used for the benchmark.
 Two custom callbacks are provided for model benchmarking: ExamplesPerSecondCallback and LoggingMetricCallback. For each callback, `epoch_based` and `batch_based` options are available to set the benchmark level. Check [model_callbacks.py](model_callbacks.py) for more details.
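For a concrete picture of what a `batch_based` callback involves, here is a minimal illustrative sketch; the class name and details below are assumptions for exposition, and the real implementations live in [model_callbacks.py](model_callbacks.py):
```python
# Illustrative sketch only; see model_callbacks.py for the real code.
import time
import tensorflow as tf


class ExamplesPerSecondSketch(tf.keras.callbacks.Callback):
  """Logs average examples/sec every N batches (batch_based level)."""

  def __init__(self, batch_size, log_every_n_batches=100):
    super(ExamplesPerSecondSketch, self).__init__()
    self._batch_size = batch_size
    self._log_every_n = log_every_n_batches
    self._train_start = None
    self._batches_seen = 0

  def on_train_begin(self, logs=None):
    self._train_start = time.time()

  def on_batch_end(self, batch, logs=None):
    self._batches_seen += 1
    if self._batches_seen % self._log_every_n == 0:
      elapsed = time.time() - self._train_start
      tf.logging.info(
          "examples/sec (avg): %.1f",
          self._batches_seen * self._batch_size / elapsed)
```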
 ## Running Code
-To benchmark a model, use `--model` to specify the model name, and issue the following command:
+To benchmark a model, use `--model` to specify the model name. To run the benchmark with eager execution, issue the following command:
 ```
-python benchmark_main.py --model=resnet
+python benchmark_main.py --model resnet50 --eager
 ```
+Note that if eager execution is enabled, only one GPU is utilized even if multiple GPUs are provided and multi_gpu_model is used.
+To use a distribution strategy in the benchmark, run the following:
+```
+python benchmark_main.py --model resnet50 --dist_strat
+```
+Only one of the `--eager` and `--dist_strat` arguments can be set, as DistributionStrategy does not currently support eager execution.
 Arguments:
 * `--model`: The model to benchmark. Valid model names are the keys of `MODELS` in [benchmark_main.py](benchmark_main.py).
 * `--callbacks`: A list of callbacks to run during the benchmark (see the example invocation below).
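Putting the documented flags together, a typical invocation might look like the following; the GPU count here is only an example, and defaults are as defined in [benchmark_main.py](benchmark_main.py):
```
python benchmark_main.py --model resnet50 --dist_strat --num_gpus 2 \
    --callbacks ExamplesPerSecondCallback,LoggingMetricCallback
```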
......
@@ -28,6 +28,7 @@ from official.keras_application_models import dataset
 from official.keras_application_models import model_callbacks
 from official.utils.flags import core as flags_core
 from official.utils.logs import logger
+from official.utils.misc import distribution_utils

 # Define a dictionary that maps model names to their model classes inside Keras
 MODELS = {
@@ -41,9 +42,8 @@ MODELS = {
     "densenet121": tf.keras.applications.DenseNet121,
     "densenet169": tf.keras.applications.DenseNet169,
     "densenet201": tf.keras.applications.DenseNet201,
-    # TODO(b/80431378)
-    # "nasnetlarge": tf.keras.applications.NASNetLarge,
-    # "nasnetmobile": tf.keras.applications.NASNetMobile,
+    "nasnetlarge": tf.keras.applications.NASNetLarge,
+    "nasnetmobile": tf.keras.applications.NASNetMobile,
 }
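For context, the benchmark presumably builds the selected model by looking up its class in `MODELS` and instantiating it. A minimal sketch of that pattern follows; the `weights`/`classes` arguments are assumptions for illustration, not taken from benchmark_main.py:
```python
import tensorflow as tf

# Hypothetical one-entry lookup mirroring the MODELS dictionary above.
MODELS = {"resnet50": tf.keras.applications.ResNet50}


def build_model(model_name, num_classes=1000):
  # Instantiate the selected Keras application with randomly
  # initialized weights (no pretrained checkpoint download).
  model_class = MODELS[model_name]
  return model_class(weights=None, classes=num_classes)


model = build_model("resnet50")
```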
@@ -76,28 +76,39 @@ def run_keras_model_benchmark(_):
   else:
     raise ValueError("Only synthetic dataset is supported!")

-  # If run with multiple GPUs
-  num_gpus = flags_core.get_num_gpus(FLAGS)
-  if num_gpus > 1:
+  num_gpus = flags_core.get_num_gpus(FLAGS)
+  distribution = None
+  # Use distribution strategy
+  if FLAGS.dist_strat:
+    distribution = distribution_utils.get_distribution_strategy(
+        num_gpus=num_gpus)
+  elif num_gpus > 1:
+    # Run with multi_gpu_model
     # If eager execution is enabled, only one GPU is utilized even if multiple
     # GPUs are provided.
     if FLAGS.eager:
       tf.logging.warning(
           "{} GPUs are provided, but only one GPU is utilized as "
           "eager execution is enabled.".format(num_gpus))
     model = tf.keras.utils.multi_gpu_model(model, gpus=num_gpus)

+  # Adam and some other optimizers don't work well with distribution
+  # strategy (b/113076709), so use GradientDescentOptimizer here.
+  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
   model.compile(loss="categorical_crossentropy",
-                optimizer=tf.train.AdamOptimizer(),
-                metrics=["accuracy"])
+                optimizer=optimizer,
+                metrics=["accuracy"],
+                distribute=distribution)

   # Create benchmark logger for benchmark logging
   run_params = {
       "batch_size": FLAGS.batch_size,
       "synthetic_data": FLAGS.use_synthetic_data,
       "train_epochs": FLAGS.train_epochs,
-      "num_train_images": FLAGS.num_images,
-      "num_eval_images": FLAGS.num_images,
+      "num_train_images": FLAGS.num_train_images,
+      "num_eval_images": FLAGS.num_eval_images,
   }

   benchmark_logger = logger.get_benchmark_logger()
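The helper `distribution_utils.get_distribution_strategy` comes from `official/utils/misc`. As a hedged sketch of the usual TF 1.x pattern (the actual helper may differ in body and signature), it maps the GPU count to a strategy roughly like this:
```python
import tensorflow as tf


def get_distribution_strategy_sketch(num_gpus):
  """Rough approximation: choose a strategy based on the GPU count."""
  if num_gpus == 0:
    # No GPUs: place everything on the CPU.
    return tf.contrib.distribute.OneDeviceStrategy("/device:CPU:0")
  if num_gpus == 1:
    # One GPU: single-device strategy, no replication.
    return tf.contrib.distribute.OneDeviceStrategy("/device:GPU:0")
  # Multiple GPUs: mirror variables across devices.
  return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
```
The resulting strategy is handed to Keras through the `distribute` argument of `model.compile`, which was the TF 1.x integration point for DistributionStrategy.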
@@ -118,8 +129,8 @@ def run_keras_model_benchmark(_):
       epochs=FLAGS.train_epochs,
       callbacks=callbacks,
       validation_data=val_dataset,
-      steps_per_epoch=int(np.ceil(FLAGS.num_images / FLAGS.batch_size)),
-      validation_steps=int(np.ceil(FLAGS.num_images / FLAGS.batch_size))
+      steps_per_epoch=int(np.ceil(FLAGS.num_train_images / FLAGS.batch_size)),
+      validation_steps=int(np.ceil(FLAGS.num_eval_images / FLAGS.batch_size))
   )

   tf.logging.info("Logging the evaluation results...")
@@ -128,7 +139,7 @@ def run_keras_model_benchmark(_):
         "accuracy": history.history["val_acc"][epoch],
         "loss": history.history["val_loss"][epoch],
         tf.GraphKeys.GLOBAL_STEP: (epoch + 1) * np.ceil(
-            FLAGS.num_images / FLAGS.batch_size)
+            FLAGS.num_eval_images / FLAGS.batch_size)
     }
     benchmark_logger.log_evaluation_result(eval_results)
@@ -157,10 +168,16 @@ def define_keras_benchmark_flags():
           "Model to be benchmarked."))

   flags.DEFINE_integer(
-      name="num_images", default=1000,
+      name="num_train_images", default=1000,
       help=flags_core.help_wrap(
-          "The number of synthetic images for training and evaluation. The "
-          "default value is 1000."))
+          "The number of synthetic images for training. The default value is "
+          "1000."))
+
+  flags.DEFINE_integer(
+      name="num_eval_images", default=50,
+      help=flags_core.help_wrap(
+          "The number of synthetic images for evaluation. The default value "
+          "is 50."))

   flags.DEFINE_boolean(
       name="eager", default=False, help=flags_core.help_wrap(
@@ -168,6 +185,12 @@ def define_keras_benchmark_flags():
           "only one GPU is utilized even if multiple GPUs are provided and "
           "multi_gpu_model is used."))

+  flags.DEFINE_boolean(
+      name="dist_strat", default=False, help=flags_core.help_wrap(
+          "To enable distribution strategy for model training and evaluation. "
+          "The number of GPUs used by the distribution strategy can be set "
+          "with the --num_gpus argument."))
+
   flags.DEFINE_list(
       name="callbacks",
       default=["ExamplesPerSecondCallback", "LoggingMetricCallback"],
@@ -176,6 +199,15 @@ def define_keras_benchmark_flags():
           "callbacks. For example: `--callbacks ExamplesPerSecondCallback,"
           "LoggingMetricCallback`"))

+  @flags.multi_flags_validator(
+      ["eager", "dist_strat"],
+      message="Both --eager and --dist_strat were set. Only one can be "
+              "set, as DistributionStrategy is not currently supported in "
+              "eager execution.")
+  # pylint: disable=unused-variable
+  def _check_eager_dist_strat(flag_dict):
+    return not (flag_dict["eager"] and flag_dict["dist_strat"])


 def main(_):
   with logger.benchmark_context(FLAGS):
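The validator above simply returns `False` when both flags are set, which makes absl flag parsing fail with the given message. A self-contained toy showing the same pattern (this standalone script is illustrative, not part of the benchmark):
```python
from absl import app, flags

FLAGS = flags.FLAGS
flags.DEFINE_boolean("eager", False, "Enable eager execution.")
flags.DEFINE_boolean("dist_strat", False, "Enable a distribution strategy.")


@flags.multi_flags_validator(
    ["eager", "dist_strat"],
    message="Only one of --eager and --dist_strat can be set.")
def _check_flags(flag_dict):
  # Returning False makes parsing raise flags.IllegalFlagValueError.
  return not (flag_dict["eager"] and flag_dict["dist_strat"])


def main(_):
  print("eager=%s, dist_strat=%s" % (FLAGS.eager, FLAGS.dist_strat))


if __name__ == "__main__":
  app.run(main)
```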
......