"vscode:/vscode.git/clone" did not exist on "368ad0d38ca8bc90437e2792dd1a7e6dcf50e0b7"
Unverified commit af47736d authored by Igor, committed by GitHub

Add benchmarks with the --cloning flag to ResNet and NCF. (#6675)

* Add benchmarks with the --cloning flag to ResNet and NCF.

* Renamed cloning to clone_model_in_keras_dist_strat. Dropped a few tests that aren't essential.

* Fixed up the formatting after renaming the flag to a much longer name. Thanks, lint.

* Fixed the lint error in ncf_common.py
parent d087c89b
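
At a high level, the change threads one boolean from each model's flag definitions into Keras compile(), where the experimental cloning argument chooses between the default path that clones the model under a distribution strategy and the experimental no-cloning path. Below is a minimal, self-contained sketch of the code path these diffs exercise; it assumes a contemporaneous tf-nightly build in which tf.keras.Model.compile still accepts the experimental cloning keyword (as the diffs themselves do), and the toy model and data are illustrative only.

import numpy as np
import tensorflow as tf

# Mirror the benchmarks' default distribution strategy with an explicit one.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  model = tf.keras.Sequential(
      [tf.keras.layers.Dense(2, input_shape=(4,), activation='softmax')])
  model.compile(optimizer='sgd',
                loss='sparse_categorical_crossentropy',
                metrics=['sparse_categorical_accuracy'],
                cloning=False)  # experimental no-cloning code path

# Toy data, only to show that the compiled model trains on this path.
features = np.random.rand(32, 4).astype('float32')
labels = np.random.randint(2, size=(32,))
model.fit(features, labels, epochs=1, batch_size=8)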
@@ -104,6 +104,8 @@ def parse_flags(flags_obj):
       "epsilon": flags_obj.epsilon,
       "match_mlperf": flags_obj.ml_perf,
       "use_xla_for_gpu": flags_obj.use_xla_for_gpu,
+      "clone_model_in_keras_dist_strat":
+          flags_obj.clone_model_in_keras_dist_strat,
       "epochs_between_evals": FLAGS.epochs_between_evals,
       "turn_off_distribution_strategy": FLAGS.turn_off_distribution_strategy,
   }
@@ -312,6 +314,13 @@ def define_ncf_flags():
   def xla_validator(flag_dict):
     return not flag_dict["use_xla_for_gpu"] or not flag_dict["tpu"]
 
+  flags.DEFINE_bool(
+      name="clone_model_in_keras_dist_strat",
+      default=True,
+      help=flags_core.help_wrap(
+          'If False, then the experimental code path is used that doesn\'t '
+          "clone models for distribution."))
+
 
 def convert_to_softmax_logits(logits):
   '''Convert the logits returned by the base model to softmax logits.
......
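
For readers unfamiliar with the flag plumbing above: flags.DEFINE_bool registers the flag with absl, and parse_flags copies its value into the params dict that run_ncf later consumes. A self-contained sketch of that flow (plain help text stands in for the repo's flags_core.help_wrap helper; parse_flags_sketch is a hypothetical stand-in for parse_flags):

from absl import flags

flags.DEFINE_bool(
    name='clone_model_in_keras_dist_strat', default=True,
    help="If False, then the experimental code path is used that doesn't "
         'clone models for distribution.')

FLAGS = flags.FLAGS
FLAGS(['ncf'])  # parse with defaults, the way a test harness would


def parse_flags_sketch(flags_obj):
  # Copy the flag value into the params dict consumed by the model runner.
  return {
      'clone_model_in_keras_dist_strat':
          flags_obj.clone_model_in_keras_dist_strat,
  }


assert parse_flags_sketch(FLAGS)['clone_model_in_keras_dist_strat'] is True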
@@ -113,11 +113,22 @@ class KerasNCFRealData(KerasNCFBenchmarkBase):
     self._setup()
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cloning(self):
+    self._setup()
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_2_gpus(self):
     self._setup()
     FLAGS.num_gpus = 2
     self._run_and_report_benchmark()
 
+  def benchmark_2_gpus_no_cloning(self):
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
 
 class KerasNCFSyntheticData(KerasNCFBenchmarkBase):
   """Benchmark NCF model using synthetic data."""
@@ -155,7 +166,18 @@ class KerasNCFSyntheticData(KerasNCFBenchmarkBase):
     self._setup()
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cloning(self):
+    self._setup()
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_2_gpus(self):
     self._setup()
     FLAGS.num_gpus = 2
     self._run_and_report_benchmark()
+
+  def benchmark_2_gpus_no_cloning(self):
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
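
All of the benchmark methods above follow one pattern: _setup() resets global state, the method overrides a few module-level absl FLAGS, and a shared runner does the actual work, so every no-cloning variant is just the same benchmark with one flag flipped. A hypothetical, stripped-down sketch of that pattern (BenchmarkSketch and its print body are stand-ins, not the repo's KerasNCFBenchmarkBase):

from absl import flags

flags.DEFINE_integer('num_gpus', 1, 'Number of GPUs to benchmark with.')
flags.DEFINE_bool('clone_model_in_keras_dist_strat', True,
                  'If False, use the experimental no-cloning code path.')
FLAGS = flags.FLAGS
FLAGS(['benchmark'])  # parse defaults so FLAGS attributes can be assigned


class BenchmarkSketch(object):
  """Hypothetical stand-in for the benchmark base classes in this diff."""

  def _setup(self):
    # Restore defaults so one benchmark method cannot leak into another.
    FLAGS.num_gpus = 1
    FLAGS.clone_model_in_keras_dist_strat = True

  def _run_and_report_benchmark(self):
    # The real classes build, train, and report wall time and accuracy here.
    print('num_gpus=%d, cloning=%s' %
          (FLAGS.num_gpus, FLAGS.clone_model_in_keras_dist_strat))

  def benchmark_2_gpus_no_cloning(self):
    self._setup()
    FLAGS.num_gpus = 2
    FLAGS.clone_model_in_keras_dist_strat = False
    self._run_and_report_benchmark()


BenchmarkSketch().benchmark_2_gpus_no_cloning()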
@@ -238,7 +238,8 @@ def run_ncf(_):
   keras_model.compile(
       loss=_keras_loss,
       metrics=[_get_metric_fn(params)],
-      optimizer=optimizer)
+      optimizer=optimizer,
+      cloning=params["clone_model_in_keras_dist_strat"])
 
   history = keras_model.fit(train_input_dataset,
                             epochs=FLAGS.train_epochs,
......
@@ -92,6 +92,19 @@ class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
     FLAGS.enable_eager = True
     self._run_and_report_benchmark()
 
+  def benchmark_2_gpu_no_cloning(self):
+    """Test keras based model with eager, distributed no-cloning."""
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.data_dir = self.data_dir
+    FLAGS.batch_size = 128
+    FLAGS.train_epochs = 182
+    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu_no_cloning')
+    FLAGS.dtype = 'fp32'
+    FLAGS.clone_model_in_keras_dist_strat = False
+    FLAGS.enable_eager = True
+    self._run_and_report_benchmark()
+
   def benchmark_graph_2_gpu(self):
     """Test keras based model with Keras fit and distribution strategies."""
     self._setup()
@@ -198,6 +211,16 @@ class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 128 * 2  # 2 GPUs
     self._run_and_report_benchmark()
 
+  def benchmark_2_gpu_no_cloning(self):
+    self._setup()
+    FLAGS.num_gpus = 2
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu_no_cloning')
+    FLAGS.batch_size = 128 * 2  # 2 GPUs
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_graph_2_gpu(self):
     self._setup()
     FLAGS.num_gpus = 2
......
@@ -291,6 +291,9 @@ def define_keras_flags():
       'help improve performance using EagerIterator and function. The codepath '
       'when enabling this feature is experimental and will be removed once the '
       'corresponding performance features are fully supported in TensorFlow.')
+  flags.DEFINE_boolean(name='clone_model_in_keras_dist_strat', default=True,
+                       help='If False, then the experimental code path is used'
+                            ' that doesn\'t clone models for distribution.')
 
 
 def get_synth_input_fn(height, width, num_channels, num_classes,
......
@@ -201,6 +201,18 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 128
     self._run_and_report_benchmark()
 
+  def benchmark_1_gpu_no_cloning(self):
+    """Test Keras model with 1 GPU and no-cloning."""
+    self._setup()
+    FLAGS.num_gpus = 1
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_cloning')
+    FLAGS.batch_size = 128
+    FLAGS.clone_model_in_keras_dist_strat = False
+    self._run_and_report_benchmark()
+
   def benchmark_xla_1_gpu(self):
     """Test Keras model with XLA and 1 GPU."""
     self._setup()
@@ -356,6 +368,18 @@ class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
     FLAGS.batch_size = 128 * 8  # 8 GPUs
     self._run_and_report_benchmark()
 
+  def benchmark_8_gpu_no_cloning(self):
+    """Test Keras model with 8 GPUs and no-cloning."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_eager = True
+    FLAGS.distribution_strategy = 'default'
+    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_no_cloning')
+    FLAGS.clone_model_in_keras_dist_strat = False
+    FLAGS.batch_size = 128 * 8  # 8 GPUs
+    self._run_and_report_benchmark()
+
   def benchmark_8_gpu_tweaked(self):
     """Test Keras model with manual config tuning and 8 GPUs."""
     self._setup()
......
@@ -197,7 +197,8 @@ def run(flags_obj):
   model.compile(loss='sparse_categorical_crossentropy',
                 optimizer=optimizer,
-                metrics=['sparse_categorical_accuracy'])
+                metrics=['sparse_categorical_accuracy'],
+                cloning=flags_obj.clone_model_in_keras_dist_strat)
 
   callbacks = keras_common.get_callbacks(
       learning_rate_schedule, imagenet_main.NUM_IMAGES['train'])
......
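
Since clone_model_in_keras_dist_strat is an absl boolean flag, absl also generates a negated command-line form, so the experimental path can be selected either as --clone_model_in_keras_dist_strat=false or as --noclone_model_in_keras_dist_strat. A hypothetical minimal driver showing that command-line surface (driver.py is illustrative; it is not a file touched by this change):

from absl import app, flags

flags.DEFINE_bool('clone_model_in_keras_dist_strat', True,
                  'If False, use the experimental no-cloning code path.')


def main(_):
  # Example invocation: python driver.py --noclone_model_in_keras_dist_strat
  print('cloning enabled:', flags.FLAGS.clone_model_in_keras_dist_strat)


if __name__ == '__main__':
  app.run(main)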