Unverified commit 18e477c6, authored by Toby Boyd and committed by GitHub

Reorder and then add CTL XLA tests. (#7169)

parent cf1a276a
@@ -117,8 +117,9 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     return metrics
 
-  def benchmark_1_gpu(self):
+  def benchmark_1_gpu_early_stop(self):
     self._setup()
+    FLAGS.early_stopping = True
     self._run_and_report_benchmark()
 
   def benchmark_1_gpu_no_dist_strat_early_stop(self):
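For readers landing on this commit without the file open: every benchmark method in this class follows the same flag-driven pattern: reset the flags, override the handful this test cares about, then train and report. A minimal sketch of that harness with hypothetical internals (the repo's actual base class differs):

from absl import flags
from absl.testing import flagsaver

FLAGS = flags.FLAGS

class NCFKerasBenchmarkBase(object):
  # Hypothetical sketch: each benchmark method mutates FLAGS, then runs.

  def _setup(self):
    # Roll all flags back to known defaults so overrides from a previous
    # benchmark method cannot leak into this one.
    if getattr(self, '_saved_flags', None) is None:
      self._saved_flags = flagsaver.save_flag_values()
    else:
      flagsaver.restore_flag_values(self._saved_flags)

  def _run_and_report_benchmark(self):
    # Train NCF with the current FLAGS values; report wall time and accuracy.
    raise NotImplementedError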
@@ -127,11 +128,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.early_stopping = True
     self._run_and_report_benchmark()
 
-  def benchmark_1_gpu_early_stop(self):
-    self._setup()
-    FLAGS.early_stopping = True
-    self._run_and_report_benchmark()
-
   def benchmark_1_gpu_no_dist_strat_run_eagerly_early_stop(self):
     self._setup()
     FLAGS.distribution_strategy = 'off'
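The early_stop variants end the run as soon as the evaluation metric clears the convergence target instead of training a fixed number of epochs. A sketch of that check (train_epoch_fn and eval_fn are stand-ins for the repo's training and evaluation steps; 0.635 is the MLPerf 0.5 HR@10 target for NCF):

def run_with_early_stopping(train_epoch_fn, eval_fn, max_epochs, target=0.635):
  # Train until the eval metric reaches `target` or `max_epochs` is hit.
  for epoch in range(max_epochs):
    train_epoch_fn()
    if eval_fn() >= target:
      return epoch + 1  # converged early; remaining epochs are skipped
  return max_epochs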
@@ -145,13 +141,6 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
-  # NCF with custom training loop. Works only in TF 2.0
-  def benchmark_1_gpu_ctl(self):
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    self._run_and_report_benchmark()
-
-  # NCF with custom training loop. Works only in TF 2.0
   def benchmark_1_gpu_ctl_early_stop(self):
     self._setup()
     FLAGS.keras_use_ctl = True
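keras_use_ctl swaps model.fit for a custom training loop (CTL), which is why these tests are marked TF 2.0 only. The core TF 2.x pattern behind that flag looks roughly like this (a sketch, not the repo's exact loop):

import tensorflow as tf

@tf.function
def train_step(model, optimizer, loss_fn, features, labels):
  # One CTL step: forward pass, gradient computation, weight update.
  with tf.GradientTape() as tape:
    predictions = model(features, training=True)
    loss = loss_fn(labels, predictions)
  grads = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return loss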
@@ -165,24 +154,12 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
-  def benchmark_2_gpus(self):
-    self._setup()
-    FLAGS.num_gpus = 2
-    self._run_and_report_benchmark()
-
   def benchmark_2_gpus_early_stop(self):
     self._setup()
     FLAGS.early_stopping = True
     FLAGS.num_gpus = 2
     self._run_and_report_benchmark()
 
-  def benchmark_2_gpus_ctl(self):
-    """NCF with custom training loop. Works only in TF 2.0."""
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.num_gpus = 2
-    self._run_and_report_benchmark()
-
   def benchmark_2_gpus_ctl_early_stop(self):
     """NCF with custom training loop. Works only in TF 2.0."""
     self._setup()
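num_gpus = 2 exercises data-parallel training; with the default distribution strategy enabled, variables are mirrored across devices. The standard TF 2.x setup this maps onto (illustrative, not the repo's wiring):

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()  # uses all visible GPUs
with strategy.scope():
  # Variables created inside the scope are replicated on every GPU.
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  model.compile(optimizer='adam', loss='mse')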
@@ -191,33 +168,31 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.num_gpus = 2
     self._run_and_report_benchmark()
 
-  def benchmark_1_gpu_ctl_mlperf_like(self):
-    """1-GPU test to compare Google implementation with MLPerf 0.5.
-
-    Using similar rules as MLPerf 0.5
-    - Using Google's convergence hparams as base for 1-GPU test.
-    - Fixed the number of epochs to 7, to remove the perf variance.
-    - MLPerf submission consistently converges in 7 epochs.
-    """
-    self._setup()
-    FLAGS.keras_use_ctl = True
-    FLAGS.train_epochs = 7
-    self._run_and_report_benchmark()
+  #############################################
+  # Tests below with mlperf in the test name are of two types:
+  # 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
+  # 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyperparameters.
+  #
+  # The purpose of both is to get a number to compare to existing results. To
+  # do this the number of epochs is held constant rather than racing to a
+  # given accuracy. The accuracy validation is done by the "early_stop" tests.
+  #############################################
 
   def benchmark_1_gpu_mlperf_like(self):
-    """1-GPU MLPerf like test with compile/fit version."""
+    """1 GPU using keras fit/compile."""
     self._setup()
     FLAGS.train_epochs = 7
     self._run_and_report_benchmark()
 
   def benchmark_1_gpu_no_dist_strat_mlperf_like(self):
-    """1-GPU MLPerf like test with compile/fit version without dist_strat."""
+    """1 GPU using compile/fit without dist_strat."""
     self._setup()
     FLAGS.train_epochs = 7
     FLAGS.distribution_strategy = 'off'
     self._run_and_report_benchmark()
 
   def benchmark_1_gpu_no_dist_strat_run_eagerly_mlperf_like(self):
+    """1 GPU using compile/fit without dist_strat and force run eager."""
     self._setup()
     FLAGS.train_epochs = 7
     FLAGS.distribution_strategy = 'off'
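The run_eagerly variants measure the overhead of skipping graph tracing entirely. In stock Keras that is a single compile switch (a sketch of the public API, not the repo's plumbing):

import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
# run_eagerly=True executes each training step as plain Python rather than
# as a compiled tf.function graph: handy for debugging, slow for benchmarks.
model.compile(optimizer='adam', loss='mse', run_eagerly=True)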
@@ -225,22 +200,30 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     self._run_and_report_benchmark()
 
   def benchmark_xla_1_gpu_mlperf_like(self):
-    """1-GPU MLPerf like test with compile/fit version w/xla."""
+    """1 GPU using compile/fit with XLA."""
     self._setup()
     FLAGS.train_epochs = 7
     FLAGS.enable_xla = True
     self._run_and_report_benchmark()
 
-  def benchmark_8_gpu_ctl_mlperf_like(self):
-    """8 GPU test meant to compare Google implementation.
-
-    MLPerf 0.5 top line submission using the
-    - hyper-parameters from the winning MLPerf 0.5 submission.
-    - Using similar rules as MLPerf 0.5
-    - Fixed epochs to MLPerf submission's convergence on 17 epochs
-    """
+  def benchmark_1_gpu_ctl_mlperf_like(self):
+    """1 GPU using CTL."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.train_epochs = 7
+    self._run_and_report_benchmark()
+
+  def benchmark_xla_1_gpu_ctl_mlperf_like(self):
+    """1 GPU using CTL with XLA."""
     self._setup()
     FLAGS.keras_use_ctl = True
+    FLAGS.enable_xla = True
+    FLAGS.train_epochs = 7
+    self._run_and_report_benchmark()
+
+  def benchmark_8_gpu_mlperf_like(self):
+    """8 GPU using keras fit/compile."""
+    self._setup()
     FLAGS.num_gpus = 8
     FLAGS.train_epochs = 17
     FLAGS.batch_size = 1048576
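enable_xla turns on XLA JIT compilation for the whole program. Outside this harness, the equivalent TF 2.x switch is roughly (a sketch; the flag's real handling lives in the repo's shared utils):

import tensorflow as tf

# Ask TensorFlow to JIT-compile eligible graphs with XLA.
tf.config.optimizer.set_jit(True)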
@@ -250,14 +233,23 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.epsilon = 1e-8
     self._run_and_report_benchmark()
 
-  def benchmark_8_gpu_mlperf_like(self):
-    """8 GPU test meant to compare Google implementation
-    with MLPerf top line submission using the
-    hyper-parameters from the winning MLPerf 0.5 submission.
-    Using similar rules as MLPerf 0.5
-    Fixed epochs to MLPerf submission's convergence on 17 epochs
-    """
+  def benchmark_xla_8_gpu_mlperf_like(self):
+    """8 GPU using keras fit/compile with XLA."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.enable_xla = True
+    FLAGS.train_epochs = 17
+    FLAGS.batch_size = 1048576
+    FLAGS.learning_rate = 0.0045
+    FLAGS.beta1 = 0.25
+    FLAGS.beta2 = 0.5
+    FLAGS.epsilon = 1e-8
+    self._run_and_report_benchmark()
+
+  def benchmark_8_gpu_ctl_mlperf_like(self):
+    """8 GPU using CTL."""
     self._setup()
+    FLAGS.keras_use_ctl = True
     FLAGS.num_gpus = 8
     FLAGS.train_epochs = 17
     FLAGS.batch_size = 1048576
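The learning_rate/beta1/beta2/epsilon flags configure the Adam optimizer with NVIDIA's MLPerf 0.5 NCF hyperparameters. Expressed directly against the Keras API (illustrative; the repo builds its optimizer from FLAGS):

import tensorflow as tf

# NVIDIA's MLPerf 0.5 NCF settings, matching the flag values above.
optimizer = tf.keras.optimizers.Adam(
    learning_rate=0.0045,
    beta_1=0.25,
    beta_2=0.5,
    epsilon=1e-8)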
@@ -267,6 +259,21 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.epsilon = 1e-8
     self._run_and_report_benchmark()
 
+  def benchmark_xla_8_gpu_ctl_mlperf_like(self):
+    """8 GPU using CTL with XLA."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.enable_xla = True
+    FLAGS.num_gpus = 8
+    FLAGS.train_epochs = 17
+    FLAGS.batch_size = 1048576
+    FLAGS.learning_rate = 0.0045
+    FLAGS.beta1 = 0.25
+    FLAGS.beta2 = 0.5
+    FLAGS.epsilon = 1e-8
+    self._run_and_report_benchmark()
+
 
 class NCFKerasSynth(NCFKerasBenchmarkBase):
   """Benchmark NCF model using synthetic data."""