Commit 62184a96 authored by Toby Boyd, committed by nnigania

[NCF] Add run_eagerly for ctl. (#7229)

* Add run_eagerly for ctl.

* fix test name and do not set "default".
parent 58340818
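
The diff below follows a simple pattern: instead of decorating the custom training loop (CTL) step functions with @tf.function unconditionally, they stay plain Python functions and are wrapped only when the new run_eagerly flag is off. A minimal, self-contained sketch of that pattern (illustrative names only, not the NCF code itself):

import tensorflow as tf

run_eagerly = False  # stand-in for the FLAGS.run_eagerly flag used below

def train_step(x):
  # A plain Python step function; left unwrapped it executes eagerly.
  return x * 2

if not run_eagerly:
  # Graph-compile the step only when eager execution is not requested;
  # skipping tf.function keeps the step debuggable line by line.
  train_step = tf.function(train_step)

print(train_step(tf.constant(3)))  # tf.Tensor(6, shape=(), dtype=int32)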
@@ -181,6 +181,13 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
    FLAGS.early_stopping = True
    self._run_and_report_benchmark()

+  def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.early_stopping = True
+    FLAGS.run_eagerly = True
+    self._run_and_report_benchmark()
+
  def benchmark_xla_1_gpu_ctl_early_stop(self):
    self._setup()
    FLAGS.keras_use_ctl = True
@@ -203,7 +210,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
    self._run_and_report_benchmark()

  #############################################
-  # Tests below with mlperf in the test name are of two types
+  # Tests below with mlperf in the test name are of two types:
  # 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
  # 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
  #
@@ -254,6 +261,14 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
    FLAGS.train_epochs = 7
    self._run_and_report_benchmark_mlperf_like()

+  def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
+    """1 GPU using CTL with eager and distribution strategy."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.run_eagerly = True
+    FLAGS.train_epochs = 7
+    self._run_and_report_benchmark()
+
  def benchmark_xla_1_gpu_ctl_mlperf_like(self):
    """1 GPU using CTL with XLA."""
    self._setup()
...
@@ -285,7 +285,6 @@ def run_ncf(_):
    train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
    eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)

-    @tf.function
    def train_step():
      """Called once per step to train the model."""
      def step_fn(features):
@@ -310,7 +309,6 @@ def run_ncf(_):
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
      return mean_loss

-    @tf.function
    def eval_step():
      """Called once per eval step to compute eval metrics."""
      def step_fn(features):
@@ -330,6 +328,10 @@ def run_ncf(_):
          tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
      return hr_sum, hr_count

+    if not FLAGS.run_eagerly:
+      train_step = tf.function(train_step)
+      eval_step = tf.function(eval_step)
+
    time_callback.on_train_begin()
    for epoch in range(FLAGS.train_epochs):
      for cb in callbacks:
...
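
For context, a hedged sketch of how a flag like this is typically wired up with absl and consumed by a custom training loop; the flag definition and script skeleton here are assumptions for illustration, not the actual NCF flag plumbing:

from absl import app, flags
import tensorflow as tf

flags.DEFINE_bool("run_eagerly", False,
                  "Run the custom training loop eagerly for easier debugging.")
FLAGS = flags.FLAGS


def main(_):
  def train_step():
    return tf.constant(1.0)

  def eval_step():
    return tf.constant(0.0)

  # Same conditional wrapping as in the diff above: graph-compile the steps
  # unless eager execution was requested on the command line.
  if not FLAGS.run_eagerly:
    train_step = tf.function(train_step)
    eval_step = tf.function(eval_step)

  print(train_step().numpy(), eval_step().numpy())


if __name__ == "__main__":
  app.run(main)

Invoked as, e.g., python my_ctl_script.py --run_eagerly (script name illustrative), this keeps the steps in eager mode, mirroring what the new benchmark_*_run_eagerly_* methods do by setting FLAGS.run_eagerly = True before calling _run_and_report_benchmark().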