Commit 62184a96 authored by Toby Boyd's avatar Toby Boyd Committed by nnigania
Browse files

[NCF] Add run_eagerly for ctl. (#7229)

* Add run_eagerly for ctl.

* fix test name and do not set "default".
parent 58340818
......@@ -181,6 +181,13 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
  """1 GPU benchmark: custom training loop run eagerly, with early stopping."""
  self._setup()
  # Run the custom training loop eagerly (skip tf.function compilation).
  FLAGS.run_eagerly = True
  FLAGS.early_stopping = True
  FLAGS.keras_use_ctl = True
  self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
......@@ -203,7 +210,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
self._run_and_report_benchmark()
#############################################
# Tests below with mlperf in the test name are of two types
# Tests below with mlperf in the test name are of two types:
# 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
# 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
#
......@@ -254,6 +261,14 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
  """1 GPU using CTL with eager execution and distribution strategy.

  MLPerf-0.5-style run (7 train epochs) with the custom training loop
  executing eagerly instead of under tf.function.
  """
  self._setup()
  FLAGS.keras_use_ctl = True
  FLAGS.run_eagerly = True
  FLAGS.train_epochs = 7
  # Report via the mlperf-like path for consistency with the sibling
  # *_mlperf_like benchmarks (e.g. benchmark_1_gpu_ctl_mlperf_like),
  # which all call _run_and_report_benchmark_mlperf_like().
  self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL with XLA."""
self._setup()
......
......@@ -285,7 +285,6 @@ def run_ncf(_):
train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)
@tf.function
def train_step():
"""Called once per step to train the model."""
def step_fn(features):
......@@ -310,7 +309,6 @@ def run_ncf(_):
tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
return mean_loss
@tf.function
def eval_step():
"""Called once per eval step to compute eval metrics."""
def step_fn(features):
......@@ -330,6 +328,10 @@ def run_ncf(_):
tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
return hr_sum, hr_count
if not FLAGS.run_eagerly:
train_step = tf.function(train_step)
eval_step = tf.function(eval_step)
time_callback.on_train_begin()
for epoch in range(FLAGS.train_epochs):
for cb in callbacks:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment