Commit 62184a96 authored by Toby Boyd's avatar Toby Boyd Committed by nnigania
Browse files

[NCF] Add run_eagerly for ctl. (#7229)

* Add run_eagerly for ctl.

* fix test name and do not set "default".
parent 58340818
......@@ -181,6 +181,13 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.early_stopping = True
self._run_and_report_benchmark()
def benchmark_1_gpu_ctl_run_eagerly_early_stop(self):
  """1 GPU benchmark: custom training loop run eagerly, with early stopping."""
  self._setup()
  # Run the custom training loop eagerly (skip tf.function compilation).
  FLAGS.run_eagerly = True
  FLAGS.early_stopping = True
  FLAGS.keras_use_ctl = True
  self._run_and_report_benchmark()
def benchmark_xla_1_gpu_ctl_early_stop(self):
self._setup()
FLAGS.keras_use_ctl = True
......@@ -203,7 +210,7 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
self._run_and_report_benchmark()
#############################################
# Tests below with mlperf in the test name are of two types
# Tests below with mlperf in the test name are of two types:
# 1) 1 GPU tests are based on MLPerf 0.5 and the TensorFlow pulled submission.
# 2) 8 GPU tests are based on MLPerf 0.5 and use NVIDIA's hyper parameters.
#
......@@ -254,6 +261,14 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
FLAGS.train_epochs = 7
self._run_and_report_benchmark_mlperf_like()
def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
  """1 GPU using CTL with eager execution and distribution strategy.

  MLPerf-0.5-style run (7 train epochs) with the custom training loop
  executing eagerly instead of under tf.function.
  """
  self._setup()
  FLAGS.keras_use_ctl = True
  FLAGS.run_eagerly = True
  FLAGS.train_epochs = 7
  # Report via the mlperf-like path for consistency with the sibling
  # *_mlperf_like benchmarks (e.g. benchmark_1_gpu_ctl_mlperf_like),
  # which all call _run_and_report_benchmark_mlperf_like().
  self._run_and_report_benchmark_mlperf_like()
def benchmark_xla_1_gpu_ctl_mlperf_like(self):
"""1 GPU using CTL with XLA."""
self._setup()
......
......@@ -285,7 +285,6 @@ def run_ncf(_):
train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)
@tf.function
def train_step():
"""Called once per step to train the model."""
def step_fn(features):
......@@ -310,7 +309,6 @@ def run_ncf(_):
tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
return mean_loss
@tf.function
def eval_step():
"""Called once per eval step to compute eval metrics."""
def step_fn(features):
......@@ -330,6 +328,10 @@ def run_ncf(_):
tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
return hr_sum, hr_count
if not FLAGS.run_eagerly:
train_step = tf.function(train_step)
eval_step = tf.function(eval_step)
time_callback.on_train_begin()
for epoch in range(FLAGS.train_epochs):
for cb in callbacks:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment