"...resnet50_tensorflow.git" did not exist on "ea70bc22178169f42f8163d5542e770526c75c63"
Commit 85addcf3 authored by A. Unique TensorFlower

Merge pull request #7436 from nnigania:ncf_f16

PiperOrigin-RevId: 265165355
parents ee016fb0 5b0ef1fc
@@ -154,8 +154,10 @@ def define_ncf_flags():
       intra_op=False,
       synthetic_data=True,
       max_train_steps=False,
-      dtype=False,
+      dtype=True,
       all_reduce_alg=False,
+      loss_scale=True,
+      dynamic_loss_scale=True,
       enable_xla=True,
       force_v2_in_keras_compile=True
   )
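For context only (not part of the diff): turning on dtype, loss_scale and dynamic_loss_scale in define_performance exposes the corresponding command-line flags, and their resolution is delegated to flags_core.get_loss_scale later in this commit. The sketch below is a hypothetical, standalone approximation of that behaviour using plain absl flags; the names and defaults here are assumptions for illustration, not the model-garden definitions.

from absl import flags

# Hypothetical re-creation of the flag behaviour enabled above. The real
# definitions live in official/utils/flags; this only mirrors the intent:
# fp16 training falls back to dynamic loss scaling unless a fixed scale
# (e.g. 8192) is requested explicitly.
flags.DEFINE_string("dtype", "fp32", "Data type for training: fp16 or fp32.")
flags.DEFINE_integer("loss_scale", None, "Fixed loss scale; None uses the default.")
flags.DEFINE_bool("dynamic_loss_scale", True,
                  "Use dynamic loss scaling when no fixed scale is given.")

def get_loss_scale(flag_values, default_for_fp16="dynamic"):
  """Approximates the flags_core.get_loss_scale usage seen in this commit."""
  if flag_values.loss_scale is not None:
    return flag_values.loss_scale
  return default_for_fp16 if flag_values.dtype == "fp16" else 1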
@@ -263,6 +263,15 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.train_epochs = 7
     self._run_and_report_benchmark_mlperf_like()

+  def benchmark_1_gpu_ctl_fp16_mlperf_like(self):
+    """1 GPU using CTL."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.train_epochs = 7
+    FLAGS.dtype = 'fp16'
+    FLAGS.loss_scale = 8192
+    self._run_and_report_benchmark_mlperf_like()
+
   def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
     """1 GPU using CTL with eager and distribution strategy."""
     self._setup()
@@ -279,6 +288,16 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.train_epochs = 7
     self._run_and_report_benchmark_mlperf_like()

+  def benchmark_xla_1_gpu_ctl_fp16_mlperf_like(self):
+    """1 GPU using CTL with XLA."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.enable_xla = True
+    FLAGS.train_epochs = 7
+    FLAGS.dtype = 'fp16'
+    FLAGS.loss_scale = 8192
+    self._run_and_report_benchmark_mlperf_like()
+
   def benchmark_8_gpu_mlperf_like(self):
     """8 GPU using keras fit/compile."""
     self._setup()
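The new fp16 benchmarks pin FLAGS.loss_scale to 8192. As a quick, self-contained illustration (my own sketch, not repository code) of why a large power-of-two scale is needed with float16: small gradients underflow to zero in fp16, while scaling the loss (and hence the gradients) keeps them representable, and dividing by the same factor before the optimizer update recovers the original magnitudes exactly, since the scale is a power of two.

import numpy as np

grad = np.float32(1e-8)            # a tiny gradient value in fp32
print(np.float16(grad))            # 0.0 -- underflows in fp16
scale = np.float32(8192.0)         # the static loss scale used above
scaled = np.float16(grad * scale)  # ~8.2e-05 -- representable in fp16
print(np.float32(scaled) / scale)  # ~1e-8 -- recovered after unscaling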
@@ -42,6 +42,7 @@ from official.utils.logs import mlperf_helper
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
 from official.utils.misc import model_helpers
+from official.utils.flags import core as flags_core
 from official.utils.misc import tpu_lib

 FLAGS = flags.FLAGS
@@ -267,6 +268,12 @@ def run_ncf(_):
         beta_1=params["beta1"],
         beta_2=params["beta2"],
         epsilon=params["epsilon"])
+    if FLAGS.dtype == "fp16":
+      optimizer = \
+          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
+              optimizer,
+              loss_scale=flags_core.get_loss_scale(FLAGS,
+                                                   default_for_fp16="dynamic"))

   if params["keras_use_ctl"]:
     train_loss, eval_results = run_ncf_custom_training(
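The same graph-rewrite call works outside the model garden as well. The standalone sketch below (assuming a TF 1.14+/early TF 2.x runtime where the compat v1 API is available; the toy model and learning rate are illustrative) mirrors what the branch above does for the non-CTL keras compile/fit path: the rewrite casts eligible ops to float16 and wraps the optimizer so loss scaling is applied automatically.

import tensorflow as tf

# Sketch only: wrap an optimizer with the mixed-precision graph rewrite,
# using dynamic loss scaling as in the default_for_fp16="dynamic" case above.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
    optimizer, loss_scale="dynamic")

model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(optimizer=optimizer, loss="mse")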
@@ -371,8 +378,12 @@ def run_ncf_custom_training(params,
             softmax_logits,
             sample_weight=features[rconst.VALID_POINT_MASK])
         loss *= (1.0 / params["batch_size"])
+        if FLAGS.dtype == "fp16":
+          loss = optimizer.get_scaled_loss(loss)

       grads = tape.gradient(loss, keras_model.trainable_variables)
+      if FLAGS.dtype == "fp16":
+        grads = optimizer.get_unscaled_gradients(grads)
       # Converting gradients to dense form helps in perf on GPU for NCF
       grads = neumf_model.sparse_to_dense_grads(
           list(zip(grads, keras_model.trainable_variables)))
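In the custom-training-loop path, get_scaled_loss and get_unscaled_gradients are methods of the loss-scale optimizer wrapper. The self-contained sketch below (assuming the TF 2.0/2.1-era tf.keras.mixed_precision.experimental API, with a toy model and loss of my own choosing) shows the same scale-then-unscale pattern in isolation: scale the loss before differentiating, unscale the gradients before applying them.

import tensorflow as tf

# Sketch of the manual loss-scaling pattern added to the training loop above;
# a fixed scale of 8192 matches the fp16 benchmarks in this commit.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
    tf.keras.optimizers.Adam(), loss_scale=8192)

@tf.function
def train_step(x, y):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x) - y))
    scaled_loss = optimizer.get_scaled_loss(loss)          # loss * 8192
  scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
  grads = optimizer.get_unscaled_gradients(scaled_grads)   # grads / 8192
  optimizer.apply_gradients(zip(grads, model.trainable_variables))
  return loss

# Example step: train_step(tf.random.normal([8, 4]), tf.random.normal([8, 1]))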