Commit 85addcf3 authored by A. Unique TensorFlower

Merge pull request #7436 from nnigania:ncf_f16

PiperOrigin-RevId: 265165355
parents ee016fb0 5b0ef1fc
@@ -154,8 +154,10 @@ def define_ncf_flags():
       intra_op=False,
       synthetic_data=True,
       max_train_steps=False,
-      dtype=False,
+      dtype=True,
       all_reduce_alg=False,
+      loss_scale=True,
+      dynamic_loss_scale=True,
       enable_xla=True,
       force_v2_in_keras_compile=True
   )
...
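Enabling the `loss_scale` and `dynamic_loss_scale` switches here surfaces the `--loss_scale` flag that the training code later reads back through `flags_core.get_loss_scale` (see the run_ncf hunk below). As a rough sketch of the semantics that call relies on, not the actual implementation in `official/utils/flags/core.py`:

# Illustrative sketch only; the real helper lives in official/utils/flags.
def get_loss_scale(flags_obj, default_for_fp16):
  """Return the explicit --loss_scale if set, else a dtype-based default."""
  if flags_obj.loss_scale is not None:
    return flags_obj.loss_scale   # e.g. 8192, as the benchmarks below set
  # fp16 needs scaling to keep small gradients representable; fp32 does not.
  return default_for_fp16 if flags_obj.dtype == "fp16" else 1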
@@ -263,6 +263,15 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.train_epochs = 7
     self._run_and_report_benchmark_mlperf_like()
 
+  def benchmark_1_gpu_ctl_fp16_mlperf_like(self):
+    """1 GPU using CTL."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.train_epochs = 7
+    FLAGS.dtype = 'fp16'
+    FLAGS.loss_scale = 8192
+    self._run_and_report_benchmark_mlperf_like()
+
   def benchmark_1_gpu_ctl_run_eagerly_mlperf_like(self):
     """1 GPU using CTL with eager and distribution strategy."""
     self._setup()
@@ -279,6 +288,16 @@ class NCFKerasAccuracy(NCFKerasBenchmarkBase):
     FLAGS.train_epochs = 7
     self._run_and_report_benchmark_mlperf_like()
 
+  def benchmark_xla_1_gpu_ctl_fp16_mlperf_like(self):
+    """1 GPU using CTL with XLA."""
+    self._setup()
+    FLAGS.keras_use_ctl = True
+    FLAGS.enable_xla = True
+    FLAGS.train_epochs = 7
+    FLAGS.dtype = 'fp16'
+    FLAGS.loss_scale = 8192
+    self._run_and_report_benchmark_mlperf_like()
+
   def benchmark_8_gpu_mlperf_like(self):
     """8 GPU using keras fit/compile."""
     self._setup()
...
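Both new benchmarks pin a fixed loss scale of 8192 rather than relying on the dynamic default. The point of loss scaling is to keep small fp16 gradients from underflowing to zero; a quick standalone illustration of that effect (NumPy, illustrative values only):

import numpy as np

# fp16 cannot represent magnitudes much below ~6e-8, so tiny gradients vanish.
tiny_grad = np.float16(1e-8)             # rounds to 0.0 in fp16
# Scaling the loss by 8192 scales every gradient by 8192 as well,
# moving it back into fp16's representable range.
scaled = np.float16(1e-8 * 8192)         # ~8.19e-05, survives the cast
recovered = np.float32(scaled) / 8192.0  # unscale in fp32 before applying
print(tiny_grad, scaled, recovered)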
@@ -42,6 +42,7 @@ from official.utils.logs import mlperf_helper
 from official.utils.misc import distribution_utils
 from official.utils.misc import keras_utils
 from official.utils.misc import model_helpers
+from official.utils.flags import core as flags_core
 from official.utils.misc import tpu_lib
 
 FLAGS = flags.FLAGS
@@ -267,6 +268,12 @@ def run_ncf(_):
       beta_1=params["beta1"],
       beta_2=params["beta2"],
       epsilon=params["epsilon"])
+  if FLAGS.dtype == "fp16":
+    optimizer = \
+        tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
+            optimizer,
+            loss_scale=flags_core.get_loss_scale(FLAGS,
+                                                 default_for_fp16="dynamic"))
 
   if params["keras_use_ctl"]:
     train_loss, eval_results = run_ncf_custom_training(
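For context, `enable_mixed_precision_graph_rewrite` both turns on the automatic fp16 op rewrite and returns the optimizer wrapped for loss scaling, which is why the custom training loop below can call `get_scaled_loss` and `get_unscaled_gradients` on it. A minimal standalone sketch, assuming a toy Adam optimizer (this is the TF 1.14/2.0-era API the commit uses):

import tensorflow as tf

optimizer = tf.keras.optimizers.Adam()
# Rewrites eligible graph ops to float16 and wraps the Keras optimizer in a
# loss-scaling optimizer; "dynamic" adjusts the scale as training runs.
optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
    optimizer, loss_scale="dynamic")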
@@ -371,8 +378,12 @@ def run_ncf_custom_training(params,
             softmax_logits,
             sample_weight=features[rconst.VALID_POINT_MASK])
         loss *= (1.0 / params["batch_size"])
+        if FLAGS.dtype == "fp16":
+          loss = optimizer.get_scaled_loss(loss)
       grads = tape.gradient(loss, keras_model.trainable_variables)
+      if FLAGS.dtype == "fp16":
+        grads = optimizer.get_unscaled_gradients(grads)
       # Converting gradients to dense form helps in perf on GPU for NCF
       grads = neumf_model.sparse_to_dense_grads(
           list(zip(grads, keras_model.trainable_variables)))
...
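The scale-then-unscale pattern added above is the standard loss-scaling recipe for custom training loops: scale the loss before taking gradients, unscale the gradients before applying them. A self-contained sketch of the same pattern against a toy model (illustrative model and data; uses the 2019-era `tf.keras.mixed_precision.experimental.LossScaleOptimizer`, which exposes the same `get_scaled_loss`/`get_unscaled_gradients` pair as the wrapped optimizer here):

import tensorflow as tf

# Toy stand-ins for keras_model and the NCF loss; illustration only.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
    tf.keras.optimizers.Adam(), loss_scale=8192)  # fixed scale, as in the benchmarks

x = tf.random.normal([16, 4])
y = tf.random.normal([16, 1])

with tf.GradientTape() as tape:
  loss = tf.reduce_mean(tf.square(model(x) - y))
  scaled_loss = optimizer.get_scaled_loss(loss)    # multiply loss by the scale
grads = tape.gradient(scaled_loss, model.trainable_variables)
grads = optimizer.get_unscaled_gradients(grads)    # divide the scale back out
optimizer.apply_gradients(zip(grads, model.trainable_variables))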