Unverified Commit 4298c3a3 authored by Reed, committed by GitHub

Split --ml_perf into two flags. (#5615)

--ml_perf now only changes the model to make it MLPerf-compliant; --output_ml_perf_compliance_logging adds the MLPerf compliance logging.
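For illustration, a minimal sketch (not part of this commit) of how the two flags can now be toggled independently, following the flagsaver pattern used in ncf_test.py below. It assumes define_ncf_flags() has already been called and the absl flags have been parsed, as in the test's setUpClass():

```python
# Hedged sketch only; it mirrors the test pattern in ncf_test.py, not a new API.
from absl.testing import flagsaver
from official.recommendation import ncf_main

# MLPerf-compliant model without the compliance logs:
with flagsaver.flagsaver(ml_perf=True,
                         output_ml_perf_compliance_logging=False):
  ncf_main.main(None)

# MLPerf-compliant model plus the compliance logs (per the flag's help text,
# this may prompt for sudo because clearing system caches needs root):
with flagsaver.flagsaver(ml_perf=True,
                         output_ml_perf_compliance_logging=True):
  ncf_main.main(None)
```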
parent 2644707c
@@ -555,7 +555,8 @@ def main(_):
if flags.FLAGS.seed is not None:
np.random.seed(flags.FLAGS.seed)
with mlperf_helper.LOGGER(enable=flags.FLAGS.ml_perf):
with mlperf_helper.LOGGER(
enable=flags.FLAGS.output_ml_perf_compliance_logging):
mlperf_helper.set_ncf_root(os.path.split(os.path.abspath(__file__))[0])
_generation_loop(
num_workers=flags.FLAGS.num_workers,
@@ -623,6 +624,9 @@ def define_flags():
"specified, a seed will not be set.")
flags.DEFINE_boolean(name="ml_perf", default=None,
help="Match MLPerf. See ncf_main.py for details.")
flags.DEFINE_bool(name="output_ml_perf_compliance_logging", default=None,
help="Output the MLPerf compliance logging. See "
"ncf_main.py for details.")
flags.mark_flags_as_required(["data_dir", "cache_id"])
@@ -461,6 +461,7 @@ def instantiate_pipeline(dataset, data_dir, batch_size, eval_batch_size,
"redirect_logs": use_subprocess,
"use_tf_logging": not use_subprocess,
"ml_perf": match_mlperf,
"output_ml_perf_compliance_logging": mlperf_helper.LOGGER.enabled,
}
if use_subprocess:
@@ -50,6 +50,9 @@ from official.utils.misc import distribution_utils
from official.utils.misc import model_helpers
FLAGS = flags.FLAGS
def construct_estimator(num_gpus, model_dir, params, batch_size,
eval_batch_size):
"""Construct either an Estimator or TPUEstimator for NCF.
@@ -118,7 +121,8 @@ def construct_estimator(num_gpus, model_dir, params, batch_size,
def main(_):
with logger.benchmark_context(FLAGS), mlperf_helper.LOGGER(FLAGS.ml_perf):
with logger.benchmark_context(FLAGS), \
mlperf_helper.LOGGER(FLAGS.output_ml_perf_compliance_logging):
mlperf_helper.set_ncf_root(os.path.split(os.path.abspath(__file__))[0])
run_ncf(FLAGS)
mlperf_helper.stitch_ncf()
@@ -417,6 +421,18 @@ def define_ncf_flags():
"which performs better due to the fact the sorting algorithms are "
"not stable."))
flags.DEFINE_bool(
name="output_ml_perf_compliance_logging", default=False,
help=flags_core.help_wrap(
"If set, output the MLPerf compliance logging. This is only useful "
"if one is running the model for MLPerf. See "
"https://github.com/mlperf/policies/blob/master/training_rules.adoc"
"#submission-compliance-logs for details. This uses sudo and so may "
"ask for your password, as root access is needed to clear the system "
"caches, which is required for MLPerf compliance."
)
)
flags.DEFINE_integer(
name="seed", default=None, help=flags_core.help_wrap(
"This value will be used to seed both NumPy and TensorFlow."))
@@ -460,5 +476,4 @@ def define_ncf_flags():
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
define_ncf_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
@@ -19,11 +19,14 @@ from __future__ import division
from __future__ import print_function
import math
import mock
import numpy as np
import tensorflow as tf
from absl.testing import flagsaver
from official.recommendation import constants as rconst
from official.recommendation import data_preprocessing
from official.recommendation import neumf_model
from official.recommendation import ncf_main
from official.recommendation import stat_utils
@@ -33,6 +36,12 @@ NUM_TRAIN_NEG = 4
class NcfTest(tf.test.TestCase):
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(NcfTest, cls).setUpClass()
ncf_main.define_ncf_flags()
def setUp(self):
self.top_k_old = rconst.TOP_K
self.num_eval_negatives_old = rconst.NUM_EVAL_NEGATIVES
@@ -224,6 +233,22 @@ class NcfTest(tf.test.TestCase):
self.assertAlmostEqual(ndcg, (1 + 2 * math.log(2) / math.log(3) +
math.log(2) / math.log(4)) / 4)
_BASE_END_TO_END_FLAGS = {
"batch_size": 1024,
"train_epochs": 1,
"use_synthetic_data": True
}
@flagsaver.flagsaver(**_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end(self):
ncf_main.main(None)
@flagsaver.flagsaver(ml_perf=True, **_BASE_END_TO_END_FLAGS)
@mock.patch.object(data_preprocessing, "SYNTHETIC_BATCHES_PER_EPOCH", 100)
def test_end_to_end_mlperf(self):
ncf_main.main(None)
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
@@ -192,7 +192,8 @@ def stitch_ncf():
return
if LOGGER.log_file is None or not tf.gfile.Exists(LOGGER.log_file):
tf.logging.error("Could not find log file to stitch.")
tf.logging.warning("Could not find log file to stitch.")
return
log_lines = []
num_eval_users = None