Unverified commit 21ec0e1b authored by Qianli Scott Zhu, committed by GitHub

Benchmark update (#4034)

* Update the benchmark logger to have default logging.

1. Create a global instance of the benchmark logger, which defaults to
logging via tf.logging.info.
2. Allow the user to configure the logging location.
3. Fix nits in code and comments.

* Fix lint and test errors.

* Address review comments.

* Remove the duplicated print statement.
parent 823da318
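A rough usage sketch of the reconfigured logger API described in the message above, pieced together from the hunks below; the flag value and metric numbers are illustrative, not part of this change:

    from official.utils.logs import logger

    # Configure the global logger once. A real directory selects the
    # file-backed BenchmarkFileLogger; an empty/None value falls back to the
    # stdout-only BaseBenchmarkLogger (tf.logging.info).
    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info("resnet")

    # Anywhere else, fetch the same instance instead of passing it around.
    logger.get_benchmark_logger().log_metric(
        "accuracy", 0.92, global_step=1000)  # illustrative values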
@@ -398,11 +398,8 @@ def resnet_main(flags, model_function, input_function, shape=None):
'dtype': flags.dtype
})
if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
benchmark_logger.log_run_info('resnet')
else:
benchmark_logger = None
for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(
@@ -434,10 +431,8 @@ def resnet_main(flags, model_function, input_function, shape=None):
# global_step count.
eval_results = classifier.evaluate(input_fn=input_fn_eval,
steps=flags.max_train_steps)
print(eval_results)
if benchmark_logger:
benchmark_logger.log_estimator_evaluation_result(eval_results)
benchmark_logger.log_evaluation_result(eval_results)
if model_helpers.past_stop_threshold(
flags.stop_threshold, eval_results['accuracy']):
@@ -27,6 +27,7 @@ from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import hooks
from official.utils.logs import logger
from official.utils.logs import metric_hook
_TENSORS_TO_LOG = dict((x, x) for x in ['learning_rate',
@@ -140,13 +141,12 @@ def get_logging_metric_hook(benchmark_log_dir=None,
Returns a ProfilerHook that writes out timelines that can be loaded into
profiling tools like chrome://tracing.
"""
if benchmark_log_dir is None:
raise ValueError("metric_log_dir should be provided to use metric logger")
logger.config_benchmark_logger(benchmark_log_dir)
if tensors_to_log is None:
tensors_to_log = _TENSORS_TO_LOG
return metric_hook.LoggingMetricHook(
tensors=tensors_to_log,
log_dir=benchmark_log_dir,
metric_logger=logger.get_benchmark_logger(),
every_n_secs=every_n_secs)
@@ -27,6 +27,7 @@ import json
import multiprocessing
import numbers
import os
import threading
import tensorflow as tf
from tensorflow.python.client import device_lib
@@ -36,27 +37,48 @@ BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"
class BenchmarkLogger(object):
"""Class to log the benchmark information to local disk."""
# Don't use it directly. Use get_benchmark_logger to access a logger.
_benchmark_logger = None
_logger_lock = threading.Lock()
def config_benchmark_logger(logging_dir):
"""Config the global benchmark logger"""
_logger_lock.acquire()
try:
global _benchmark_logger
if logging_dir:
_benchmark_logger = BenchmarkFileLogger(logging_dir)
else:
_benchmark_logger = BaseBenchmarkLogger()
finally:
_logger_lock.release()
return _benchmark_logger
def __init__(self, logging_dir):
self._logging_dir = logging_dir
if not tf.gfile.IsDirectory(self._logging_dir):
tf.gfile.MakeDirs(self._logging_dir)
def log_estimator_evaluation_result(self, eval_results):
"""Log the evaluation result for a estimator.
def get_benchmark_logger():
if not _benchmark_logger:
config_benchmark_logger(None)
The evaluate result is a directory that contains metrics defined in
return _benchmark_logger
class BaseBenchmarkLogger(object):
"""Class to log the benchmark information to STDOUT."""
def log_evaluation_result(self, eval_results):
"""Log the evaluation result.
The evaluate result is a dictionary that contains metrics defined in
model_fn. It also contains a entry for global_step which contains the value
of the global step when evaluation was performed.
Args:
eval_results: dict, the result of evaluate() from a estimator.
eval_results: dict, the result of evaluate.
"""
if not isinstance(eval_results, dict):
tf.logging.warning("eval_results should be directory for logging. Got %s",
type(eval_results))
tf.logging.warning("eval_results should be dictionary for logging. "
"Got %s", type(eval_results))
return
global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
for key in sorted(eval_results):
@@ -81,10 +103,45 @@ class BenchmarkLogger(object):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return
if extras:
extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
else:
extras = []
extras = _convert_to_json_dict(extras)
tf.logging.info("Benchmark metric: "
"Name %s, value %d, unit %s, global_step %d, extras %s",
name, value, unit, global_step, extras)
def log_run_info(self, model_name):
tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))
class BenchmarkFileLogger(BaseBenchmarkLogger):
"""Class to log the benchmark information to local disk."""
def __init__(self, logging_dir):
super(BenchmarkFileLogger, self).__init__()
self._logging_dir = logging_dir
if not tf.gfile.IsDirectory(self._logging_dir):
tf.gfile.MakeDirs(self._logging_dir)
def log_metric(self, name, value, unit=None, global_step=None, extras=None):
"""Log the benchmark metric information to local file.
Currently the logging is done in a synchronized way. This should be updated
to log asynchronously.
Args:
name: string, the name of the metric to log.
value: number, the value of the metric. The value will not be logged if it
is not a number type.
unit: string, the unit of the metric, E.g "image per second".
global_step: int, the global_step when the metric is logged.
extras: map of string:string, the extra information about the metric.
"""
if not isinstance(value, numbers.Number):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return
extras = _convert_to_json_dict(extras)
with tf.gfile.GFile(
os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
metric = {
@@ -110,15 +167,7 @@ class BenchmarkLogger(object):
Args:
model_name: string, the name of the model.
"""
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)
run_info = _gather_run_info(model_name)
with tf.gfile.GFile(os.path.join(
self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
@@ -130,6 +179,20 @@ class BenchmarkLogger(object):
e)
def _gather_run_info(model_name):
"""Collect the benchmark run information for the local environment."""
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)
return run_info
def _collect_tensorflow_info(run_info):
run_info["tensorflow_version"] = {
"version": tf.VERSION, "git_hash": tf.GIT_VERSION}
@@ -194,3 +257,10 @@ def _parse_gpu_model(physical_device_desc):
if k.strip() == "name":
return v.strip()
return None
def _convert_to_json_dict(input_dict):
if input_dict:
return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
else:
return []
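As a quick illustration of the shared extras handling introduced above, _convert_to_json_dict normalizes the extras map for both logger classes; the dictionary below is hypothetical:

    # Hypothetical extras, normalized the way log_metric now does it.
    extras = {"batch_size": 32, "synthetic_data": "false"}
    converted = [{"name": k, "value": v} for k, v in sorted(extras.items())]
    # -> [{"name": "batch_size", "value": 32},
    #     {"name": "synthetic_data", "value": "false"}]
    # None or {} collapses to [], so the "extras" field written to metric.log
    # is always a JSON list.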
@@ -31,8 +31,50 @@ from official.utils.logs import logger
class BenchmarkLoggerTest(tf.test.TestCase):
def test_get_default_benchmark_logger(self):
self.assertIsInstance(logger.get_benchmark_logger(),
logger.BaseBenchmarkLogger)
def test_config_base_benchmark_logger(self):
logger.config_benchmark_logger("")
self.assertIsInstance(logger.get_benchmark_logger(),
logger.BaseBenchmarkLogger)
def test_config_benchmark_file_logger(self):
logger.config_benchmark_logger("/tmp/abc")
self.assertIsInstance(logger.get_benchmark_logger(),
logger.BenchmarkFileLogger)
class BaseBenchmarkLoggerTest(tf.test.TestCase):
def setUp(self):
super(BaseBenchmarkLoggerTest, self).setUp()
self._actual_log = tf.logging.info
self.logged_message = None
def mock_log(*args, **kwargs):
self.logged_message = args
self._actual_log(*args, **kwargs)
tf.logging.info = mock_log
def tearDown(self):
super(BaseBenchmarkLoggerTest, self).tearDown()
tf.logging.info = self._actual_log
def test_log_metric(self):
log = logger.BaseBenchmarkLogger()
log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
expected_log_prefix = "Benchmark metric:"
self.assertRegexpMatches(str(self.logged_message), expected_log_prefix)
class BenchmarkFileLoggerTest(tf.test.TestCase):
def setUp(self):
super(BenchmarkLoggerTest, self).setUp()
super(BenchmarkFileLoggerTest, self).setUp()
# Avoid pulling extra env vars from test environment which affects the test
# result, eg. Kokoro test has a TF_PKG env which affect the test case
# test_collect_tensorflow_environment_variables()
@@ -40,7 +82,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):
os.environ.clear()
def tearDown(self):
super(BenchmarkLoggerTest, self).tearDown()
super(BenchmarkFileLoggerTest, self).tearDown()
tf.gfile.DeleteRecursively(self.get_temp_dir())
os.environ.clear()
os.environ.update(self.original_environ)
@@ -49,12 +91,12 @@ class BenchmarkLoggerTest(tf.test.TestCase):
non_exist_temp_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
self.assertFalse(tf.gfile.IsDirectory(non_exist_temp_dir))
logger.BenchmarkLogger(non_exist_temp_dir)
logger.BenchmarkFileLogger(non_exist_temp_dir)
self.assertTrue(tf.gfile.IsDirectory(non_exist_temp_dir))
def test_log_metric(self):
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log = logger.BenchmarkFileLogger(log_dir)
log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
metric_log = os.path.join(log_dir, "metric.log")
@@ -69,7 +111,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):
def test_log_multiple_metrics(self):
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log = logger.BenchmarkFileLogger(log_dir)
log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
log.log_metric("loss", 0.02, global_step=1e4)
@@ -90,9 +132,9 @@ class BenchmarkLoggerTest(tf.test.TestCase):
self.assertEqual(loss["global_step"], 1e4)
self.assertEqual(loss["extras"], [])
def test_log_non_nubmer_value(self):
def test_log_non_number_value(self):
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log = logger.BenchmarkFileLogger(log_dir)
const = tf.constant(1)
log.log_metric("accuracy", const)
@@ -104,8 +146,8 @@ class BenchmarkLoggerTest(tf.test.TestCase):
"global_step": 207082,
"accuracy": 0.9285}
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log.log_estimator_evaluation_result(eval_result)
log = logger.BenchmarkFileLogger(log_dir)
log.log_evaluation_result(eval_result)
metric_log = os.path.join(log_dir, "metric.log")
self.assertTrue(tf.gfile.Exists(metric_log))
@@ -125,8 +167,8 @@ class BenchmarkLoggerTest(tf.test.TestCase):
def test_log_evaluation_result_with_invalid_type(self):
eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log.log_estimator_evaluation_result(eval_result)
log = logger.BenchmarkFileLogger(log_dir)
log.log_evaluation_result(eval_result)
metric_log = os.path.join(log_dir, "metric.log")
self.assertFalse(tf.gfile.Exists(metric_log))
@@ -20,8 +20,6 @@ from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import logger
class LoggingMetricHook(tf.train.LoggingTensorHook):
"""Hook to log benchmark metric information.
@@ -35,17 +33,15 @@ class LoggingMetricHook(tf.train.LoggingTensorHook):
whose evaluation produces a side effect such as consuming additional inputs.
"""
def __init__(self, tensors, log_dir=None, metric_logger=None,
def __init__(self, tensors, metric_logger=None,
every_n_iter=None, every_n_secs=None, at_end=False):
"""Initializer for LoggingMetricHook.
Args:
tensors: `dict` that maps string-valued tags to tensors/tensor names,
or `iterable` of tensors/tensor names.
log_dir: `string`, directory path that metric hook should write log to.
metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
hook should use to write the log. Exactly one of the `log_dir` and
`metric_logger` should be provided.
hook should use to write the log.
every_n_iter: `int`, print the values of `tensors` once every N local
steps taken on the current worker.
every_n_secs: `int` or `float`, print the values of `tensors` once every N
@@ -66,13 +62,8 @@ class LoggingMetricHook(tf.train.LoggingTensorHook):
every_n_secs=every_n_secs,
at_end=at_end)
if (log_dir is None) == (metric_logger is None):
raise ValueError(
"exactly one of log_dir and metric_logger should be provided.")
if log_dir is not None:
self._logger = logger.BenchmarkLogger(log_dir)
else:
if metric_logger is None:
raise ValueError("metric_logger should be provided.")
self._logger = metric_logger
def begin(self):
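A sketch of constructing the hook under the revised signature, where a logger instance replaces the old log_dir argument; the tensor mapping and interval below are placeholders:

    from official.utils.logs import logger, metric_hook

    hook = metric_hook.LoggingMetricHook(
        tensors={"learning_rate": "learning_rate"},  # placeholder tag -> tensor
        metric_logger=logger.get_benchmark_logger(),
        every_n_secs=600)
    # Omitting metric_logger now raises
    # ValueError("metric_logger should be provided.")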
@@ -64,12 +64,8 @@ class LoggingMetricHookTest(tf.test.TestCase):
tensors=['t'], every_n_iter=5, every_n_secs=5)
with self.assertRaisesRegexp(ValueError, 'xactly one of'):
metric_hook.LoggingMetricHook(tensors=['t'])
with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
with self.assertRaisesRegexp(ValueError, 'metric_logger'):
metric_hook.LoggingMetricHook(tensors=['t'], every_n_iter=5)
with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
metric_hook.LoggingMetricHook(
tensors=['t'], every_n_iter=5, log_dir=self._log_dir,
metric_logger=self._logger)
def test_print_at_end_only(self):
with tf.Graph().as_default(), tf.Session() as sess: