Unverified Commit 21ec0e1b authored by Qianli Scott Zhu, committed by GitHub

Benchmark update (#4034)

* Update the benchmark logger to have default logging.

1. Create a global instance of the benchmark logger, which logs to
   tf.logging.info by default.
2. Allow the user to configure the logging location.
3. Fix nits in code and comments.

* Fix lint and test errors.

* Address review comments.

* Remove the duplicated print statement.
parent 823da318
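Taken together, the diff below replaces the ad-hoc `BenchmarkLogger` instances with a module-level singleton that is configured once and fetched everywhere else. A minimal sketch of the intended call pattern, based on the changes in this commit (the directory path and metric values are illustrative):

```python
# Sketch of the new global-logger flow (paths/values are placeholders).
from official.utils.logs import logger

# With a directory, run info and metrics are written to files under it;
# with None or "", a BaseBenchmarkLogger is used and output goes to
# tf.logging.info instead.
benchmark_logger = logger.config_benchmark_logger("/tmp/benchmark_logs")
benchmark_logger.log_run_info("resnet")

# Any other module can fetch the same instance without passing it around.
logger.get_benchmark_logger().log_metric(
    "accuracy", 0.999, global_step=10000, extras={"dataset": "imagenet"})
```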
@@ -398,11 +398,8 @@ def resnet_main(flags, model_function, input_function, shape=None):
       'dtype': flags.dtype
   })

-  if flags.benchmark_log_dir is not None:
-    benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
-    benchmark_logger.log_run_info('resnet')
-  else:
-    benchmark_logger = None
+  benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
+  benchmark_logger.log_run_info('resnet')

   for _ in range(flags.train_epochs // flags.epochs_between_evals):
     train_hooks = hooks_helper.get_train_hooks(
@@ -434,10 +431,8 @@ def resnet_main(flags, model_function, input_function, shape=None):
     # global_step count.
     eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                        steps=flags.max_train_steps)
-    print(eval_results)

-    if benchmark_logger:
-      benchmark_logger.log_estimator_evaluation_result(eval_results)
+    benchmark_logger.log_evaluation_result(eval_results)

     if model_helpers.past_stop_threshold(
         flags.stop_threshold, eval_results['accuracy']):
......
@@ -27,6 +27,7 @@ from __future__ import print_function
 import tensorflow as tf  # pylint: disable=g-bad-import-order

 from official.utils.logs import hooks
+from official.utils.logs import logger
 from official.utils.logs import metric_hook

 _TENSORS_TO_LOG = dict((x, x) for x in ['learning_rate',
@@ -140,13 +141,12 @@ def get_logging_metric_hook(benchmark_log_dir=None,
     Returns a ProfilerHook that writes out timelines that can be loaded into
     profiling tools like chrome://tracing.
   """
-  if benchmark_log_dir is None:
-    raise ValueError("metric_log_dir should be provided to use metric logger")
+  logger.config_benchmark_logger(benchmark_log_dir)
   if tensors_to_log is None:
     tensors_to_log = _TENSORS_TO_LOG
   return metric_hook.LoggingMetricHook(
       tensors=tensors_to_log,
-      log_dir=benchmark_log_dir,
+      metric_logger=logger.get_benchmark_logger(),
       every_n_secs=every_n_secs)
......
@@ -27,6 +27,7 @@ import json
 import multiprocessing
 import numbers
 import os
+import threading

 import tensorflow as tf
 from tensorflow.python.client import device_lib
@@ -36,27 +37,48 @@ BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
 _DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"

-class BenchmarkLogger(object):
-  """Class to log the benchmark information to local disk."""
+# Don't use it directly. Use get_benchmark_logger to access a logger.
+_benchmark_logger = None
+_logger_lock = threading.Lock()

-  def __init__(self, logging_dir):
-    self._logging_dir = logging_dir
-    if not tf.gfile.IsDirectory(self._logging_dir):
-      tf.gfile.MakeDirs(self._logging_dir)

-  def log_estimator_evaluation_result(self, eval_results):
-    """Log the evaluation result for a estimator.
+def config_benchmark_logger(logging_dir):
+  """Config the global benchmark logger."""
+  _logger_lock.acquire()
+  try:
+    global _benchmark_logger
+    if logging_dir:
+      _benchmark_logger = BenchmarkFileLogger(logging_dir)
+    else:
+      _benchmark_logger = BaseBenchmarkLogger()
+  finally:
+    _logger_lock.release()
+  return _benchmark_logger
+
+
+def get_benchmark_logger():
+  if not _benchmark_logger:
+    config_benchmark_logger(None)
+  return _benchmark_logger
+
+
+class BaseBenchmarkLogger(object):
+  """Class to log the benchmark information to STDOUT."""
+
+  def log_evaluation_result(self, eval_results):
+    """Log the evaluation result.

-    The evaluate result is a directory that contains metrics defined in
+    The evaluate result is a dictionary that contains metrics defined in
     model_fn. It also contains a entry for global_step which contains the value
     of the global step when evaluation was performed.

     Args:
-      eval_results: dict, the result of evaluate() from a estimator.
+      eval_results: dict, the result of evaluate.
     """
     if not isinstance(eval_results, dict):
-      tf.logging.warning("eval_results should be directory for logging. Got %s",
-                         type(eval_results))
+      tf.logging.warning("eval_results should be dictionary for logging. "
+                         "Got %s", type(eval_results))
       return
     global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
     for key in sorted(eval_results):
@@ -81,10 +103,45 @@ class BenchmarkLogger(object):
       tf.logging.warning(
           "Metric value to log should be a number. Got %s", type(value))
       return
-    if extras:
-      extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
-    else:
-      extras = []
+    extras = _convert_to_json_dict(extras)
+
+    tf.logging.info("Benchmark metric: "
+                    "Name %s, value %d, unit %s, global_step %d, extras %s",
+                    name, value, unit, global_step, extras)
+
+  def log_run_info(self, model_name):
+    tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))
+
+
+class BenchmarkFileLogger(BaseBenchmarkLogger):
+  """Class to log the benchmark information to local disk."""
+
+  def __init__(self, logging_dir):
+    super(BenchmarkFileLogger, self).__init__()
+    self._logging_dir = logging_dir
+    if not tf.gfile.IsDirectory(self._logging_dir):
+      tf.gfile.MakeDirs(self._logging_dir)
+
+  def log_metric(self, name, value, unit=None, global_step=None, extras=None):
+    """Log the benchmark metric information to local file.
+
+    Currently the logging is done in a synchronized way. This should be updated
+    to log asynchronously.
+
+    Args:
+      name: string, the name of the metric to log.
+      value: number, the value of the metric. The value will not be logged if it
+        is not a number type.
+      unit: string, the unit of the metric, E.g "image per second".
+      global_step: int, the global_step when the metric is logged.
+      extras: map of string:string, the extra information about the metric.
+    """
+    if not isinstance(value, numbers.Number):
+      tf.logging.warning(
+          "Metric value to log should be a number. Got %s", type(value))
+      return
+    extras = _convert_to_json_dict(extras)

     with tf.gfile.GFile(
         os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
       metric = {
@@ -110,15 +167,7 @@ class BenchmarkLogger(object):
     Args:
       model_name: string, the name of the model.
     """
-    run_info = {
-        "model_name": model_name,
-        "machine_config": {},
-        "run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
-    _collect_tensorflow_info(run_info)
-    _collect_tensorflow_environment_variables(run_info)
-    _collect_cpu_info(run_info)
-    _collect_gpu_info(run_info)
-    _collect_memory_info(run_info)
+    run_info = _gather_run_info(model_name)

     with tf.gfile.GFile(os.path.join(
         self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
@@ -130,6 +179,20 @@ class BenchmarkLogger(object):
           e)


+def _gather_run_info(model_name):
+  """Collect the benchmark run information for the local environment."""
+  run_info = {
+      "model_name": model_name,
+      "machine_config": {},
+      "run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
+  _collect_tensorflow_info(run_info)
+  _collect_tensorflow_environment_variables(run_info)
+  _collect_cpu_info(run_info)
+  _collect_gpu_info(run_info)
+  _collect_memory_info(run_info)
+  return run_info
+
+
 def _collect_tensorflow_info(run_info):
   run_info["tensorflow_version"] = {
       "version": tf.VERSION, "git_hash": tf.GIT_VERSION}
@@ -194,3 +257,10 @@ def _parse_gpu_model(physical_device_desc):
     if k.strip() == "name":
       return v.strip()
   return None
+
+
+def _convert_to_json_dict(input_dict):
+  if input_dict:
+    return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
+  else:
+    return []
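The `extras` handling that previously lived inline in `log_metric` is factored into `_convert_to_json_dict` and shared by both logger classes. Its behavior is unchanged; a small illustration with made-up extras:

```python
# Mirrors the helper added above: a dict of extra attributes becomes a
# sorted list of name/value records, and a falsy input becomes an empty list.
def _convert_to_json_dict(input_dict):
  if input_dict:
    return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
  else:
    return []

print(_convert_to_json_dict({"dataset": "imagenet", "batch_size": 32}))
# [{'name': 'batch_size', 'value': 32}, {'name': 'dataset', 'value': 'imagenet'}]
print(_convert_to_json_dict(None))  # []
```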
@@ -31,8 +31,50 @@ from official.utils.logs import logger

 class BenchmarkLoggerTest(tf.test.TestCase):

+  def test_get_default_benchmark_logger(self):
+    self.assertIsInstance(logger.get_benchmark_logger(),
+                          logger.BaseBenchmarkLogger)
+
+  def test_config_base_benchmark_logger(self):
+    logger.config_benchmark_logger("")
+    self.assertIsInstance(logger.get_benchmark_logger(),
+                          logger.BaseBenchmarkLogger)
+
+  def test_config_benchmark_file_logger(self):
+    logger.config_benchmark_logger("/tmp/abc")
+    self.assertIsInstance(logger.get_benchmark_logger(),
+                          logger.BenchmarkFileLogger)
+
+
+class BaseBenchmarkLoggerTest(tf.test.TestCase):
+
+  def setUp(self):
+    super(BaseBenchmarkLoggerTest, self).setUp()
+    self._actual_log = tf.logging.info
+    self.logged_message = None
+
+    def mock_log(*args, **kwargs):
+      self.logged_message = args
+      self._actual_log(*args, **kwargs)
+
+    tf.logging.info = mock_log
+
+  def tearDown(self):
+    super(BaseBenchmarkLoggerTest, self).tearDown()
+    tf.logging.info = self._actual_log
+
+  def test_log_metric(self):
+    log = logger.BaseBenchmarkLogger()
+    log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
+
+    expected_log_prefix = "Benchmark metric:"
+    self.assertRegexpMatches(str(self.logged_message), expected_log_prefix)
+
+
+class BenchmarkFileLoggerTest(tf.test.TestCase):
+
   def setUp(self):
-    super(BenchmarkLoggerTest, self).setUp()
+    super(BenchmarkFileLoggerTest, self).setUp()
     # Avoid pulling extra env vars from test environment which affects the test
     # result, eg. Kokoro test has a TF_PKG env which affect the test case
     # test_collect_tensorflow_environment_variables()
@@ -40,7 +82,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):
     os.environ.clear()

   def tearDown(self):
-    super(BenchmarkLoggerTest, self).tearDown()
+    super(BenchmarkFileLoggerTest, self).tearDown()
     tf.gfile.DeleteRecursively(self.get_temp_dir())
     os.environ.clear()
     os.environ.update(self.original_environ)
@@ -49,12 +91,12 @@ class BenchmarkLoggerTest(tf.test.TestCase):
     non_exist_temp_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
     self.assertFalse(tf.gfile.IsDirectory(non_exist_temp_dir))

-    logger.BenchmarkLogger(non_exist_temp_dir)
+    logger.BenchmarkFileLogger(non_exist_temp_dir)
     self.assertTrue(tf.gfile.IsDirectory(non_exist_temp_dir))

   def test_log_metric(self):
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
+    log = logger.BenchmarkFileLogger(log_dir)
     log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})

     metric_log = os.path.join(log_dir, "metric.log")
@@ -69,7 +111,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):

   def test_log_multiple_metrics(self):
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
+    log = logger.BenchmarkFileLogger(log_dir)
     log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
     log.log_metric("loss", 0.02, global_step=1e4)
@@ -90,9 +132,9 @@ class BenchmarkLoggerTest(tf.test.TestCase):
     self.assertEqual(loss["global_step"], 1e4)
     self.assertEqual(loss["extras"], [])

-  def test_log_non_nubmer_value(self):
+  def test_log_non_number_value(self):
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
+    log = logger.BenchmarkFileLogger(log_dir)
     const = tf.constant(1)
     log.log_metric("accuracy", const)
@@ -104,8 +146,8 @@ class BenchmarkLoggerTest(tf.test.TestCase):
         "global_step": 207082,
         "accuracy": 0.9285}
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
-    log.log_estimator_evaluation_result(eval_result)
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_evaluation_result(eval_result)

     metric_log = os.path.join(log_dir, "metric.log")
     self.assertTrue(tf.gfile.Exists(metric_log))
@@ -125,8 +167,8 @@ class BenchmarkLoggerTest(tf.test.TestCase):
   def test_log_evaluation_result_with_invalid_type(self):
     eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
     log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
-    log = logger.BenchmarkLogger(log_dir)
-    log.log_estimator_evaluation_result(eval_result)
+    log = logger.BenchmarkFileLogger(log_dir)
+    log.log_evaluation_result(eval_result)

     metric_log = os.path.join(log_dir, "metric.log")
     self.assertFalse(tf.gfile.Exists(metric_log))
......
@@ -20,8 +20,6 @@ from __future__ import print_function

 import tensorflow as tf  # pylint: disable=g-bad-import-order

-from official.utils.logs import logger
-

 class LoggingMetricHook(tf.train.LoggingTensorHook):
   """Hook to log benchmark metric information.
@@ -35,17 +33,15 @@ class LoggingMetricHook(tf.train.LoggingTensorHook):
   whose evaluation produces a side effect such as consuming additional inputs.
   """

-  def __init__(self, tensors, log_dir=None, metric_logger=None,
+  def __init__(self, tensors, metric_logger=None,
                every_n_iter=None, every_n_secs=None, at_end=False):
     """Initializer for LoggingMetricHook.

     Args:
       tensors: `dict` that maps string-valued tags to tensors/tensor names,
           or `iterable` of tensors/tensor names.
-      log_dir: `string`, directory path that metric hook should write log to.
       metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
-          hook should use to write the log. Exactly one of the `log_dir` and
-          `metric_logger` should be provided.
+          hook should use to write the log.
       every_n_iter: `int`, print the values of `tensors` once every N local
           steps taken on the current worker.
       every_n_secs: `int` or `float`, print the values of `tensors` once every N
@@ -66,14 +62,9 @@ class LoggingMetricHook(tf.train.LoggingTensorHook):
         every_n_secs=every_n_secs,
         at_end=at_end)

-    if (log_dir is None) == (metric_logger is None):
-      raise ValueError(
-          "exactly one of log_dir and metric_logger should be provided.")
-
-    if log_dir is not None:
-      self._logger = logger.BenchmarkLogger(log_dir)
-    else:
-      self._logger = metric_logger
+    if metric_logger is None:
+      raise ValueError("metric_logger should be provided.")
+    self._logger = metric_logger

   def begin(self):
     super(LoggingMetricHook, self).begin()
......
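With `log_dir` removed from `LoggingMetricHook`, callers now pass a logger explicitly, which is what the updated `get_logging_metric_hook` does. A hedged sketch of the resulting wiring (the tensor map and interval are illustrative):

```python
# Sketch of how the hook is wired up after this change: configure the global
# logger once, then hand it to the hook as metric_logger.
from official.utils.logs import logger
from official.utils.logs import metric_hook

logger.config_benchmark_logger("/tmp/benchmark_logs")  # or None for tf.logging output
hook = metric_hook.LoggingMetricHook(
    tensors={"learning_rate": "learning_rate"},  # tag -> tensor name
    metric_logger=logger.get_benchmark_logger(),
    every_n_secs=600)
```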
@@ -64,12 +64,8 @@ class LoggingMetricHookTest(tf.test.TestCase):
           tensors=['t'], every_n_iter=5, every_n_secs=5)
     with self.assertRaisesRegexp(ValueError, 'xactly one of'):
       metric_hook.LoggingMetricHook(tensors=['t'])
-    with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
+    with self.assertRaisesRegexp(ValueError, 'metric_logger'):
       metric_hook.LoggingMetricHook(tensors=['t'], every_n_iter=5)
-    with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
-      metric_hook.LoggingMetricHook(
-          tensors=['t'], every_n_iter=5, log_dir=self._log_dir,
-          metric_logger=self._logger)

   def test_print_at_end_only(self):
     with tf.Graph().as_default(), tf.Session() as sess:
......