Unverified commit 21ec0e1b authored by Qianli Scott Zhu, committed by GitHub

Benchmark update (#4034)

* Update the benchmark logger to have default logging.

1. Create a global instance of the benchmark logger, which defaults to
logging via tf.logging.info.
2. Allow the user to configure the logging location.
3. Fix nits in code and comments.

* Fix lint and test errors.

* Address review comments.

* Remove the duplicated print statement.
parent 823da318
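A rough usage sketch of the reconfigured logger API described in the message above, pieced together from the hunks below; the flag value and metric numbers are illustrative, not part of this change:

    from official.utils.logs import logger

    # Configure the global logger once. A real directory selects the
    # file-backed BenchmarkFileLogger; an empty/None value falls back to the
    # stdout-only BaseBenchmarkLogger (tf.logging.info).
    benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
    benchmark_logger.log_run_info("resnet")

    # Anywhere else, fetch the same instance instead of passing it around.
    logger.get_benchmark_logger().log_metric(
        "accuracy", 0.92, global_step=1000)  # illustrative values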
@@ -398,11 +398,8 @@ def resnet_main(flags, model_function, input_function, shape=None):
'dtype': flags.dtype
})
if flags.benchmark_log_dir is not None:
benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
benchmark_logger.log_run_info('resnet')
else:
benchmark_logger = None
for _ in range(flags.train_epochs // flags.epochs_between_evals):
train_hooks = hooks_helper.get_train_hooks(
@@ -434,10 +431,8 @@ def resnet_main(flags, model_function, input_function, shape=None):
# global_step count.
eval_results = classifier.evaluate(input_fn=input_fn_eval,
steps=flags.max_train_steps)
print(eval_results)
if benchmark_logger:
benchmark_logger.log_estimator_evaluation_result(eval_results)
benchmark_logger.log_evaluation_result(eval_results)
if model_helpers.past_stop_threshold(
flags.stop_threshold, eval_results['accuracy']):
@@ -27,6 +27,7 @@ from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import hooks
from official.utils.logs import logger
from official.utils.logs import metric_hook
_TENSORS_TO_LOG = dict((x, x) for x in ['learning_rate',
@@ -140,13 +141,12 @@ def get_logging_metric_hook(benchmark_log_dir=None,
Returns a ProfilerHook that writes out timelines that can be loaded into
profiling tools like chrome://tracing.
"""
if benchmark_log_dir is None:
raise ValueError("metric_log_dir should be provided to use metric logger")
logger.config_benchmark_logger(benchmark_log_dir)
if tensors_to_log is None:
tensors_to_log = _TENSORS_TO_LOG
return metric_hook.LoggingMetricHook(
tensors=tensors_to_log,
log_dir=benchmark_log_dir,
metric_logger=logger.get_benchmark_logger(),
every_n_secs=every_n_secs)
@@ -27,6 +27,7 @@ import json
import multiprocessing
import numbers
import os
import threading
import tensorflow as tf
from tensorflow.python.client import device_lib
@@ -36,27 +37,48 @@ BENCHMARK_RUN_LOG_FILE_NAME = "benchmark_run.log"
_DATE_TIME_FORMAT_PATTERN = "%Y-%m-%dT%H:%M:%S.%fZ"
class BenchmarkLogger(object):
"""Class to log the benchmark information to local disk."""
# Don't use it directly. Use get_benchmark_logger to access a logger.
_benchmark_logger = None
_logger_lock = threading.Lock()
def config_benchmark_logger(logging_dir):
"""Config the global benchmark logger"""
_logger_lock.acquire()
try:
global _benchmark_logger
if logging_dir:
_benchmark_logger = BenchmarkFileLogger(logging_dir)
else:
_benchmark_logger = BaseBenchmarkLogger()
finally:
_logger_lock.release()
return _benchmark_logger
def __init__(self, logging_dir):
self._logging_dir = logging_dir
if not tf.gfile.IsDirectory(self._logging_dir):
tf.gfile.MakeDirs(self._logging_dir)
def log_estimator_evaluation_result(self, eval_results):
"""Log the evaluation result for a estimator.
def get_benchmark_logger():
if not _benchmark_logger:
config_benchmark_logger(None)
The evaluate result is a directory that contains metrics defined in
return _benchmark_logger
class BaseBenchmarkLogger(object):
"""Class to log the benchmark information to STDOUT."""
def log_evaluation_result(self, eval_results):
"""Log the evaluation result.
The evaluate result is a dictionary that contains metrics defined in
model_fn. It also contains a entry for global_step which contains the value
of the global step when evaluation was performed.
Args:
eval_results: dict, the result of evaluate() from a estimator.
eval_results: dict, the result of evaluate.
"""
if not isinstance(eval_results, dict):
tf.logging.warning("eval_results should be directory for logging. Got %s",
type(eval_results))
tf.logging.warning("eval_results should be dictionary for logging. "
"Got %s", type(eval_results))
return
global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
for key in sorted(eval_results):
@@ -81,10 +103,45 @@ class BenchmarkLogger(object):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return
if extras:
extras = [{"name": k, "value": v} for k, v in sorted(extras.items())]
else:
extras = []
extras = _convert_to_json_dict(extras)
tf.logging.info("Benchmark metric: "
"Name %s, value %d, unit %s, global_step %d, extras %s",
name, value, unit, global_step, extras)
def log_run_info(self, model_name):
tf.logging.info("Benchmark run: %s", _gather_run_info(model_name))
class BenchmarkFileLogger(BaseBenchmarkLogger):
"""Class to log the benchmark information to local disk."""
def __init__(self, logging_dir):
super(BenchmarkFileLogger, self).__init__()
self._logging_dir = logging_dir
if not tf.gfile.IsDirectory(self._logging_dir):
tf.gfile.MakeDirs(self._logging_dir)
def log_metric(self, name, value, unit=None, global_step=None, extras=None):
"""Log the benchmark metric information to local file.
Currently the logging is done in a synchronized way. This should be updated
to log asynchronously.
Args:
name: string, the name of the metric to log.
value: number, the value of the metric. The value will not be logged if it
is not a number type.
unit: string, the unit of the metric, E.g "image per second".
global_step: int, the global_step when the metric is logged.
extras: map of string:string, the extra information about the metric.
"""
if not isinstance(value, numbers.Number):
tf.logging.warning(
"Metric value to log should be a number. Got %s", type(value))
return
extras = _convert_to_json_dict(extras)
with tf.gfile.GFile(
os.path.join(self._logging_dir, METRIC_LOG_FILE_NAME), "a") as f:
metric = {
@@ -110,15 +167,7 @@ class BenchmarkLogger(object):
Args:
model_name: string, the name of the model.
"""
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)
run_info = _gather_run_info(model_name)
with tf.gfile.GFile(os.path.join(
self._logging_dir, BENCHMARK_RUN_LOG_FILE_NAME), "w") as f:
@@ -130,6 +179,20 @@ class BenchmarkLogger(object):
e)
def _gather_run_info(model_name):
"""Collect the benchmark run information for the local environment."""
run_info = {
"model_name": model_name,
"machine_config": {},
"run_date": datetime.datetime.now().strftime(_DATE_TIME_FORMAT_PATTERN)}
_collect_tensorflow_info(run_info)
_collect_tensorflow_environment_variables(run_info)
_collect_cpu_info(run_info)
_collect_gpu_info(run_info)
_collect_memory_info(run_info)
return run_info
def _collect_tensorflow_info(run_info):
run_info["tensorflow_version"] = {
"version": tf.VERSION, "git_hash": tf.GIT_VERSION}
@@ -194,3 +257,10 @@ def _parse_gpu_model(physical_device_desc):
if k.strip() == "name":
return v.strip()
return None
def _convert_to_json_dict(input_dict):
if input_dict:
return [{"name": k, "value": v} for k, v in sorted(input_dict.items())]
else:
return []
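As a quick illustration of the shared extras handling introduced above, _convert_to_json_dict normalizes the extras map for both logger classes; the dictionary below is hypothetical:

    # Hypothetical extras, normalized the way log_metric now does it.
    extras = {"batch_size": 32, "synthetic_data": "false"}
    converted = [{"name": k, "value": v} for k, v in sorted(extras.items())]
    # -> [{"name": "batch_size", "value": 32},
    #     {"name": "synthetic_data", "value": "false"}]
    # None or {} collapses to [], so the "extras" field written to metric.log
    # is always a JSON list.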
@@ -31,8 +31,50 @@ from official.utils.logs import logger
class BenchmarkLoggerTest(tf.test.TestCase):
def test_get_default_benchmark_logger(self):
self.assertIsInstance(logger.get_benchmark_logger(),
logger.BaseBenchmarkLogger)
def test_config_base_benchmark_logger(self):
logger.config_benchmark_logger("")
self.assertIsInstance(logger.get_benchmark_logger(),
logger.BaseBenchmarkLogger)
def test_config_benchmark_file_logger(self):
logger.config_benchmark_logger("/tmp/abc")
self.assertIsInstance(logger.get_benchmark_logger(),
logger.BenchmarkFileLogger)
class BaseBenchmarkLoggerTest(tf.test.TestCase):
def setUp(self):
super(BaseBenchmarkLoggerTest, self).setUp()
self._actual_log = tf.logging.info
self.logged_message = None
def mock_log(*args, **kwargs):
self.logged_message = args
self._actual_log(*args, **kwargs)
tf.logging.info = mock_log
def tearDown(self):
super(BaseBenchmarkLoggerTest, self).tearDown()
tf.logging.info = self._actual_log
def test_log_metric(self):
log = logger.BaseBenchmarkLogger()
log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
expected_log_prefix = "Benchmark metric:"
self.assertRegexpMatches(str(self.logged_message), expected_log_prefix)
class BenchmarkFileLoggerTest(tf.test.TestCase):
def setUp(self):
super(BenchmarkLoggerTest, self).setUp()
super(BenchmarkFileLoggerTest, self).setUp()
# Avoid pulling extra env vars from test environment which affects the test
# result, eg. Kokoro test has a TF_PKG env which affect the test case
# test_collect_tensorflow_environment_variables()
@@ -40,7 +82,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):
os.environ.clear()
def tearDown(self):
super(BenchmarkLoggerTest, self).tearDown()
super(BenchmarkFileLoggerTest, self).tearDown()
tf.gfile.DeleteRecursively(self.get_temp_dir())
os.environ.clear()
os.environ.update(self.original_environ)
@@ -49,12 +91,12 @@ class BenchmarkLoggerTest(tf.test.TestCase):
non_exist_temp_dir = os.path.join(self.get_temp_dir(), "unknown_dir")
self.assertFalse(tf.gfile.IsDirectory(non_exist_temp_dir))
logger.BenchmarkLogger(non_exist_temp_dir)
logger.BenchmarkFileLogger(non_exist_temp_dir)
self.assertTrue(tf.gfile.IsDirectory(non_exist_temp_dir))
def test_log_metric(self):
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log = logger.BenchmarkFileLogger(log_dir)
log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
metric_log = os.path.join(log_dir, "metric.log")
@@ -69,7 +111,7 @@ class BenchmarkLoggerTest(tf.test.TestCase):
def test_log_multiple_metrics(self):
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log = logger.BenchmarkFileLogger(log_dir)
log.log_metric("accuracy", 0.999, global_step=1e4, extras={"name": "value"})
log.log_metric("loss", 0.02, global_step=1e4)
@@ -90,9 +132,9 @@ class BenchmarkLoggerTest(tf.test.TestCase):
self.assertEqual(loss["global_step"], 1e4)
self.assertEqual(loss["extras"], [])
def test_log_non_nubmer_value(self):
def test_log_non_number_value(self):
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log = logger.BenchmarkFileLogger(log_dir)
const = tf.constant(1)
log.log_metric("accuracy", const)
@@ -104,8 +146,8 @@ class BenchmarkLoggerTest(tf.test.TestCase):
"global_step": 207082,
"accuracy": 0.9285}
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log.log_estimator_evaluation_result(eval_result)
log = logger.BenchmarkFileLogger(log_dir)
log.log_evaluation_result(eval_result)
metric_log = os.path.join(log_dir, "metric.log")
self.assertTrue(tf.gfile.Exists(metric_log))
@@ -125,8 +167,8 @@ class BenchmarkLoggerTest(tf.test.TestCase):
def test_log_evaluation_result_with_invalid_type(self):
eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
log = logger.BenchmarkLogger(log_dir)
log.log_estimator_evaluation_result(eval_result)
log = logger.BenchmarkFileLogger(log_dir)
log.log_evaluation_result(eval_result)
metric_log = os.path.join(log_dir, "metric.log")
self.assertFalse(tf.gfile.Exists(metric_log))
@@ -20,8 +20,6 @@ from __future__ import print_function
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.utils.logs import logger
class LoggingMetricHook(tf.train.LoggingTensorHook):
"""Hook to log benchmark metric information.
@@ -35,17 +33,15 @@ class LoggingMetricHook(tf.train.LoggingTensorHook):
whose evaluation produces a side effect such as consuming additional inputs.
"""
def __init__(self, tensors, log_dir=None, metric_logger=None,
def __init__(self, tensors, metric_logger=None,
every_n_iter=None, every_n_secs=None, at_end=False):
"""Initializer for LoggingMetricHook.
Args:
tensors: `dict` that maps string-valued tags to tensors/tensor names,
or `iterable` of tensors/tensor names.
log_dir: `string`, directory path that metric hook should write log to.
metric_logger: instance of `BenchmarkLogger`, the benchmark logger that
hook should use to write the log. Exactly one of the `log_dir` and
`metric_logger` should be provided.
hook should use to write the log.
every_n_iter: `int`, print the values of `tensors` once every N local
steps taken on the current worker.
every_n_secs: `int` or `float`, print the values of `tensors` once every N
@@ -66,13 +62,8 @@ class LoggingMetricHook(tf.train.LoggingTensorHook):
every_n_secs=every_n_secs,
at_end=at_end)
if (log_dir is None) == (metric_logger is None):
raise ValueError(
"exactly one of log_dir and metric_logger should be provided.")
if log_dir is not None:
self._logger = logger.BenchmarkLogger(log_dir)
else:
if metric_logger is None:
raise ValueError("metric_logger should be provided.")
self._logger = metric_logger
def begin(self):
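A sketch of constructing the hook under the revised signature, where a logger instance replaces the old log_dir argument; the tensor mapping and interval below are placeholders:

    from official.utils.logs import logger, metric_hook

    hook = metric_hook.LoggingMetricHook(
        tensors={"learning_rate": "learning_rate"},  # placeholder tag -> tensor
        metric_logger=logger.get_benchmark_logger(),
        every_n_secs=600)
    # Omitting metric_logger now raises
    # ValueError("metric_logger should be provided.")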
@@ -64,12 +64,8 @@ class LoggingMetricHookTest(tf.test.TestCase):
tensors=['t'], every_n_iter=5, every_n_secs=5)
with self.assertRaisesRegexp(ValueError, 'xactly one of'):
metric_hook.LoggingMetricHook(tensors=['t'])
with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
with self.assertRaisesRegexp(ValueError, 'metric_logger'):
metric_hook.LoggingMetricHook(tensors=['t'], every_n_iter=5)
with self.assertRaisesRegexp(ValueError, 'log_dir and metric_logger'):
metric_hook.LoggingMetricHook(
tensors=['t'], every_n_iter=5, log_dir=self._log_dir,
metric_logger=self._logger)
def test_print_at_end_only(self):
with tf.Graph().as_default(), tf.Session() as sess: