"...git@developer.sourcefind.cn:OpenDAS/mmdetection3d.git" did not exist on "c3ef75bfa31870e0e343b57b0c100eabb47d803f"
Unverified commit b9b44f7b, authored by Qianli Scott Zhu and committed by GitHub

Resnet benchmark logging (#3704)

* Update resnet model for benchmark logging.

To enable benchmark logging, just add "--hooks LoggingMetricHook"
(a usage sketch follows these notes).

* Benchmark logger fix for resnet.

1. Update default at_end to False for metric logger to avoid
checkpoint error.
2. Update resnet run to log final evaluation result.

* Update log output for final eval_result.

* Typo fix.

* Unset the default value for benchmark_log_dir.

Usually the benchmark should be logged to a different directory for
each run. Having a default value would hide that choice from the user.

* Bug fix for benchmark logger initialization.

* Fix lint error.

* Address the review comment.

1. Update the logger to cover evaluation result.
2. Move the flag to performance parser.

* Undo the change for arg_parser.
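
For orientation, here is a condensed, hypothetical wrapper showing how the flags and helpers introduced in this commit fit together. The module paths come from the diffs below; the script name, wrapper function, and placeholder eval_results are illustrative only, not part of the change.

    # Hypothetical end-to-end sketch (not the real resnet.py wiring).
    import argparse

    from official.utils.arg_parsers import parsers
    from official.utils.logging import hooks_helper
    from official.utils.logging import logger


    def main(argv=None):
      # Typical invocation once this change is in (script name illustrative):
      #   python resnet_run.py --hooks LoggingMetricHook \
      #       --benchmark_log_dir /tmp/resnet_benchmark
      parser = argparse.ArgumentParser(parents=[
          parsers.BaseParser(),       # supplies --hooks, among others
          parsers.BenchmarkParser(),  # supplies --benchmark_log_dir (no default)
      ])
      flags = parser.parse_args(argv)

      # get_train_hooks forwards keyword arguments to each hook factory;
      # LoggingMetricHook needs benchmark_log_dir and raises ValueError without it.
      train_hooks = hooks_helper.get_train_hooks(
          flags.hooks, benchmark_log_dir=flags.benchmark_log_dir)

      # ... estimator.train(..., hooks=train_hooks) and estimator.evaluate() ...
      eval_results = {"loss": 0.0, "accuracy": 0.0, "global_step": 0}  # placeholder

      if flags.benchmark_log_dir is not None:
        benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
        benchmark_logger.log_estimator_evaluation_result(eval_results)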
parent 8652f38d
@@ -31,6 +31,7 @@ import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.resnet import resnet_model
 from official.utils.arg_parsers import parsers
 from official.utils.logging import hooks_helper
+from official.utils.logging import logger

 ################################################################################
@@ -349,7 +350,9 @@ def resnet_main(flags, model_function, input_function):
   for _ in range(flags.train_epochs // flags.epochs_between_evals):
     train_hooks = hooks_helper.get_train_hooks(
-        flags.hooks, batch_size=flags.batch_size)
+        flags.hooks,
+        batch_size=flags.batch_size,
+        benchmark_log_dir=flags.benchmark_log_dir)

     print('Starting a training cycle.')
@@ -377,6 +380,10 @@ def resnet_main(flags, model_function, input_function):
         steps=flags.max_train_steps)
     print(eval_results)
+
+    if flags.benchmark_log_dir is not None:
+      benchmark_logger = logger.BenchmarkLogger(flags.benchmark_log_dir)
+      benchmark_logger.log_estimator_evaluation_result(eval_results)


 class ResnetArgParser(argparse.ArgumentParser):
   """Arguments for configuring and running a Resnet Model.
@@ -387,6 +394,7 @@ class ResnetArgParser(argparse.ArgumentParser):
         parsers.BaseParser(),
         parsers.PerformanceParser(),
         parsers.ImageModelParser(),
+        parsers.BenchmarkParser(),
     ])

     self.add_argument(
......
@@ -131,7 +131,7 @@ class BaseParser(argparse.ArgumentParser):
             "of train hooks. "
             "Example: --hooks LoggingTensorHook ExamplesPerSecondHook. "
             "Allowed hook names (case-insensitive): LoggingTensorHook, "
-            "ProfilerHook, ExamplesPerSecondHook. "
+            "ProfilerHook, ExamplesPerSecondHook, LoggingMetricHook. "
             "See official.utils.logging.hooks_helper for details.",
        metavar="<HK>"
    )
@@ -224,3 +224,21 @@ class ImageModelParser(argparse.ArgumentParser):
            "was built for CPU or GPU.",
        metavar="<CF>"
    )
+
+
+class BenchmarkParser(argparse.ArgumentParser):
+  """Default parser for benchmark logging.
+
+  Args:
+    add_help: Create the "--help" flag. False if class instance is a parent.
+    benchmark_log_dir: Create a flag to specify location for benchmark logging.
+  """
+
+  def __init__(self, add_help=False, benchmark_log_dir=True):
+    super(BenchmarkParser, self).__init__(add_help=add_help)
+
+    if benchmark_log_dir:
+      self.add_argument(
+          "--benchmark_log_dir", "-bld", default=None,
+          help="[default: %(default)s] The location of the benchmark logging.",
+          metavar="<BLD>"
+      )
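
As a quick sanity check, the new flag can be exercised on its own; the standalone snippet below is illustrative only and assumes nothing beyond the BenchmarkParser class added above.

    from official.utils.arg_parsers import parsers

    # Standalone use of the parser added above (normally it is composed as an
    # argparse parent, as the resnet and test diffs show).
    parser = parsers.BenchmarkParser(add_help=True)

    flags = parser.parse_args(["--benchmark_log_dir", "/tmp/benchmark"])
    print(flags.benchmark_log_dir)                  # /tmp/benchmark

    # With no flag given there is deliberately no default directory.
    print(parser.parse_args([]).benchmark_log_dir)  # None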
@@ -28,7 +28,8 @@ class TestParser(argparse.ArgumentParser):
         parsers.BaseParser(),
         parsers.PerformanceParser(num_parallel_calls=True, inter_op=True,
                                   intra_op=True, use_synthetic_data=True),
-        parsers.ImageModelParser(data_format=True)
+        parsers.ImageModelParser(data_format=True),
+        parsers.BenchmarkParser(benchmark_log_dir=True)
     ])
@@ -58,6 +59,19 @@ class BaseTester(unittest.TestCase):
     for key, value in defaults.items():
       assert namespace_vars[key] == value

+  def test_benchmark_setting(self):
+    defaults = dict(
+        hooks=["LoggingMetricHook"],
+        benchmark_log_dir="/tmp/12345"
+    )
+
+    parser = TestParser()
+    parser.set_defaults(**defaults)
+    namespace_vars = vars(parser.parse_args([]))
+    for key, value in defaults.items():
+      assert namespace_vars[key] == value
+
   def test_booleans(self):
     """Test to ensure boolean flags trigger as expected.
     """
......
@@ -27,6 +27,7 @@ from __future__ import print_function
 import tensorflow as tf  # pylint: disable=g-bad-import-order

 from official.utils.logging import hooks
+from official.utils.logging import metric_hook

 _TENSORS_TO_LOG = dict((x, x) for x in ['learning_rate',
                                         'cross_entropy',
@@ -122,9 +123,37 @@ def get_examples_per_second_hook(every_n_steps=100,
                                       warm_steps=warm_steps)


+def get_logging_metric_hook(benchmark_log_dir=None,
+                            tensors_to_log=None,
+                            every_n_secs=600,
+                            **kwargs):  # pylint: disable=unused-argument
+  """Function to get LoggingMetricHook.
+
+  Args:
+    benchmark_log_dir: `string`, directory path to save the metric log.
+    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
+      names. If not set, log _TENSORS_TO_LOG by default.
+    every_n_secs: `int`, the frequency for logging the metric. Default to every
+      10 mins.
+
+  Returns:
+    Returns a LoggingMetricHook that periodically writes the given tensors to
+    the benchmark log directory.
+  """
+  if benchmark_log_dir is None:
+    raise ValueError("benchmark_log_dir should be provided to use the metric "
+                     "logger")
+  if tensors_to_log is None:
+    tensors_to_log = _TENSORS_TO_LOG
+  return metric_hook.LoggingMetricHook(
+      tensors=tensors_to_log,
+      log_dir=benchmark_log_dir,
+      every_n_secs=every_n_secs)
+
+
 # A dictionary to map one hook name and its corresponding function
 HOOKS = {
     'loggingtensorhook': get_logging_tensor_hook,
     'profilerhook': get_profiler_hook,
     'examplespersecondhook': get_examples_per_second_hook,
+    'loggingmetrichook': get_logging_metric_hook,
 }
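
The HOOKS mapping is what lets a "--hooks" name reach get_logging_metric_hook. Below is a simplified sketch of that dispatch for illustration only; the real get_train_hooks is defined earlier in hooks_helper.py and may differ in details.

    def get_train_hooks_sketch(name_list, **kwargs):
      """Illustrative restatement of the HOOKS-based dispatch (not the real code)."""
      train_hooks = []
      for name in name_list or []:
        hook_name = name.strip().lower()  # hook names are matched case-insensitively
        if hook_name not in HOOKS:
          raise ValueError("Unrecognized training hook requested: {}".format(name))
        # Each factory accepts **kwargs, so extras such as batch_size or
        # benchmark_log_dir are ignored by hooks that do not need them.
        train_hooks.append(HOOKS[hook_name](**kwargs))
      return train_hooks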
@@ -49,16 +49,19 @@ class BaseTest(unittest.TestCase):
                      expected_hook_name)

   def test_get_train_hooks_logging_tensor_hook(self):
-    test_hook_name = 'LoggingTensorHook'
-    self.validate_train_hook_name(test_hook_name, 'loggingtensorhook')
+    self.validate_train_hook_name('LoggingTensorHook', 'loggingtensorhook')

   def test_get_train_hooks_profiler_hook(self):
-    test_hook_name = 'ProfilerHook'
-    self.validate_train_hook_name(test_hook_name, 'profilerhook')
+    self.validate_train_hook_name('ProfilerHook', 'profilerhook')

   def test_get_train_hooks_examples_per_second_hook(self):
-    test_hook_name = 'ExamplesPerSecondHook'
-    self.validate_train_hook_name(test_hook_name, 'examplespersecondhook')
+    self.validate_train_hook_name('ExamplesPerSecondHook',
+                                  'examplespersecondhook')
+
+  def test_get_logging_metric_hook(self):
+    test_hook_name = 'LoggingMetricHook'
+    self.validate_train_hook_name(test_hook_name, 'loggingmetrichook',
+                                  benchmark_log_dir='/tmp')

 if __name__ == '__main__':
   tf.test.main()
@@ -37,6 +37,25 @@ class BenchmarkLogger(object):
     if not tf.gfile.IsDirectory(self._logging_dir):
       tf.gfile.MakeDirs(self._logging_dir)

+  def log_estimator_evaluation_result(self, eval_results):
+    """Log the evaluation result for an estimator.
+
+    The evaluate result is a dictionary that contains metrics defined in
+    model_fn. It also contains an entry for global_step, which holds the value
+    of the global step when evaluation was performed.
+
+    Args:
+      eval_results: dict, the result of evaluate() from an estimator.
+    """
+    if not isinstance(eval_results, dict):
+      tf.logging.warning("eval_results should be a dictionary for logging. "
+                         "Got %s", type(eval_results))
+      return
+    global_step = eval_results[tf.GraphKeys.GLOBAL_STEP]
+    for key in eval_results:
+      if key != tf.GraphKeys.GLOBAL_STEP:
+        self.log_metric(key, eval_results[key], global_step=global_step)
+
   def log_metric(self, name, value, unit=None, global_step=None, extras=None):
     """Log the benchmark metric information to local file.
......
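
The tests in the next file read metric.log back as newline-delimited JSON records with name, value, unit, and global_step fields. A small hypothetical reader, assuming only that layout:

    import json
    import os

    import tensorflow as tf


    def read_metric_log(log_dir):
      # Hypothetical helper: yields one dict per JSON line from
      # <log_dir>/metric.log, matching the layout the tests below assert on.
      metric_log = os.path.join(log_dir, "metric.log")
      with tf.gfile.GFile(metric_log) as f:
        for line in f:
          line = line.strip()
          if line:
            yield json.loads(line)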
@@ -87,5 +87,37 @@ class BenchmarkLoggerTest(tf.test.TestCase):
     metric_log = os.path.join(log_dir, "metric.log")
     self.assertFalse(tf.gfile.Exists(metric_log))

+  def test_log_evaluation_result(self):
+    eval_result = {'loss': 0.46237424,
+                   'global_step': 207082,
+                   'accuracy': 0.9285}
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkLogger(log_dir)
+    log.log_estimator_evaluation_result(eval_result)
+
+    metric_log = os.path.join(log_dir, "metric.log")
+    self.assertTrue(tf.gfile.Exists(metric_log))
+    with tf.gfile.GFile(metric_log) as f:
+      loss = json.loads(f.readline())
+      self.assertEqual(loss["name"], "loss")
+      self.assertEqual(loss["value"], 0.46237424)
+      self.assertEqual(loss["unit"], None)
+      self.assertEqual(loss["global_step"], 207082)
+
+      accuracy = json.loads(f.readline())
+      self.assertEqual(accuracy["name"], "accuracy")
+      self.assertEqual(accuracy["value"], 0.9285)
+      self.assertEqual(accuracy["unit"], None)
+      self.assertEqual(accuracy["global_step"], 207082)
+
+  def test_log_evaluation_result_with_invalid_type(self):
+    eval_result = "{'loss': 0.46237424, 'global_step': 207082}"
+    log_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
+    log = logger.BenchmarkLogger(log_dir)
+    log.log_estimator_evaluation_result(eval_result)
+
+    metric_log = os.path.join(log_dir, "metric.log")
+    self.assertFalse(tf.gfile.Exists(metric_log))
+
 if __name__ == "__main__":
   tf.test.main()