"...source/git@developer.sourcefind.cn:Wenxuan/LightX2V.git" did not exist on "689e69b63864dc41c4c42ffe37ff8672b9ef8b85"
Unverified commit dc42c482, authored by Toby Boyd, committed by GitHub

Add examples per second history to Estimator hook. (#6193)

* Add exp_per_second history to the hook.

  - Track exp_per_second in the benchmark tests.

* Remove turning off the distribution strategy.

* Average all results when reporting.
parent b66ef95e
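
In outline, the change has three parts, shown in the diffs below: ExamplesPerSecondHook keeps a history of instantaneous rates, resnet_main returns its training hooks alongside the eval results, and the benchmark averages the history into an exp_per_second extra. A minimal, self-contained sketch of that mechanism (RateHistoryHook and average_rate are hypothetical stand-ins, not code from this commit):

# Sketch only: stand-ins for ExamplesPerSecondHook.current_examples_per_sec_list
# and the averaging done in _run_and_report_benchmark below.
class RateHistoryHook:
  """Accumulates one instantaneous examples/sec reading per logging interval."""

  def __init__(self, batch_size):
    self.batch_size = batch_size
    self.current_examples_per_sec_list = []

  def record(self, elapsed_steps, elapsed_time):
    # Same formula as the real hook: examples = batch_size * elapsed_steps.
    rate = self.batch_size * (elapsed_steps / elapsed_time)
    self.current_examples_per_sec_list.append(rate)


def average_rate(hook):
  readings = hook.current_examples_per_sec_list
  return sum(readings) / len(readings) if readings else 0.0


hook = RateHistoryHook(batch_size=128)
hook.record(elapsed_steps=100, elapsed_time=2.0)  # 6400.0 examples/sec
hook.record(elapsed_steps=100, elapsed_time=2.5)  # 5120.0 examples/sec
print(average_rate(hook))  # 5760.0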
@@ -27,6 +27,7 @@ from absl.testing import flagsaver
 import tensorflow as tf  # pylint: disable=g-bad-import-order
 from official.resnet import cifar10_main as cifar_main
+from official.utils.logs import hooks

 DATA_DIR = '/data/cifar10_data/cifar-10-batches-bin'
@@ -49,6 +50,7 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
     flags.FLAGS.model_dir = self._get_model_dir('resnet56_1_gpu')
     flags.FLAGS.resnet_size = 56
     flags.FLAGS.dtype = 'fp32'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
     self._run_and_report_benchmark()

   def resnet56_fp16_1_gpu(self):
@@ -61,6 +63,7 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
     flags.FLAGS.model_dir = self._get_model_dir('resnet56_fp16_1_gpu')
     flags.FLAGS.resnet_size = 56
     flags.FLAGS.dtype = 'fp16'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
     self._run_and_report_benchmark()

   def resnet56_2_gpu(self):
@@ -73,6 +76,7 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
     flags.FLAGS.model_dir = self._get_model_dir('resnet56_2_gpu')
     flags.FLAGS.resnet_size = 56
     flags.FLAGS.dtype = 'fp32'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
     self._run_and_report_benchmark()

   def resnet56_fp16_2_gpu(self):
@@ -85,6 +89,7 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
     flags.FLAGS.model_dir = self._get_model_dir('resnet56_fp16_2_gpu')
     flags.FLAGS.resnet_size = 56
     flags.FLAGS.dtype = 'fp16'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
     self._run_and_report_benchmark()

   def unit_test(self):
@@ -97,6 +102,7 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
     flags.FLAGS.model_dir = self._get_model_dir('resnet56_1_gpu')
     flags.FLAGS.resnet_size = 8
     flags.FLAGS.dtype = 'fp32'
+    flags.FLAGS.hooks = ['ExamplesPerSecondHook']
     self._run_and_report_benchmark()

   def _run_and_report_benchmark(self):
@@ -104,15 +110,29 @@ class EstimatorCifar10BenchmarkTests(tf.test.Benchmark):
     stats = cifar_main.run_cifar(flags.FLAGS)
     wall_time_sec = time.time() - start_time_sec
+
+    examples_per_sec_hook = None
+    for hook in stats['train_hooks']:
+      if isinstance(hook, hooks.ExamplesPerSecondHook):
+        examples_per_sec_hook = hook
+        break
+
+    eval_results = stats['eval_results']
+    extras = {}
+    extras['accuracy_top_1'] = self._json_description(
+        eval_results['accuracy'].item(),
+        priority=0)
+    extras['accuracy_top_5'] = self._json_description(
+        eval_results['accuracy_top_5'].item())
+
+    if examples_per_sec_hook:
+      exp_per_second_list = examples_per_sec_hook.current_examples_per_sec_list
+      # ExamplesPerSecondHook skips the first 10 steps.
+      exp_per_sec = sum(exp_per_second_list) / len(exp_per_second_list)
+      extras['exp_per_second'] = self._json_description(exp_per_sec)

     self.report_benchmark(
-        iters=stats['global_step'],
+        iters=eval_results['global_step'],
         wall_time=wall_time_sec,
-        extras={
-            'accuracy_top_1':
-                self._json_description(stats['accuracy'].item(), priority=0),
-            'accuracy_top_5':
-                self._json_description(stats['accuracy_top_5'].item()),
-        })
+        extras=extras)

   def _json_description(self,
                         value,
...
@@ -599,7 +599,13 @@ def resnet_main(
         shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
     classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                  strip_default_attrs=True)
-  return eval_results
+
+  stats = {}
+  stats['eval_results'] = eval_results
+  stats['train_hooks'] = train_hooks
+
+  return stats

 def define_resnet_flags(resnet_size_choices=None):
   """Add flags and validators for ResNet."""
...
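
Note the return-contract change above: resnet_main (and therefore run_cifar, which wraps it) now returns a dict instead of the bare eval_results. A caller written against the old contract would be updated roughly as follows (sketch, assuming only the keys shown in the diff):

from absl import flags
from official.resnet import cifar10_main as cifar_main

stats = cifar_main.run_cifar(flags.FLAGS)
eval_results = stats['eval_results']  # formerly the entire return value
train_hooks = stats['train_hooks']    # newly exposed for inspection
print(eval_results['accuracy'].item())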
@@ -73,6 +73,8 @@ class ExamplesPerSecondHook(tf.estimator.SessionRunHook):
     self._total_steps = 0
     self._batch_size = batch_size
     self._warm_steps = warm_steps
+    # List of examples per second logged every_n_steps.
+    self.current_examples_per_sec_list = []

   def begin(self):
     """Called once before using the session to check global step."""
@@ -117,7 +119,8 @@ class ExamplesPerSecondHook(tf.estimator.SessionRunHook):
       # and training time per batch
       current_examples_per_sec = self._batch_size * (
           elapsed_steps / elapsed_time)
+      # Log entries to be read from the hook during or after the run.
+      self.current_examples_per_sec_list.append(current_examples_per_sec)
       self._logger.log_metric(
           "average_examples_per_sec", average_examples_per_sec,
           global_step=global_step)
...
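
With the two hook changes above, the per-interval history can be read back after training; each entry corresponds to one every_n_steps logging interval, and readings during the warm-up steps are skipped, so the benchmark's average weights the remaining intervals equally. A hypothetical read-back, assuming the constructor arguments suggested by the diff (batch_size, every_n_steps, warm_steps):

from official.utils.logs import hooks

# Assumed constructor signature; see official/utils/logs/hooks.py.
hook = hooks.ExamplesPerSecondHook(batch_size=128, every_n_steps=100)
# ... pass `hook` among the train hooks of a tf.estimator Estimator.train() call ...
readings = hook.current_examples_per_sec_list
if readings:
  print('mean examples/sec over %d intervals: %.1f'
        % (len(readings), sum(readings) / len(readings)))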