Internal change

PiperOrigin-RevId: 372471631

Internal change
PiperOrigin-RevId: 372471631
a67c28c8 · A. Unique TensorFlower · saberkun · 027e5dc6 · a67c28c8 · a67c28c8
Commit a67c28c8 authored May 06, 2021 by A. Unique TensorFlower Committed by saberkun May 07, 2021
6 changed files
--- a/official/benchmark/base_benchmark.py
+++ b/official/benchmark/base_benchmark.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common benchmark class for model garden models."""
+import os
+import pprint
+# Import libraries
+from absl import logging
+import tensorflow as tf
+from tensorflow.python.platform import benchmark  # pylint: disable=unused-import
+from official.common import registry_imports  # pylint: disable=unused-import
+from official.benchmark import benchmark_lib
+from official.benchmark import benchmark_definitions
+from official.benchmark import config_utils
+from official.core import exp_factory
+from official.modeling import hyperparams
+def _get_benchmark_params(benchmark_models):
+  """Formats benchmark params into a list."""
+  parameterized_benchmark_params = []
+  for _, benchmarks in benchmark_models.items():
+    for name, params in benchmarks.items():
+      for execution_mode in ['performance', 'accuracy']:
+        benchmark_name = '{}.{}'.format(name, execution_mode)
+        benchmark_params = (
+            benchmark_name,  # First arg is used by ParameterizedBenchmark.
+            benchmark_name,
+            params['experiment_type'],
+            execution_mode,
+            params['platform'],
+            params['precision'],
+            params['metric_bounds'],
+            params.get('config_files') or [],
+            params.get('params_override') or None)
+        parameterized_benchmark_params.append(benchmark_params)
+  return parameterized_benchmark_params
+class BaseBenchmark(  # pylint: disable=undefined-variable
+    tf.test.Benchmark, metaclass=benchmark.ParameterizedBenchmark):
+  """Common Benchmark.
+     benchmark.ParameterizedBenchmark is used to auto create benchmarks from
+     benchmark method according to the benchmarks defined in
+     benchmark_definitions. The name of the new benchmark methods is
+     benchmark__{benchmark_name}. _get_benchmark_params is used to generate the
+     benchmark name and args.
+  """
+  # One parameter tuple per (benchmark, execution mode) pair, vision first.
+  _benchmark_parameters = _get_benchmark_params(
+      benchmark_definitions.VISION_BENCHMARKS) + _get_benchmark_params(
+          benchmark_definitions.NLP_BENCHMARKS)
+  def __init__(self,
+               output_dir=None,
+               tpu=None):
+    """Initialize class.
+    The BENCHMARK_OUTPUT_DIR and BENCHMARK_TPU environment variables, when set
+    to a non-empty value, take precedence over the matching argument.
+    Args:
+      output_dir: Base directory to store all output for the test. Falls back
+        to '/tmp' when neither it nor BENCHMARK_OUTPUT_DIR is provided.
+      tpu: (optional) TPU name to use in a TPU benchmark.
+    """
+    self.output_dir = (
+        os.getenv('BENCHMARK_OUTPUT_DIR') or output_dir or '/tmp')
+    self._resolved_tpu = os.getenv('BENCHMARK_TPU') or tpu or None
+  def _get_model_dir(self, folder_name):
+    """Returns directory to store info, e.g. saved model and event log."""
+    return os.path.join(self.output_dir, folder_name)
+  def benchmark(self,
+                benchmark_name,
+                experiment_type,
+                execution_mode,
+                platform,
+                precision,
+                metric_bounds,
+                config_files,
+                params_override):
+    """Runs a single benchmark and reports its results.
+    Args:
+      benchmark_name: Name of the benchmark. Used as the sub-folder of
+        `output_dir` for all artifacts and, up to its first '.', as the
+        reported model name.
+      experiment_type: Experiment name passed to exp_factory.get_exp_config.
+      execution_mode: Mode forwarded to benchmark_lib.run_benchmark. Bounded
+        metrics are only reported when it is 'accuracy'.
+      platform: Target platform in format tpu.[n]x[n] or gpu.[n].
+      precision: Value assigned to params.runtime.mixed_precision_dtype.
+      metric_bounds: List of dicts with 'name', 'min_value' and 'max_value'
+        keys describing the metrics to report in 'accuracy' mode.
+      config_files: Config files applied, in order, on top of the experiment
+        config.
+      params_override: Optional override applied after `config_files`.
+    Raises:
+      NotImplementedError: If `platform` contains neither 'tpu' nor 'gpu'.
+    """
+    params = exp_factory.get_exp_config(experiment_type)
+    for config_file in config_files:
+      file_path = config_utils.get_config_path(config_file)
+      params = hyperparams.override_params_dict(
+          params, file_path, is_strict=True)
+    if params_override:
+      params = hyperparams.override_params_dict(
+          params, params_override, is_strict=True)
+    # platform in format tpu.[n]x[n] or gpu.[n]
+    if 'tpu' in platform:
+      params.runtime.distribution_strategy = 'tpu'
+      params.runtime.tpu = self._resolved_tpu
+    elif 'gpu' in platform:
+      params.runtime.num_gpus = int(platform.split('.')[-1])
+      params.runtime.distribution_strategy = 'mirrored'
+    else:
+      # The exception has to be raised, not merely constructed; otherwise an
+      # unsupported platform would run with an unconfigured runtime.
+      raise NotImplementedError(
+          'platform :{} is not supported'.format(platform))
+    params.runtime.mixed_precision_dtype = precision
+    params.validate()
+    params.lock()
+    # All artifacts of this run live in one folder named after the benchmark.
+    model_dir = self._get_model_dir(benchmark_name)
+    tf.io.gfile.makedirs(model_dir)
+    hyperparams.save_params_dict_to_yaml(
+        params, os.path.join(model_dir, 'params.yaml'))
+    logging.info('Final experiment parameters: %s',
+                 pprint.pformat(params.as_dict()))
+    benchmark_data = benchmark_lib.run_benchmark(
+        execution_mode, params, model_dir)
+    metrics = []
+    if execution_mode == 'accuracy':
+      # Only accuracy runs produce evaluation metrics to compare to bounds.
+      for metric_bound in metric_bounds:
+        metrics.append({
+            'name': metric_bound['name'],
+            'value': benchmark_data['metrics'][metric_bound['name']],
+            'min_value': metric_bound['min_value'],
+            'max_value': metric_bound['max_value']
+        })
+    metrics.append({'name': 'startup_time',
+                    'value': benchmark_data['startup_time']})
+    metrics.append({'name': 'exp_per_second',
+                    'value': benchmark_data['examples_per_second']})
+    self.report_benchmark(
+        iters=-1,
+        wall_time=benchmark_data['wall_time'],
+        metrics=metrics,
+        extras={'model_name': benchmark_name.split('.')[0],
+                'platform': platform,
+                'implementation': 'orbit.ctl',
+                'parameters': precision})
+# When executed as a script, hand control to the TensorFlow test runner.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/benchmark/benchmark_definitions.py
+++ b/official/benchmark/benchmark_definitions.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model garden benchmark definitions."""
+# tf-vision benchmarks
+IMAGE_CLASSIFICATION_BENCHMARKS = {
+    'image_classification.resnet50.tpu.4x4.bf16':
+        dict(
+            experiment_type='resnet_imagenet',
+            platform='tpu.4x4',
+            precision='bfloat16',
+            metric_bounds=[{
+                'name': 'accuracy',
+                'min_value': 0.76,
+                'max_value': 0.77
+            }],
+            config_files=['official/vision/beta/configs/experiments/'
+                          'image_classification/imagenet_resnet50_tpu.yaml']),
+    'image_classification.resnet50.gpu.8.fp16':
+        dict(
+            experiment_type='resnet_imagenet',
+            platform='gpu.8',
+            precision='float16',
+            metric_bounds=[{
+                'name': 'accuracy',
+                'min_value': 0.76,
+                'max_value': 0.77
+            }],
+            config_files=['official/vision/beta/configs/experiments/'
+                          'image_classification/imagenet_resnet50_gpu.yaml'])
+}
+VISION_BENCHMARKS = {
+    'image_classification': IMAGE_CLASSIFICATION_BENCHMARKS,
+}
+NLP_BENCHMARKS = {
+}
--- a/official/benchmark/benchmark_lib.py
+++ b/official/benchmark/benchmark_lib.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TFM common benchmark training driver."""
+import os
+import time
+from typing import Any, Mapping
+from absl import logging
+import orbit
+import tensorflow as tf
+from official.common import distribute_utils
+from official.core import config_definitions
+from official.core import task_factory
+from official.core import train_utils
+from official.modeling import performance
+def run_benchmark(
+    execution_mode: str,
+    params: config_definitions.ExperimentConfig,
+    model_dir: str,
+    distribution_strategy: tf.distribute.Strategy = None
+) -> Mapping[str, Any]:
+  """Runs benchmark for a specific experiment.
+  Args:
+    execution_mode: A 'str', specifying the mode. Can be 'accuracy', or
+      'performance'.
+    params: ExperimentConfig instance.
+    model_dir: A 'str', a path to store model checkpoints and summaries.
+    distribution_strategy: A tf.distribute.Strategy to use. If specified,
+     it will be used instead of inferring the strategy from params.
+  Returns:
+    benchmark_data: returns benchmark data in dict format.
+  """
+  # For GPU runs, allow option to set thread mode
+  if params.runtime.gpu_thread_mode:
+    os.environ['TF_GPU_THREAD_MODE'] = params.runtime.gpu_thread_mode
+    logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE'])
+  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
+  # can have significant impact on model speeds by utilizing float16 in case of
+  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
+  # dtype is float16
+  if params.runtime.mixed_precision_dtype:
+    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
+  strategy = distribution_strategy or distribute_utils.get_distribution_strategy(
+      distribution_strategy=params.runtime.distribution_strategy,
+      all_reduce_alg=params.runtime.all_reduce_alg,
+      num_gpus=params.runtime.num_gpus,
+      tpu_address=params.runtime.tpu)
+  with strategy.scope():
+    task = task_factory.get_task(params.task, logging_dir=model_dir)
+    trainer = train_utils.create_trainer(
+        params,
+        task,
+        train=True,
+        evaluate=(execution_mode == 'accuracy'))
+    # Initialize the model if possible, e.g., from a pre-trained checkpoint.
+    trainer.initialize()
+  steps_per_loop = params.trainer.steps_per_loop if (
+      execution_mode == 'accuracy') else 100
+  controller = orbit.Controller(
+      strategy=strategy,
+      trainer=trainer,
+      evaluator=trainer if (execution_mode == 'accuracy') else None,
+      global_step=trainer.global_step,
+      steps_per_loop=steps_per_loop)
+  logging.info('Starts to execute execution mode: %s', execution_mode)
+  with strategy.scope():
+    # Training for one loop, first loop time includes warmup time.
+    first_loop_start_time = time.time()
+    controller.train(steps=steps_per_loop)
+    first_loop_time = time.time() - first_loop_start_time
+    # Training for second loop.
+    second_loop_start_time = time.time()
+    controller.train(steps=2*steps_per_loop)
+    second_loop_time = time.time() - second_loop_start_time
+    if execution_mode == 'accuracy':
+      controller.train(steps=params.trainer.train_steps)
+      wall_time = time.time() - first_loop_time
+      eval_logs = trainer.evaluate(
+          tf.convert_to_tensor(params.trainer.validation_steps))
+      benchmark_data = {'metrics': eval_logs}
+    elif execution_mode == 'performance':
+      benchmark_data = {}
+    else:
+      raise NotImplementedError(
+          'The benchmark execution mode is not implemented: %s' %
+          execution_mode)
+    # First training loop time contains startup time plus training time, while
+    # second training loop time is purely training time. Startup time can be
+    # recovered by subtracting second trianing loop time from first training
+    # loop time.
+    startup_time = first_loop_time - second_loop_time
+    wall_time = time.time() - first_loop_start_time
+    examples_per_second = steps_per_loop * params.task.train_data.global_batch_size / second_loop_time
+    benchmark_data.update(
+        dict(
+            examples_per_second=examples_per_second,
+            wall_time=wall_time,
+            startup_time=startup_time))
+    return benchmark_data
--- a/official/benchmark/benchmark_lib_test.py
+++ b/official/benchmark/benchmark_lib_test.py
+# Lint as: python3
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow_models.official.benchmark.benchmark_lib."""
+# pylint: disable=g-direct-tensorflow-import
+from absl.testing import parameterized
+import tensorflow as tf
+from tensorflow.python.distribute import combinations
+from tensorflow.python.distribute import strategy_combinations
+from official.common import registry_imports  # pylint: disable=unused-import
+from official.benchmark import benchmark_lib
+from official.core import exp_factory
+from official.modeling import hyperparams
+def all_strategy_combinations():
+  return combinations.combine(
+      distribution=[
+          strategy_combinations.default_strategy,
+          strategy_combinations.cloud_tpu_strategy,
+          strategy_combinations.one_device_strategy_gpu,
+      ],)
+class BenchmarkLibTest(tf.test.TestCase, parameterized.TestCase):
+  def setUp(self):
+    super(BenchmarkLibTest, self).setUp()
+    self._test_config = {
+        'trainer': {
+            'steps_per_loop': 10,
+            'optimizer_config': {
+                'optimizer': {
+                    'type': 'sgd'
+                },
+                'learning_rate': {
+                    'type': 'constant'
+                }
+            },
+            'continuous_eval_timeout': 5,
+            'train_steps': 20,
+            'validation_steps': 10
+        },
+    }
+  @combinations.generate(
+      combinations.combine(
+          distribution=[
+              strategy_combinations.default_strategy,
+              strategy_combinations.cloud_tpu_strategy,
+              strategy_combinations.one_device_strategy_gpu,
+          ],
+          execution_mode=['performance', 'accuracy'],
+      ))
+  def test_benchmark(self, distribution, execution_mode):
+    model_dir = self.get_temp_dir()
+    params = exp_factory.get_exp_config('mock')
+    params = hyperparams.override_params_dict(
+        params, self._test_config, is_strict=True)
+    benchmark_data = benchmark_lib.run_benchmark(execution_mode,
+                                                 params,
+                                                 model_dir,
+                                                 distribution)
+    self.assertIn('examples_per_second', benchmark_data)
+    self.assertIn('wall_time', benchmark_data)
+    self.assertIn('startup_time', benchmark_data)
+    if execution_mode == 'accuracy':
+      self.assertIn('metrics', benchmark_data)
+# When executed as a script, hand control to the TensorFlow test runner.
+if __name__ == '__main__':
+  tf.test.main()
--- a/official/benchmark/config_utils.py
+++ b/official/benchmark/config_utils.py
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks config utils."""
+import os
+def get_config_path(
+    config_file: str,
+    base_dir: str = '') -> str:
+  """Gets the absolute path of the config file."""
+  return os.path.join(base_dir, config_file)
--- a/official/benchmark/resnet_ctl_imagenet_benchmark.py
+++ b/official/benchmark/resnet_ctl_imagenet_benchmark.py
@@ -334,6 +334,18 @@ class Resnet50CtlBenchmarkBase(CtlBenchmark):
    FLAGS.dtype = 'fp16'
    self._run_and_report_benchmark()
+  def benchmark_xla_8_gpu_fp16(self):
+    """Test Keras model with 8 GPUs with tf.keras mixed precision."""
+    self._setup()
+    FLAGS.num_gpus = 8
+    FLAGS.distribution_strategy = 'mirrored'
+    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
+    FLAGS.batch_size = 256 * 8  # 8 GPUs
+    FLAGS.dtype = 'fp16'
+    FLAGS.enable_xla = True
+    self._run_and_report_benchmark()
  def benchmark_8_gpu_eager(self):
    """Test Keras model with 8 GPUs, eager, fp32."""
    self._setup()