Commit 9485aa1d authored by qianyj

Update code to v2.8.0

parents 89cfa348 f5fc733a
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Common benchmark class for model garden models."""
import os
import pprint
# Import libraries
from absl import logging
import gin
import tensorflow as tf
from tensorflow.python.platform import benchmark # pylint: disable=unused-import
from official.common import registry_imports # pylint: disable=unused-import
from official.benchmark import benchmark_lib
from official.benchmark import benchmark_definitions
from official.benchmark import config_utils
from official.core import exp_factory
from official.modeling import hyperparams
def _get_benchmark_params(benchmark_models, eval_tflite=False):
"""Formats benchmark params into a list."""
parameterized_benchmark_params = []
for _, benchmarks in benchmark_models.items():
for name, params in benchmarks.items():
if eval_tflite:
execution_modes = ['performance', 'tflite_accuracy']
else:
execution_modes = ['performance', 'accuracy']
for execution_mode in execution_modes:
benchmark_name = '{}.{}'.format(name, execution_mode)
benchmark_params = (
benchmark_name, # First arg is used by ParameterizedBenchmark.
benchmark_name,
params.get('benchmark_function') or benchmark_lib.run_benchmark,
params['experiment_type'],
execution_mode,
params['platform'],
params['precision'],
params['metric_bounds'],
params.get('config_files') or [],
params.get('params_override') or None,
params.get('gin_file') or [])
parameterized_benchmark_params.append(benchmark_params)
return parameterized_benchmark_params
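# Illustrative output (values taken from the ResNet-50 GPU entry in
# benchmark_definitions; shape only, not an exhaustive list): each benchmark
# name expands into one tuple per execution mode, e.g.
#   ('image_classification.resnet50.gpu.8.fp16.accuracy',
#    'image_classification.resnet50.gpu.8.fp16.accuracy',
#    benchmark_lib.run_benchmark, 'resnet_imagenet', 'accuracy',
#    'gpu.8', 'float16',
#    [{'name': 'accuracy', 'min_value': 0.76, 'max_value': 0.77}],
#    ['official/vision/beta/configs/experiments/'
#     'image_classification/imagenet_resnet50_gpu.yaml'], None, [])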
class BaseBenchmark( # pylint: disable=undefined-variable
tf.test.Benchmark, metaclass=benchmark.ParameterizedBenchmark):
"""Common Benchmark.
benchmark.ParameterizedBenchmark is used to auto-create benchmark methods
from the `benchmark` method below, according to the benchmarks defined in
benchmark_definitions. The names of the new benchmark methods follow the
pattern benchmark__{benchmark_name}. _get_benchmark_params is used to
generate the benchmark names and args.
"""
_benchmark_parameters = _get_benchmark_params(
benchmark_definitions.VISION_BENCHMARKS) + _get_benchmark_params(
benchmark_definitions.NLP_BENCHMARKS) + _get_benchmark_params(
benchmark_definitions.QAT_BENCHMARKS, True)
def __init__(self,
output_dir=None,
tpu=None):
"""Initialize class.
Args:
output_dir: Base directory to store all output for the test.
tpu: (optional) TPU name to use in a TPU benchmark.
"""
if os.getenv('BENCHMARK_OUTPUT_DIR'):
self.output_dir = os.getenv('BENCHMARK_OUTPUT_DIR')
elif output_dir:
self.output_dir = output_dir
else:
self.output_dir = '/tmp'
if os.getenv('BENCHMARK_TPU'):
self._resolved_tpu = os.getenv('BENCHMARK_TPU')
elif tpu:
self._resolved_tpu = tpu
else:
self._resolved_tpu = None
def _get_model_dir(self, folder_name):
"""Returns directory to store info, e.g. saved model and event log."""
return os.path.join(self.output_dir, folder_name)
def benchmark(self,
benchmark_name,
benchmark_function,
experiment_type,
execution_mode,
platform,
precision,
metric_bounds,
config_files,
params_override,
gin_file):
with gin.unlock_config():
gin.parse_config_files_and_bindings(
[config_utils.get_config_path(g) for g in gin_file], None)
params = exp_factory.get_exp_config(experiment_type)
for config_file in config_files:
file_path = config_utils.get_config_path(config_file)
params = hyperparams.override_params_dict(
params, file_path, is_strict=True)
if params_override:
params = hyperparams.override_params_dict(
params, params_override, is_strict=True)
# platform in format tpu.[n]x[n] or gpu.[n]
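# For example, 'tpu.4x4' (a 4x4 TPU topology) selects the TPU strategy below,
# while 'gpu.8' runs a mirrored strategy on 8 GPUs.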
if 'tpu' in platform:
params.runtime.distribution_strategy = 'tpu'
params.runtime.tpu = self._resolved_tpu
elif 'gpu' in platform:
params.runtime.num_gpus = int(platform.split('.')[-1])
params.runtime.distribution_strategy = 'mirrored'
else:
raise NotImplementedError('platform: {} is not supported'.format(platform))
params.runtime.mixed_precision_dtype = precision
params.validate()
params.lock()
tf.io.gfile.makedirs(self._get_model_dir(benchmark_name))
hyperparams.save_params_dict_to_yaml(
params,
os.path.join(self._get_model_dir(benchmark_name), 'params.yaml'))
pp = pprint.PrettyPrinter()
logging.info('Final experiment parameters: %s',
pp.pformat(params.as_dict()))
benchmark_data = benchmark_function(
execution_mode, params, self._get_model_dir(benchmark_name))
metrics = []
if execution_mode in ['accuracy', 'tflite_accuracy']:
for metric_bound in metric_bounds:
metric = {
'name': metric_bound['name'],
'value': benchmark_data['metrics'][metric_bound['name']],
'min_value': metric_bound['min_value'],
'max_value': metric_bound['max_value']
}
metrics.append(metric)
metrics.append({'name': 'startup_time',
'value': benchmark_data['startup_time']})
metrics.append({'name': 'exp_per_second',
'value': benchmark_data['examples_per_second']})
self.report_benchmark(
iters=-1,
wall_time=benchmark_data['wall_time'],
metrics=metrics,
extras={'model_name': benchmark_name.split('.')[0],
'platform': platform,
'implementation': 'orbit.ctl',
'parameters': precision})
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Model garden benchmark definitions."""
# tf-vision benchmarks
IMAGE_CLASSIFICATION_BENCHMARKS = {
'image_classification.resnet50.tpu.4x4.bf16':
dict(
experiment_type='resnet_imagenet',
platform='tpu.4x4',
precision='bfloat16',
metric_bounds=[{
'name': 'accuracy',
'min_value': 0.76,
'max_value': 0.77
}],
config_files=['official/vision/beta/configs/experiments/'
'image_classification/imagenet_resnet50_tpu.yaml']),
'image_classification.resnet50.gpu.8.fp16':
dict(
experiment_type='resnet_imagenet',
platform='gpu.8',
precision='float16',
metric_bounds=[{
'name': 'accuracy',
'min_value': 0.76,
'max_value': 0.77
}],
config_files=['official/vision/beta/configs/experiments/'
'image_classification/imagenet_resnet50_gpu.yaml'])
}
VISION_BENCHMARKS = {
'image_classification': IMAGE_CLASSIFICATION_BENCHMARKS,
}
NLP_BENCHMARKS = {
}
QAT_BENCHMARKS = {
}
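# Illustrative sketch of how a new benchmark would be registered (hypothetical
# entry, intentionally commented out; field values are assumptions):
#
# NLP_BENCHMARKS = {
#     'sentence_prediction': {
#         'sentence_prediction.bert.tpu.4x4.bf16':
#             dict(
#                 experiment_type='bert/sentence_prediction',
#                 platform='tpu.4x4',
#                 precision='bfloat16',
#                 metric_bounds=[{
#                     'name': 'accuracy',
#                     'min_value': 0.0,
#                     'max_value': 1.0
#                 }],
#                 config_files=[]),
#     },
# }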
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""TFM common benchmark training driver."""
import os
import time
from typing import Any, Mapping
from absl import logging
import orbit
import tensorflow as tf
from official.benchmark import tflite_utils
from official.common import distribute_utils
from official.core import config_definitions
from official.core import task_factory
from official.core import train_utils
from official.modeling import performance
from official.modeling.fast_training import stage_lib
def run_benchmark(
execution_mode: str,
params: config_definitions.ExperimentConfig,
model_dir: str,
distribution_strategy: tf.distribute.Strategy = None
) -> Mapping[str, Any]:
"""Runs benchmark for a specific experiment.
Args:
execution_mode: A 'str', specifying the mode. Can be 'accuracy',
'performance', or 'tflite_accuracy'.
params: ExperimentConfig instance.
model_dir: A 'str', a path to store model checkpoints and summaries.
distribution_strategy: A tf.distribute.Strategy to use. If specified,
it will be used instead of inferring the strategy from params.
Returns:
benchmark_data: benchmark data in dict format.
Raises:
NotImplementedError: If an unsupported setup is requested.
"""
# For GPU runs, allow option to set thread mode
if params.runtime.gpu_thread_mode:
os.environ['TF_GPU_THREAD_MODE'] = params.runtime.gpu_thread_mode
logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE'])
# Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
# can have significant impact on model speeds by utilizing float16 in case of
# GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
# dtype is float16
if params.runtime.mixed_precision_dtype:
performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
strategy = distribution_strategy or distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
with strategy.scope():
task = task_factory.get_task(params.task, logging_dir=model_dir)
trainer = train_utils.create_trainer(
params,
task,
train=True,
evaluate=(execution_mode == 'accuracy'))
# Initialize the model if possible, e.g., from a pre-trained checkpoint.
trainer.initialize()
steps_per_loop = params.trainer.steps_per_loop if (
execution_mode in ['accuracy', 'tflite_accuracy']) else 100
controller = orbit.Controller(
strategy=strategy,
trainer=trainer,
evaluator=trainer if (execution_mode == 'accuracy') else None,
global_step=trainer.global_step,
steps_per_loop=steps_per_loop)
logging.info('Starts to execute execution mode: %s', execution_mode)
with strategy.scope():
# Training for one loop, first loop time includes warmup time.
first_loop_start_time = time.time()
controller.train(steps=steps_per_loop)
first_loop_time = time.time() - first_loop_start_time
# Training for second loop.
second_loop_start_time = time.time()
controller.train(steps=2*steps_per_loop)
second_loop_time = time.time() - second_loop_start_time
if execution_mode == 'accuracy':
controller.train(steps=params.trainer.train_steps)
wall_time = time.time() - first_loop_start_time
eval_logs = trainer.evaluate(
tf.convert_to_tensor(params.trainer.validation_steps))
benchmark_data = {'metrics': eval_logs}
elif execution_mode == 'performance':
benchmark_data = {}
elif execution_mode == 'tflite_accuracy':
eval_logs = tflite_utils.train_and_evaluate(
params, task, trainer, controller)
benchmark_data = {'metrics': eval_logs}
else:
raise NotImplementedError(
'The benchmark execution mode is not implemented: %s' %
execution_mode)
# First training loop time contains startup time plus training time, while
# second training loop time is purely training time. Startup time can be
# recovered by subtracting second training loop time from first training
# loop time.
startup_time = first_loop_time - second_loop_time
wall_time = time.time() - first_loop_start_time
examples_per_second = steps_per_loop * params.task.train_data.global_batch_size / second_loop_time
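# Worked example (illustrative numbers): with steps_per_loop=100,
# global_batch_size=256, first_loop_time=95s and second_loop_time=60s,
# startup_time = 95 - 60 = 35s and examples_per_second = 100 * 256 / 60 ≈ 427.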
benchmark_data.update(
dict(
examples_per_second=examples_per_second,
wall_time=wall_time,
startup_time=startup_time))
return benchmark_data
def run_fast_training_benchmark(
execution_mode: str,
params: config_definitions.ExperimentConfig,
model_dir: str,
distribution_strategy: tf.distribute.Strategy = None
) -> Mapping[str, Any]:
"""Runs benchmark for a fast training experiment.
This benchmark tests and only tests the binary
tensorflow_models/official/modeling/fast_training/train.py
Args:
execution_mode: A 'str', specifying the mode. Can be 'accuracy',
'performance', or 'tflite_accuracy'.
params: ExperimentConfig instance.
model_dir: A 'str', a path to store model checkpoints and summaries.
distribution_strategy: A tf.distribute.Strategy to use. If specified,
it will be used instead of inferring the strategy from params.
Returns:
benchmark_data: benchmark data in dict format.
Raises:
NotImplementedError: If an unsupported setup is requested.
"""
if execution_mode == 'performance':
logging.warning('Fast training benchmark does not support execution_mode == '
'performance. This benchmark run will be skipped.')
return dict(examples_per_second=0.0,
wall_time=0.0,
startup_time=0.0)
strategy = distribution_strategy or distribute_utils.get_distribution_strategy(
distribution_strategy=params.runtime.distribution_strategy,
all_reduce_alg=params.runtime.all_reduce_alg,
num_gpus=params.runtime.num_gpus,
tpu_address=params.runtime.tpu)
first_loop_start_time = time.time()
_, eval_logs = stage_lib.run_progressive_experiment(
distribution_strategy=strategy,
mode='train',
params=params,
model_dir=model_dir,
run_post_eval=True)
wall_time = time.time() - first_loop_start_time
return dict(metrics=eval_logs, wall_time=wall_time,
startup_time=0.0, examples_per_second=0.0)
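# Minimal usage sketch (assumes a registered experiment type and a writable
# model_dir; not part of the benchmark harness itself):
#
#   from official.core import exp_factory
#   params = exp_factory.get_exp_config('resnet_imagenet')
#   data = run_benchmark('performance', params, '/tmp/benchmark_run')
#   print(data['examples_per_second'], data['wall_time'], data['startup_time'])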
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for tensorflow_models.official.benchmark.benchmark_lib."""
# pylint: disable=g-direct-tensorflow-import
from absl.testing import parameterized
import gin
import tensorflow as tf
from tensorflow.python.distribute import combinations
from tensorflow.python.distribute import strategy_combinations
from official.common import registry_imports # pylint: disable=unused-import
from official.benchmark import benchmark_lib
from official.core import exp_factory
from official.modeling import hyperparams
def all_strategy_combinations():
return combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],)
class BenchmarkLibTest(tf.test.TestCase, parameterized.TestCase):
def setUp(self):
super(BenchmarkLibTest, self).setUp()
self._test_config = {
'trainer': {
'steps_per_loop': 10,
'optimizer_config': {
'optimizer': {
'type': 'sgd'
},
'learning_rate': {
'type': 'constant'
}
},
'continuous_eval_timeout': 5,
'train_steps': 20,
'validation_steps': 10
},
}
@combinations.generate(
combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
execution_mode=['performance', 'accuracy'],
))
def test_benchmark(self, distribution, execution_mode):
model_dir = self.get_temp_dir()
params = exp_factory.get_exp_config('mock')
params = hyperparams.override_params_dict(
params, self._test_config, is_strict=True)
benchmark_data = benchmark_lib.run_benchmark(execution_mode,
params,
model_dir,
distribution)
self.assertIn('examples_per_second', benchmark_data)
self.assertIn('wall_time', benchmark_data)
self.assertIn('startup_time', benchmark_data)
if execution_mode == 'accuracy':
self.assertIn('metrics', benchmark_data)
@combinations.generate(
combinations.combine(
distribution=[
strategy_combinations.default_strategy,
strategy_combinations.cloud_tpu_strategy,
strategy_combinations.one_device_strategy_gpu,
],
execution_mode=['performance', 'accuracy'],
))
def test_fast_training_benchmark(self, distribution, execution_mode):
model_dir = self.get_temp_dir()
with gin.unlock_config():
gin.parse_config_files_and_bindings(
None,
"get_initialize_fn.stacking_pattern = 'dense_{:layer_id}/'\n"
"StageParamProgressor.stage_overrides = ("
" {'trainer': {'train_steps': 1}},"
" {'trainer': {'train_steps': 2}},"
")")
params = exp_factory.get_exp_config('mock')
params = hyperparams.override_params_dict(
params, self._test_config, is_strict=True)
benchmark_data = benchmark_lib.run_fast_training_benchmark(execution_mode,
params,
model_dir,
distribution)
if execution_mode == 'performance':
self.assertEqual(dict(examples_per_second=0.0,
wall_time=0.0,
startup_time=0.0),
benchmark_data)
else:
self.assertIn('wall_time', benchmark_data)
self.assertIn('metrics', benchmark_data)
if __name__ == '__main__':
tf.test.main()
# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils to annotate and trace benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
from absl.testing import flagsaver
FLAGS = flags.FLAGS
flags.DEFINE_multi_string(
'benchmark_method_flags', None,
'Optional list of runtime flags of the form key=value. Specify '
'multiple times to specify different flags. These will override the FLAGS '
'object directly after hardcoded settings in individual benchmark methods '
'before they call _run_and_report benchmark. Example if we set '
'--benchmark_method_flags=train_steps=10 and a benchmark method hardcodes '
'FLAGS.train_steps=10000 and later calls _run_and_report_benchmark, '
'it\'ll only run for 10 steps. This is useful for '
'debugging/profiling workflows.')
def enable_runtime_flags(decorated_func):
"""Sets attributes from --benchmark_method_flags for method execution.
@enable_runtime_flags decorator temporarily adds flags passed in via
--benchmark_method_flags and runs the decorated function in that context.
A user can set --benchmark_method_flags=train_steps=5 to run the benchmark
method in the snippet below with FLAGS.train_steps=5 for debugging (without
modifying the benchmark code).
class ModelBenchmark():
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
# run benchmark ...
# report benchmark results ...
def benchmark_method(self):
FLAGS.train_steps = 1000
...
self._run_and_report_benchmark()
Args:
decorated_func: The method that runs the benchmark after previous setup
execution that set some flags.
Returns:
new_func: The same method which executes in a temporary context where flag
overrides from --benchmark_method_flags are active.
"""
def runner(*args, **kwargs):
"""Creates a temporary context to activate --benchmark_method_flags."""
if FLAGS.benchmark_method_flags:
saved_flag_values = flagsaver.save_flag_values()
for key_value in FLAGS.benchmark_method_flags:
key, value = key_value.split('=', 1)
try:
numeric_float = float(value)
numeric_int = int(numeric_float)
if abs(numeric_int) == abs(numeric_float):
flag_value = numeric_int
else:
flag_value = numeric_float
except ValueError:
flag_value = value
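# Illustrative coercion (hypothetical flag names): 'train_steps=10' yields the
# int 10, 'learning_rate=0.5' yields the float 0.5, and 'optimizer=adam' falls
# through to the string 'adam'.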
logging.info('Setting --%s=%s', key, flag_value)
setattr(FLAGS, key, flag_value)
else:
saved_flag_values = None
try:
result = decorated_func(*args, **kwargs)
return result
finally:
if saved_flag_values:
flagsaver.restore_flag_values(saved_flag_values)
return runner
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes BERT benchmarks and accuracy tests."""
import functools
import json
import math
import os
import time
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
from official.benchmark import benchmark_wrappers
from official.benchmark import bert_benchmark_utils as benchmark_utils
from official.benchmark import owner_utils
from official.common import distribute_utils
from official.nlp.bert import configs
from official.nlp.bert import run_classifier
# pylint: disable=line-too-long
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_model.ckpt'
CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_train.tf_record'
CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_eval.tf_record'
CLASSIFIER_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_meta_data'
MODEL_CONFIG_FILE_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_config.json'
# pylint: enable=line-too-long
TMP_DIR = os.getenv('TMPDIR')
FLAGS = flags.FLAGS
class BertClassifyBenchmarkBase(benchmark_utils.BertBenchmarkBase):
"""Base class to hold methods common to test classes in the module."""
def __init__(self, output_dir=None, tpu=None):
super(BertClassifyBenchmarkBase, self).__init__(output_dir, tpu=tpu)
self.num_epochs = None
self.num_steps_per_epoch = None
FLAGS.steps_per_loop = 1
@flagsaver.flagsaver
def _run_bert_classifier(self, callbacks=None, use_ds=True):
"""Starts BERT classification task."""
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
epochs = self.num_epochs if self.num_epochs else FLAGS.num_train_epochs
if self.num_steps_per_epoch:
steps_per_epoch = self.num_steps_per_epoch
else:
train_data_size = input_meta_data['train_data_size']
steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
warmup_steps = int(epochs * steps_per_epoch * 0.1)
eval_steps = int(
math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))
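# Worked example (illustrative numbers): with train_data_size=3668,
# train_batch_size=4 and a single epoch, steps_per_epoch = 917 and
# warmup_steps = int(1 * 917 * 0.1) = 91.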
if self.tpu:
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy='tpu', tpu_address=self.tpu)
else:
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy='mirrored' if use_ds else 'off',
num_gpus=self.num_gpus)
max_seq_length = input_meta_data['max_seq_length']
train_input_fn = run_classifier.get_dataset_fn(
FLAGS.train_data_path,
max_seq_length,
FLAGS.train_batch_size,
is_training=True)
eval_input_fn = run_classifier.get_dataset_fn(
FLAGS.eval_data_path,
max_seq_length,
FLAGS.eval_batch_size,
is_training=False)
_, summary = run_classifier.run_bert_classifier(
strategy,
bert_config,
input_meta_data,
FLAGS.model_dir,
epochs,
steps_per_epoch,
FLAGS.steps_per_loop,
eval_steps,
warmup_steps,
FLAGS.learning_rate,
FLAGS.init_checkpoint,
train_input_fn,
eval_input_fn,
training_callbacks=False,
custom_callbacks=callbacks)
return summary
class BertClassifyBenchmarkReal(BertClassifyBenchmarkBase):
"""Short benchmark performance tests for BERT model.
Tests BERT classification performance in different GPU and TPU configurations.
The naming convention of the test cases below follows
`benchmark_(number of gpus)_gpu_(dataset type)` for GPUs and
`benchmark_(topology)_tpu_(dataset type)` for TPUs.
"""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
super(BertClassifyBenchmarkReal, self).__init__(
output_dir=output_dir, tpu=tpu)
self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
self.bert_config_file = MODEL_CONFIG_FILE_PATH
self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
# Since we only care about performance metrics, we limit
# the number of training steps and epochs to prevent unnecessarily
# long tests.
self.num_steps_per_epoch = 100
self.num_epochs = 1
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=0,
max_accuracy=1,
use_ds=True):
"""Starts BERT performance benchmark test."""
start_time_sec = time.time()
summary = self._run_bert_classifier(
callbacks=[self.timer_callback], use_ds=use_ds)
wall_time_sec = time.time() - start_time_sec
# Since we do not load from any pretrained checkpoints, we ignore all
# accuracy metrics.
summary.pop('eval_metrics', None)
summary['start_time_sec'] = start_time_sec
super(BertClassifyBenchmarkReal, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def benchmark_1_gpu_mrpc(self):
"""Test BERT model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_1_gpu_mrpc_xla(self):
"""Test BERT model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_xla')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
FLAGS.enable_xla = True
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_1_gpu_mrpc_no_dist_strat(self):
"""Test BERT model performance with 1 GPU, no distribution strategy."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_no_dist_strat')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_mrpc(self):
"""Test BERT model performance with 8 GPUs."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_mrpc(self):
"""Test BERT model performance with 2x2 TPU."""
self._setup()
FLAGS.steps_per_loop = 50
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 32
FLAGS.eval_batch_size = 32
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
class BertClassifyAccuracy(BertClassifyBenchmarkBase):
"""Short accuracy test for BERT model.
Tests BERT classification task model accuracy. The naming
convention of the test cases below follows the
`benchmark_(number of gpus)_gpu_(dataset type)` format.
"""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
self.bert_config_file = MODEL_CONFIG_FILE_PATH
self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
super(BertClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=0.84,
max_accuracy=0.88):
"""Starts BERT accuracy benchmark test."""
start_time_sec = time.time()
summary = self._run_bert_classifier(callbacks=[self.timer_callback])
wall_time_sec = time.time() - start_time_sec
super(BertClassifyAccuracy, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def _setup(self):
super(BertClassifyAccuracy, self)._setup()
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.init_checkpoint = self.pretrained_checkpoint_path
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_mrpc(self):
"""Run BERT model accuracy test with 8 GPUs.
Due to the comparatively small cardinality of the MRPC dataset, the
training accuracy metric has high variance between runs. As such, we
set a wide range of allowed accuracy (84% to 88%).
"""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_8_gpu_mrpc_xla(self):
"""Run BERT model accuracy test with 8 GPUs with XLA."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc_xla')
FLAGS.enable_xla = True
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_mrpc(self):
"""Run BERT model accuracy test on 2x2 TPU."""
self._setup()
FLAGS.steps_per_loop = 50
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mrpc')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions or classes shared between BERT benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
# pylint: disable=g-bad-import-order
import numpy as np
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.utils.flags import core as flags_core
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
FLAGS = flags.FLAGS
class BenchmarkTimerCallback(tf.keras.callbacks.Callback):
"""Callback that records time it takes to run each batch."""
def __init__(self, num_batches_to_skip=10):
super(BenchmarkTimerCallback, self).__init__()
self.batch_start_times = {}
self.batch_stop_times = {}
def on_batch_begin(self, batch, logs=None):
self.batch_start_times[batch] = time.time()
def on_batch_end(self, batch, logs=None):
# If there are multiple steps_per_loop, the end batch index will not be the
# same as the starting index. Use the last starting index instead.
if batch not in self.batch_start_times:
batch = max(self.batch_start_times.keys())
self.batch_stop_times[batch] = time.time()
def get_examples_per_sec(self, batch_size, num_batches_to_skip=1):
batch_durations = []
for batch in self.batch_start_times:
if batch in self.batch_stop_times and batch >= num_batches_to_skip:
batch_durations.append(self.batch_stop_times[batch] -
self.batch_start_times[batch])
return batch_size / np.mean(batch_durations)
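# Illustrative: with batch_size=32 and a mean per-batch duration of 0.1s,
# this returns 32 / 0.1 = 320 examples per second.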
def get_startup_time(self, program_start_time):
return self.batch_start_times[0] - program_start_time
class BertBenchmarkBase(PerfZeroBenchmark):
"""Base class to hold methods common to test classes."""
local_flags = None
def __init__(self, output_dir=None, tpu=None, **kwargs):
super(BertBenchmarkBase, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
self.num_gpus = 8
self.timer_callback = None
def _setup(self):
"""Sets up and resets flags before each test."""
super(BertBenchmarkBase, self)._setup()
self.timer_callback = BenchmarkTimerCallback()
def _report_benchmark(self, stats, wall_time_sec, min_accuracy, max_accuracy):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from BERT models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds
min_accuracy: Minimum classification accuracy constraint to verify
correctness of the model.
max_accuracy: Maximum classification accuracy constraint to verify
correctness of the model.
"""
metrics = [{
'name': 'training_loss',
'value': stats['train_loss'],
}]
if self.timer_callback:
metrics.append({
'name':
'exp_per_second',
'value':
self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size *
FLAGS.steps_per_loop)
})
else:
metrics.append({
'name': 'exp_per_second',
'value': 0.0,
})
if self.timer_callback and 'start_time_sec' in stats:
metrics.append({
'name': 'startup_time',
'value': self.timer_callback.get_startup_time(stats['start_time_sec'])
})
if 'eval_metrics' in stats:
metrics.append({
'name': 'eval_accuracy',
'value': stats['eval_metrics'],
'min_value': min_accuracy,
'max_value': max_accuracy,
})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=stats['total_training_steps'],
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Benchmarks config utils."""
import os
def get_config_path(
config_file: str,
base_dir: str = '') -> str:
"""Gets the absolute path of the config file."""
return os.path.join(base_dir, config_file)
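# Illustrative (path values hypothetical): get_config_path('configs/foo.yaml')
# returns 'configs/foo.yaml' when base_dir is empty, and
# '/abs/base/configs/foo.yaml' when base_dir is '/abs/base'.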
[
{
"description": "The ID of the benchmark run, where this metric should tie to.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The name of the metric, which should be descriptive. E.g. training_loss, accuracy.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The unit of the metric. E.g. MB per sec.",
"mode": "NULLABLE",
"name": "unit",
"type": "STRING"
},
{
"description": "The value of the metric.",
"mode": "NULLABLE",
"name": "value",
"type": "FLOAT"
},
{
"description": "The timestamp when the metric is recorded.",
"mode": "REQUIRED",
"name": "timestamp",
"type": "TIMESTAMP"
},
{
"description": "The global step when this metric is recorded.",
"mode": "NULLABLE",
"name": "global_step",
"type": "INTEGER"
},
{
"description": "Free format metadata for the extra information about the metric.",
"mode": "REPEATED",
"name": "extras",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "name",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
]
}
]
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "model_id",
"type": "STRING"
},
{
"description": "The name of the model, E.g ResNet50, LeNet-5 etc.",
"mode": "REQUIRED",
"name": "model_name",
"type": "STRING"
},
{
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "TIMESTAMP"
},
{
"description": "The unique name for a test by the combination of key parameters, eg batch size, num of GPU, etc. It is hardware independent.",
"mode": "NULLABLE",
"name": "test_id",
"type": "STRING"
},
{
"description": "The tensorflow version information.",
"fields": [
{
"description": "Version of the tensorflow. E.g. 1.7.0-rc0",
"mode": "REQUIRED",
"name": "version",
"type": "STRING"
},
{
"description": "Git Hash of the tensorflow",
"mode": "NULLABLE",
"name": "git_hash",
"type": "STRING"
},
{
"description": "The channel of the tensorflow binary, eg, nightly, RC, final, custom.",
"mode": "NULLABLE",
"name": "channel",
"type": "STRING"
},
{
"description": "Identify anything special about the build, eg CUDA 10, NCCL, MKL, etc.",
"mode": "NULLABLE",
"name": "build_type",
"type": "STRING"
}
],
"mode": "REQUIRED",
"name": "tensorflow_version",
"type": "RECORD"
},
{
"description": "The arbitrary attribute of the model.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
},
{
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
"description": "The list of parameters run with the model. It could contain hyperparameters or others.",
"fields": [
{
"description": "The name of the parameter.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The string value of the parameter.",
"mode": "NULLABLE",
"name": "string_value",
"type": "STRING"
},
{
"description": "The bool value of the parameter.",
"mode": "NULLABLE",
"name": "bool_value",
"type": "STRING"
},
{
"description": "The int/long value of the parameter.",
"mode": "NULLABLE",
"name": "long_value",
"type": "INTEGER"
},
{
"description": "The double/float value of parameter.",
"mode": "NULLABLE",
"name": "float_value",
"type": "FLOAT"
}
],
"mode": "REPEATED",
"name": "run_parameters",
"type": "RECORD"
},
{
"description": "The dataset that run with the benchmark.",
"mode": "NULLABLE",
"name": "dataset",
"type": "RECORD",
"fields": [
{
"description": "The name of the dataset that the model is trained/validated with. E.g ImageNet, mnist.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the dataset.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"description": "Used to differentiate from AWS, GCE or DGX-1 at a high level",
"mode": "NULLABLE",
"name": "test_environment",
"type": "STRING"
},
{
"description": "The machine configuration of the benchmark run.",
"mode": "NULLABLE",
"name": "machine_config",
"type": "RECORD",
"fields": [
{
"description": "The platform information of the benchmark run.",
"mode": "NULLABLE",
"name": "platform_info",
"type": "RECORD",
"fields": [
{
"description": "Eg: 64bit.",
"mode": "NULLABLE",
"name": "bits",
"type": "STRING"
},
{
"description": "Eg: ELF.",
"mode": "NULLABLE",
"name": "linkage",
"type": "STRING"
},
{
"description": "Eg: i386.",
"mode": "NULLABLE",
"name": "machine",
"type": "STRING"
},
{
"description": "Eg: 3.13.0-76-generic.",
"mode": "NULLABLE",
"name": "release",
"type": "STRING"
},
{
"description": "Eg: Linux.",
"mode": "NULLABLE",
"name": "system",
"type": "STRING"
},
{
"description": "Eg: #120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016.",
"mode": "NULLABLE",
"name": "version",
"type": "STRING"
}
]
},
{
"description": "The CPU information of the benchmark run.",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "num_cores",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "num_cores_allowed",
"type": "INTEGER"
},
{
"description" : "How fast are those CPUs.",
"mode": "NULLABLE",
"name": "mhz_per_cpu",
"type": "FLOAT"
},
{
"description" : "Additional CPU info, Eg: Intel Ivybridge with HyperThreading (24 cores).",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "STRING"
},
{
"description" : "What kind of cpu scaling is enabled on the host. Eg performance, ondemand, conservative, mixed.",
"mode": "NULLABLE",
"name": "cpu_governor",
"type": "STRING"
},
{
"description": "Cache size of the CPUs.",
"mode": "NULLABLE",
"name": "cache_size",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "level",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "size",
"type": "INTEGER"
}
]
}
]
},
{
"mode": "NULLABLE",
"name": "gpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "count",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "model",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "cuda_version",
"type": "STRING"
}
]
},
{
"description": "The cloud instance inforation if the benchmark run is executed on cloud",
"mode": "NULLABLE",
"name": "cloud_info",
"type": "RECORD",
"fields": [
{
"description": "The instance type, E.g. n1-standard-4.",
"mode": "NULLABLE",
"name": "instance_type",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the cloud info.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"mode": "NULLABLE",
"name": "memory_total",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "memory_available",
"type": "STRING"
}
]
}
]
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The status of the run for the benchmark. Eg, running, failed, success",
"mode": "REQUIRED",
"name": "status",
"type": "STRING"
}
]
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.utils.flags import core as flags_core
class KerasBenchmark(PerfZeroBenchmark):
"""Base benchmark class with methods to simplify testing."""
def __init__(self,
output_dir=None,
default_flags=None,
flag_methods=None,
tpu=None):
super(KerasBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods,
tpu=tpu)
def _report_benchmark(self,
stats,
wall_time_sec,
top_1_max=None,
top_1_min=None,
log_steps=None,
total_batch_size=None,
warmup=1,
start_time_sec=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from keras models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds
top_1_max: highest passing level for top_1 accuracy.
top_1_min: lowest passing level for top_1 accuracy.
log_steps: How often the log was created for stats['step_timestamp_log'].
total_batch_size: Global batch-size.
warmup: number of entries in stats['step_timestamp_log'] to ignore.
start_time_sec: the start time of the program in seconds since epoch
"""
metrics = []
if 'accuracy_top_1' in stats:
metrics.append({
'name': 'accuracy_top_1',
'value': stats['accuracy_top_1'],
'min_value': top_1_min,
'max_value': top_1_max
})
metrics.append({
'name': 'top_1_train_accuracy',
'value': stats['training_accuracy_top_1']
})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup):
# The first entry in time_log marks the start of step 1; the remaining
# entries record the end of each logged step.
time_log = stats['step_timestamp_log']
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second', 'value': examples_per_sec})
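# Worked example (illustrative numbers): with total_batch_size=128,
# log_steps=100, 12 logged timestamps and warmup=1,
# num_examples = 128 * 100 * 10 = 128000; if elapsed is 50 seconds,
# exp_per_second = 2560.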
if 'avg_exp_per_second' in stats:
metrics.append({
'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']
})
if start_time_sec and 'step_timestamp_log' in stats:
time_log = stats['step_timestamp_log']
# time_log[0] is recorded at the beginning of the first step.
startup_time = time_log[0].timestamp - start_time_sec
metrics.append({'name': 'startup_time', 'value': startup_time})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf
from official.benchmark import keras_benchmark
from official.benchmark import benchmark_wrappers
from official.benchmark.models import resnet_cifar_main
MIN_TOP_1_ACCURACY = 0.929
MAX_TOP_1_ACCURACY = 0.938
FLAGS = flags.FLAGS
CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
"""Accuracy tests for ResNet56 Keras CIFAR-10."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""A benchmark class.
Args:
output_dir: directory where outputs such as log files are written.
root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more named
arguments before updating the constructor.
"""
self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def _setup(self):
super(Resnet56KerasAccuracy, self)._setup()
FLAGS.use_tensor_lr = False
def benchmark_graph_1_gpu(self):
"""Test keras based model with Keras fit and distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
def benchmark_cpu(self):
"""Test keras based model on CPU."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat(self):
"""Test keras based model on CPU without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_run_eagerly(self):
"""Test keras based model on CPU w/forced eager and no dist_strat."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat(self):
"""Test keras based model with eager and no dist strat."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test keras based model w/forced eager and no dist_strat."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_graph_1_gpu_no_dist_strat(self):
"""Test keras based model with Keras fit but not distribution strategies."""
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
def benchmark_graph_2_gpu(self):
"""Test keras based model with Keras fit and distribution strategies."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY,
total_batch_size=FLAGS.batch_size,
log_steps=100)
class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Short performance tests for ResNet56 via Keras and CIFAR-10."""
def __init__(self, output_dir=None, default_flags=None):
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu(self):
"""Test 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_xla(self):
"""Test 1 gpu with xla enabled."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_graph_1_gpu(self):
"""Test 1 gpu graph."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat(self):
"""Test 1 gpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_graph_1_gpu_no_dist_strat(self):
"""Test 1 gpu graph mode without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test 1 gpu without distribution strategy and forced eager."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test 2 gpu."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
def benchmark_graph_2_gpu(self):
"""Test 2 gpu graph mode."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
def benchmark_cpu(self):
"""Test cpu."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_graph_cpu(self):
"""Test cpu graph mode."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_run_eagerly(self):
"""Test cpu without distribution strategy and forced eager."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.distribution_strategy = 'off'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_run_eagerly')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat(self):
"""Test cpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_graph_cpu_no_dist_strat(self):
"""Test cpu graph mode without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = False
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu_no_dist_strat')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
class Resnet56KerasBenchmarkSynth(Resnet56KerasBenchmarkBase):
"""Synthetic benchmarks for ResNet56 and Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
default_flags = {}
default_flags['skip_eval'] = True
default_flags['use_synthetic_data'] = True
default_flags['train_steps'] = 110
default_flags['log_steps'] = 10
default_flags['use_tensor_lr'] = False
super(Resnet56KerasBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=default_flags)
class Resnet56KerasBenchmarkReal(Resnet56KerasBenchmarkBase):
"""Real data benchmarks for ResNet56 and Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
default_flags = {}
default_flags['skip_eval'] = True
default_flags['data_dir'] = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
default_flags['train_steps'] = 110
default_flags['log_steps'] = 10
default_flags['use_tensor_lr'] = False
super(Resnet56KerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=default_flags)
if __name__ == '__main__':
tf.test.main()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to Cifar-10 dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
from official.legacy.image_classification.resnet import imagenet_preprocessing
HEIGHT = 32
WIDTH = 32
NUM_CHANNELS = 3
_DEFAULT_IMAGE_BYTES = HEIGHT * WIDTH * NUM_CHANNELS
# The record is the image plus a one-byte label
_RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1
# TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
NUM_IMAGES = {
'train': 50000,
'validation': 10000,
}
_NUM_DATA_FILES = 5
NUM_CLASSES = 10
def parse_record(raw_record, is_training, dtype):
"""Parses a record containing a training example of an image.
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
This method converts the label to one hot to fit the loss function.
Args:
raw_record: scalar Tensor tf.string containing a serialized Example protocol
buffer.
is_training: A boolean denoting whether the input is for training.
dtype: Data type to use for input images.
Returns:
Tuple with processed image tensor and one-hot-encoded label tensor.
"""
# Convert bytes to a vector of uint8 that is record_bytes long.
record_vector = tf.io.decode_raw(raw_record, tf.uint8)
  # The first byte represents the label, which we convert from uint8 to int32.
label = tf.cast(record_vector[0], tf.int32)
# The remaining bytes after the label represent the image, which we reshape
# from [depth * height * width] to [depth, height, width].
depth_major = tf.reshape(record_vector[1:_RECORD_BYTES],
[NUM_CHANNELS, HEIGHT, WIDTH])
# Convert from [depth, height, width] to [height, width, depth], and cast as
# float32.
image = tf.cast(tf.transpose(a=depth_major, perm=[1, 2, 0]), tf.float32)
image = preprocess_image(image, is_training)
image = tf.cast(image, dtype)
return image, label
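# Illustrative sketch (hypothetical helper, not part of the original module):
# decodes one synthetic all-zero record to show the shapes and dtypes that
# parse_record produces; it only uses the constants defined above.
def _parse_record_example():
  """Decodes a single synthetic CIFAR-10 record and returns (image, label)."""
  raw_record = tf.constant(bytes(_RECORD_BYTES))  # 1 label byte + 3072 image bytes.
  image, label = parse_record(raw_record, is_training=False, dtype=tf.float32)
  # image: float32 tensor of shape [HEIGHT, WIDTH, NUM_CHANNELS] == [32, 32, 3].
  # label: scalar int32 class index in [0, NUM_CLASSES).
  return image, label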
def preprocess_image(image, is_training):
"""Preprocess a single image of layout [height, width, depth]."""
if is_training:
# Resize the image to add four extra pixels on each side.
image = tf.image.resize_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8)
# Randomly crop a [HEIGHT, WIDTH] section of the image.
image = tf.image.random_crop(image, [HEIGHT, WIDTH, NUM_CHANNELS])
# Randomly flip the image horizontally.
image = tf.image.random_flip_left_right(image)
# Subtract off the mean and divide by the variance of the pixels.
image = tf.image.per_image_standardization(image)
return image
def get_filenames(is_training, data_dir):
"""Returns a list of filenames."""
assert tf.io.gfile.exists(data_dir), (
'Run cifar10_download_and_extract.py first to download and extract the '
'CIFAR-10 data.')
if is_training:
return [
os.path.join(data_dir, 'data_batch_%d.bin' % i)
for i in range(1, _NUM_DATA_FILES + 1)
]
else:
return [os.path.join(data_dir, 'test_batch.bin')]
def input_fn(is_training,
data_dir,
batch_size,
dtype=tf.float32,
datasets_num_private_threads=None,
parse_record_fn=parse_record,
input_context=None,
drop_remainder=False):
"""Input function which provides batches for train or eval.
Args:
is_training: A boolean denoting whether the input is for training.
data_dir: The directory containing the input data.
batch_size: The number of samples per batch.
dtype: Data type to use for images/features
datasets_num_private_threads: Number of private threads for tf.data.
parse_record_fn: Function to use for parsing the records.
input_context: A `tf.distribute.InputContext` object passed in by
`tf.distribute.Strategy`.
    drop_remainder: A boolean indicating whether to drop the remainder of the
      batches. If True, the batch dimension will be static.
Returns:
A dataset that can be used for iteration.
"""
filenames = get_filenames(is_training, data_dir)
dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
if input_context:
logging.info(
'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
input_context.input_pipeline_id, input_context.num_input_pipelines)
dataset = dataset.shard(input_context.num_input_pipelines,
input_context.input_pipeline_id)
return imagenet_preprocessing.process_record_dataset(
dataset=dataset,
is_training=is_training,
batch_size=batch_size,
shuffle_buffer=NUM_IMAGES['train'],
parse_record_fn=parse_record_fn,
dtype=dtype,
datasets_num_private_threads=datasets_num_private_threads,
drop_remainder=drop_remainder)
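# Usage sketch (illustrative only; the data_dir shown is just the default used
# by the training script, not a requirement):
#
#   dataset = input_fn(
#       is_training=True,
#       data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
#       batch_size=128)
#   images, labels = next(iter(dataset))
#   # images: [128, 32, 32, 3] float32, labels: [128]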
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the Cifar-10 dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# Import libraries
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
from official.benchmark.models import cifar_preprocessing
from official.benchmark.models import resnet_cifar_model
from official.benchmark.models import synthetic_util
from official.common import distribute_utils
from official.legacy.image_classification.resnet import common
from official.utils.flags import core as flags_core
from official.utils.misc import keras_utils
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(0.1, 91), (0.01, 136), (0.001, 182)
]
def learning_rate_schedule(current_epoch,
current_batch,
batches_per_epoch,
batch_size):
"""Handles linear scaling rule and LR decay.
Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
provided scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
current_batch: integer, current batch in the current epoch, indexed from 0.
batches_per_epoch: integer, number of steps in an epoch.
    batch_size: integer, total batch size.
Returns:
Adjusted learning rate.
"""
del current_batch, batches_per_epoch # not used
initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128
learning_rate = initial_learning_rate
for mult, start_epoch in LR_SCHEDULE:
if current_epoch >= start_epoch:
learning_rate = initial_learning_rate * mult
else:
break
return learning_rate
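# Worked example (illustrative sketch, not part of the original script; it
# assumes common.BASE_LEARNING_RATE == 0.1): with batch_size == 128 the
# initial learning rate is 0.1 and the schedule steps down by 10x at the
# epoch boundaries listed in LR_SCHEDULE.
def _learning_rate_schedule_example():
  """Returns the learning rate at a few representative epochs."""
  return [
      learning_rate_schedule(
          current_epoch=epoch, current_batch=0, batches_per_epoch=390,
          batch_size=128) for epoch in (0, 91, 136, 182)
  ]  # -> approximately [0.1, 0.01, 0.001, 0.0001]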
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
"""Callback to update learning rate on every batch (not epoch boundaries).
  N.B. Only supports Keras optimizers, not TF optimizers.
Attributes:
schedule: a function that takes an epoch index and a batch index as input
(both integer, indexed from 0) and returns a new learning rate as
output (float).
"""
def __init__(self, schedule, batch_size, steps_per_epoch):
super(LearningRateBatchScheduler, self).__init__()
self.schedule = schedule
self.steps_per_epoch = steps_per_epoch
self.batch_size = batch_size
self.epochs = -1
self.prev_lr = -1
def on_epoch_begin(self, epoch, logs=None):
if not hasattr(self.model.optimizer, 'learning_rate'):
raise ValueError('Optimizer must have a "learning_rate" attribute.')
self.epochs += 1
def on_batch_begin(self, batch, logs=None):
"""Executes before step begins."""
lr = self.schedule(self.epochs,
batch,
self.steps_per_epoch,
self.batch_size)
if not isinstance(lr, (float, np.float32, np.float64)):
      raise ValueError(
          'The output of the "schedule" function should be a float.')
if lr != self.prev_lr:
self.model.optimizer.learning_rate = lr # lr should be a float here
self.prev_lr = lr
logging.debug(
'Epoch %05d Batch %05d: LearningRateBatchScheduler '
'change learning rate to %s.', self.epochs, batch, lr)
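# Usage sketch (illustrative; run() below wires this callback up for real):
#
#   lr_callback = LearningRateBatchScheduler(
#       schedule=learning_rate_schedule,
#       batch_size=flags_obj.batch_size,
#       steps_per_epoch=steps_per_epoch)
#   model.fit(..., callbacks=[lr_callback])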
def run(flags_obj):
"""Run ResNet Cifar-10 training and eval loop using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
keras_utils.set_session_config(
enable_xla=flags_obj.enable_xla)
# Execute flag override logic for better model performance
if flags_obj.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus,
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
dtype = flags_core.get_tf_dtype(flags_obj)
if dtype == 'fp16':
    raise ValueError('dtype fp16 is not supported in Keras. Use the default '
                     'value (fp32).')
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribute_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs)
if strategy:
    # flags_obj.enable_get_next_as_optional controls whether to enable the
    # get_next_as_optional behavior in DistributedIterator. If true, the last
    # partial batch can be supported.
strategy.extended.experimental_enable_get_next_as_optional = (
flags_obj.enable_get_next_as_optional
)
strategy_scope = distribute_utils.get_strategy_scope(strategy)
if flags_obj.use_synthetic_data:
synthetic_util.set_up_synthetic_data()
input_fn = common.get_synth_input_fn(
height=cifar_preprocessing.HEIGHT,
width=cifar_preprocessing.WIDTH,
num_channels=cifar_preprocessing.NUM_CHANNELS,
num_classes=cifar_preprocessing.NUM_CLASSES,
dtype=flags_core.get_tf_dtype(flags_obj),
drop_remainder=True)
else:
synthetic_util.undo_set_up_synthetic_data()
input_fn = cifar_preprocessing.input_fn
train_input_dataset = input_fn(
is_training=True,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=cifar_preprocessing.parse_record,
datasets_num_private_threads=flags_obj.datasets_num_private_threads,
dtype=dtype,
      # Set drop_remainder to avoid the partial-batch logic in the
      # normalization layer, which triggers tf.where and leads to extra memory
      # copies of input sizes between host and GPU.
drop_remainder=(not flags_obj.enable_get_next_as_optional))
eval_input_dataset = None
if not flags_obj.skip_eval:
eval_input_dataset = input_fn(
is_training=False,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=cifar_preprocessing.parse_record)
steps_per_epoch = (
cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
lr_schedule = 0.1
if flags_obj.use_tensor_lr:
initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE),
values=[initial_learning_rate] +
list(p[0] * initial_learning_rate for p in LR_SCHEDULE))
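    # For example (illustrative; assumes common.BASE_LEARNING_RATE == 0.1 and
    # batch_size == 128): steps_per_epoch == 390, so the decay boundaries fall
    # at steps 35490, 53040 and 70980, with rates 0.1, 0.01, 0.001 and 0.0001.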
with strategy_scope:
optimizer = common.get_optimizer(lr_schedule)
model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=optimizer,
metrics=(['sparse_categorical_accuracy']
if flags_obj.report_accuracy_metrics else None),
run_eagerly=flags_obj.run_eagerly)
train_epochs = flags_obj.train_epochs
callbacks = common.get_callbacks()
if not flags_obj.use_tensor_lr:
lr_callback = LearningRateBatchScheduler(
schedule=learning_rate_schedule,
batch_size=flags_obj.batch_size,
steps_per_epoch=steps_per_epoch)
callbacks.append(lr_callback)
  # If training for multiple epochs, ignore the train_steps flag.
if train_epochs <= 1 and flags_obj.train_steps:
steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
train_epochs = 1
num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
if flags_obj.set_learning_phase_to_train:
# TODO(haoyuzhang): Understand slowdown of setting learning phase when
# not using distribution strategy.
tf.keras.backend.set_learning_phase(1)
num_eval_steps = None
validation_data = None
if not strategy and flags_obj.explicit_gpu_placement:
    # TODO(b/135607227): Add device scope automatically in Keras training loop
    # when not using distribution strategy.
no_dist_strat_device = tf.device('/device:GPU:0')
no_dist_strat_device.__enter__()
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=steps_per_epoch,
callbacks=callbacks,
validation_steps=num_eval_steps,
validation_data=validation_data,
validation_freq=flags_obj.epochs_between_evals,
verbose=2)
eval_output = None
if not flags_obj.skip_eval:
eval_output = model.evaluate(eval_input_dataset,
steps=num_eval_steps,
verbose=2)
if not strategy and flags_obj.explicit_gpu_placement:
no_dist_strat_device.__exit__()
stats = common.build_stats(history, eval_output, callbacks)
return stats
def define_cifar_flags():
common.define_keras_flags()
flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
model_dir='/tmp/cifar10_model',
epochs_between_evals=10,
batch_size=128)
def main(_):
return run(flags.FLAGS)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_cifar_flags()
app.run(main)
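# Example invocation (illustrative; the flags shown are defined via
# define_cifar_flags and common.define_keras_flags, and the data_dir value is
# just the default set above):
#
#   python3 resnet_cifar_main.py \
#     --data_dir=/tmp/cifar10_data/cifar-10-batches-bin \
#     --num_gpus=1 \
#     --batch_size=128 \
#     --train_epochs=182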