"vscode:/vscode.git/clone" did not exist on "06b1ec72efb8a7c8627996bd8562bc128e538fa0"
Unverified commit c5e107ff authored by Hongkun Yu, committed by GitHub

Remove benchmark folder from the master branch. They are stale. (#9085)

parent 266c7f43
# Lint as: python3
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utils to annotate and trace benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import flags
from absl import logging
from absl.testing import flagsaver
FLAGS = flags.FLAGS
flags.DEFINE_multi_string(
'benchmark_method_flags', None,
'Optional list of runtime flags of the form key=value. Specify '
'multiple times to specify different flags. These will override the FLAGS '
'object directly after hardcoded settings in individual benchmark methods '
'before they call _run_and_report_benchmark. For example, if we set '
'--benchmark_method_flags=train_steps=10 and a benchmark method hardcodes '
'FLAGS.train_steps=10000 and later calls _run_and_report_benchmark, '
'the benchmark will only run for 10 steps. This is useful for '
'debugging/profiling workflows.')
def enable_runtime_flags(decorated_func):
"""Sets attributes from --benchmark_method_flags for method execution.
The @enable_runtime_flags decorator temporarily applies the flags passed in via
--benchmark_method_flags and runs the decorated function in that context.
A user can set --benchmark_method_flags=train_steps=5 to run the benchmark
method in the snippet below with FLAGS.train_steps=5 for debugging (without
modifying the benchmark code).
class ModelBenchmark():
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
# run benchmark ...
# report benchmark results ...
def benchmark_method(self):
FLAGS.train_steps = 1000
...
self._run_and_report_benchmark()
Args:
decorated_func: The method that runs the benchmark after earlier setup code
has already set some flags.
Returns:
new_func: The same method which executes in a temporary context where flag
overrides from --benchmark_method_flags are active.
"""
def runner(*args, **kwargs):
"""Creates a temporary context to activate --benchmark_method_flags."""
if FLAGS.benchmark_method_flags:
saved_flag_values = flagsaver.save_flag_values()
for key_value in FLAGS.benchmark_method_flags:
key, value = key_value.split('=', 1)
try:
numeric_float = float(value)
numeric_int = int(numeric_float)
if abs(numeric_int) == abs(numeric_float):
flag_value = numeric_int
else:
flag_value = numeric_float
except ValueError:
flag_value = value
logging.info('Setting --%s=%s', key, flag_value)
setattr(FLAGS, key, flag_value)
else:
saved_flag_values = None
try:
result = decorated_func(*args, **kwargs)
return result
finally:
if saved_flag_values:
flagsaver.restore_flag_values(saved_flag_values)
return runner
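As a sketch of the value coercion performed inside `runner` above, the stand-alone helper below mirrors that logic; `_coerce` is a hypothetical name used only for illustration and is not part of the module.

def _coerce(value):
  """Mirrors runner's int/float/string coercion (illustrative sketch only)."""
  try:
    numeric_float = float(value)
    numeric_int = int(numeric_float)
    # Integer-valued numbers become ints, everything else stays a float.
    return numeric_int if abs(numeric_int) == abs(numeric_float) else numeric_float
  except ValueError:
    # Non-numeric values are passed through as strings.
    return value

assert _coerce('10') == 10        # --benchmark_method_flags=train_steps=10
assert _coerce('2e-5') == 2e-05   # --benchmark_method_flags=learning_rate=2e-5
assert _coerce('fp16') == 'fp16'  # --benchmark_method_flags=dtype=fp16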
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes BERT benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import json
import math
import os
import time
# pylint: disable=g-bad-import-order
from absl import flags
from absl.testing import flagsaver
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.benchmark import bert_benchmark_utils as benchmark_utils
from official.benchmark import owner_utils
from official.nlp.bert import configs
from official.nlp.bert import run_classifier
from official.utils.misc import distribution_utils
from official.benchmark import benchmark_wrappers
# pylint: disable=line-too-long
PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_model.ckpt'
CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_train.tf_record'
CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_eval.tf_record'
CLASSIFIER_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_meta_data'
MODEL_CONFIG_FILE_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_config.json'
# pylint: enable=line-too-long
TMP_DIR = os.getenv('TMPDIR')
FLAGS = flags.FLAGS
class BertClassifyBenchmarkBase(benchmark_utils.BertBenchmarkBase):
"""Base class to hold methods common to test classes in the module."""
def __init__(self, output_dir=None, tpu=None):
super(BertClassifyBenchmarkBase, self).__init__(output_dir, tpu=tpu)
self.num_epochs = None
self.num_steps_per_epoch = None
FLAGS.steps_per_loop = 1
@flagsaver.flagsaver
def _run_bert_classifier(self, callbacks=None, use_ds=True):
"""Starts BERT classification task."""
with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
input_meta_data = json.loads(reader.read().decode('utf-8'))
bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
epochs = self.num_epochs if self.num_epochs else FLAGS.num_train_epochs
if self.num_steps_per_epoch:
steps_per_epoch = self.num_steps_per_epoch
else:
train_data_size = input_meta_data['train_data_size']
steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
warmup_steps = int(epochs * steps_per_epoch * 0.1)
eval_steps = int(
math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))
if self.tpu:
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy='tpu', tpu_address=self.tpu)
else:
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy='mirrored' if use_ds else 'off',
num_gpus=self.num_gpus)
max_seq_length = input_meta_data['max_seq_length']
train_input_fn = run_classifier.get_dataset_fn(
FLAGS.train_data_path,
max_seq_length,
FLAGS.train_batch_size,
is_training=True)
eval_input_fn = run_classifier.get_dataset_fn(
FLAGS.eval_data_path,
max_seq_length,
FLAGS.eval_batch_size,
is_training=False)
_, summary = run_classifier.run_bert_classifier(
strategy,
bert_config,
input_meta_data,
FLAGS.model_dir,
epochs,
steps_per_epoch,
FLAGS.steps_per_loop,
eval_steps,
warmup_steps,
FLAGS.learning_rate,
FLAGS.init_checkpoint,
train_input_fn,
eval_input_fn,
training_callbacks=False,
custom_callbacks=callbacks)
return summary
class BertClassifyBenchmarkReal(BertClassifyBenchmarkBase):
"""Short benchmark performance tests for BERT model.
Tests BERT classification performance in different GPU, TPU configurations.
The naming convention of the test cases below follows
`benchmark_(number of gpus)_gpu_(dataset type)` for GPUs and
`benchmark_(topology)_tpu_(dataset type)` for TPUs.
"""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
super(BertClassifyBenchmarkReal, self).__init__(
output_dir=output_dir, tpu=tpu)
self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
self.bert_config_file = MODEL_CONFIG_FILE_PATH
self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
# Since we only care about performance metrics, we limit
# the number of training steps and epochs to prevent unnecessarily
# long tests.
self.num_steps_per_epoch = 100
self.num_epochs = 1
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=0,
max_accuracy=1,
use_ds=True):
"""Starts BERT performance benchmark test."""
start_time_sec = time.time()
summary = self._run_bert_classifier(
callbacks=[self.timer_callback], use_ds=use_ds)
wall_time_sec = time.time() - start_time_sec
# Since we do not load from any pretrained checkpoints, we ignore all
# accuracy metrics.
summary.pop('eval_metrics', None)
summary['start_time_sec'] = start_time_sec
super(BertClassifyBenchmarkReal, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def benchmark_1_gpu_mrpc(self):
"""Test BERT model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_1_gpu_mrpc_xla(self):
"""Test BERT model performance with 1 GPU."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_xla')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
FLAGS.enable_xla = True
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_1_gpu_mrpc_no_dist_strat(self):
"""Test BERT model performance with 1 GPU, no distribution strategy."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_mrpc_no_dist_strat')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_mrpc(self):
"""Test BERT model performance with 8 GPUs."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_1_gpu_amp_mrpc_no_dist_strat(self):
"""Performance for 1 GPU no DS with automatic mixed precision."""
self._setup()
self.num_gpus = 1
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_amp_mrpc_no_dist_strat')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 4
FLAGS.eval_batch_size = 4
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
def benchmark_8_gpu_amp_mrpc(self):
"""Test BERT model performance with 8 GPUs with automatic mixed precision."""
self._setup()
self.num_gpus = 8
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 32
FLAGS.eval_batch_size = 32
FLAGS.dtype = 'fp16'
FLAGS.fp16_implementation = 'graph_rewrite'
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_mrpc(self):
"""Test BERT model performance with 2x2 TPU."""
self._setup()
FLAGS.steps_per_loop = 50
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mrpc')
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.train_batch_size = 32
FLAGS.eval_batch_size = 32
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path, use_ds=False)
class BertClassifyAccuracy(BertClassifyBenchmarkBase):
"""Short accuracy test for BERT model.
Tests BERT classification model accuracy. The naming
convention of the test cases below follows the
`benchmark_(number of gpus)_gpu_(dataset type)` format.
"""
def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
self.bert_config_file = MODEL_CONFIG_FILE_PATH
self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
super(BertClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self,
training_summary_path,
min_accuracy=0.84,
max_accuracy=0.88):
"""Starts BERT accuracy benchmark test."""
start_time_sec = time.time()
summary = self._run_bert_classifier(callbacks=[self.timer_callback])
wall_time_sec = time.time() - start_time_sec
super(BertClassifyAccuracy, self)._report_benchmark(
stats=summary,
wall_time_sec=wall_time_sec,
min_accuracy=min_accuracy,
max_accuracy=max_accuracy)
def _setup(self):
super(BertClassifyAccuracy, self)._setup()
FLAGS.train_data_path = self.train_data_path
FLAGS.eval_data_path = self.eval_data_path
FLAGS.input_meta_data_path = self.input_meta_data_path
FLAGS.bert_config_file = self.bert_config_file
FLAGS.init_checkpoint = self.pretrained_checkpoint_path
@owner_utils.Owner('tf-model-garden')
def benchmark_8_gpu_mrpc(self):
"""Run BERT model accuracy test with 8 GPUs.
Due to the comparatively small size of the MRPC dataset, the training
accuracy metric has high variance between runs. We therefore allow a
wide accuracy range (84% to 88%).
"""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
def benchmark_8_gpu_mrpc_xla(self):
"""Run BERT model accuracy test with 8 GPUs with XLA."""
self._setup()
FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_mrpc_xla')
FLAGS.enable_xla = True
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
@owner_utils.Owner('tf-model-garden')
def benchmark_2x2_tpu_mrpc(self):
"""Run BERT model accuracy test on 2x2 TPU."""
self._setup()
FLAGS.steps_per_loop = 50
FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_mrpc')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
self._run_and_report_benchmark(summary_path)
if __name__ == '__main__':
tf.test.main()
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utility functions or classes shared between BERT benchmarks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
# pylint: disable=g-bad-import-order
import numpy as np
from absl import flags
import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.utils.flags import core as flags_core
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
FLAGS = flags.FLAGS
class BenchmarkTimerCallback(tf.keras.callbacks.Callback):
"""Callback that records time it takes to run each batch."""
def __init__(self, num_batches_to_skip=10):
super(BenchmarkTimerCallback, self).__init__()
self.batch_start_times = {}
self.batch_stop_times = {}
def on_batch_begin(self, batch, logs=None):
self.batch_start_times[batch] = time.time()
def on_batch_end(self, batch, logs=None):
# If steps_per_loop is greater than 1, the batch index seen at on_batch_end
# will not match the index recorded at on_batch_begin. Use the last recorded
# starting index instead.
if batch not in self.batch_start_times:
batch = max(self.batch_start_times.keys())
self.batch_stop_times[batch] = time.time()
def get_examples_per_sec(self, batch_size, num_batches_to_skip=1):
batch_durations = []
for batch in self.batch_start_times:
if batch in self.batch_stop_times and batch >= num_batches_to_skip:
batch_durations.append(self.batch_stop_times[batch] -
self.batch_start_times[batch])
return batch_size / np.mean(batch_durations)
def get_startup_time(self, program_start_time):
return self.batch_start_times[0] - program_start_time
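# The following is a minimal, self-contained usage sketch of
# BenchmarkTimerCallback (an editorial illustration; `_demo_timer_callback` is
# a hypothetical helper and not part of the original module). It trains a tiny
# Keras model on synthetic data and prints the measured examples/sec; it is
# defined but never called, so module behavior is unchanged.
def _demo_timer_callback():
  model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  model.compile(optimizer='sgd', loss='mse')
  timer = BenchmarkTimerCallback()
  features = np.random.rand(640, 4).astype('float32')
  labels = np.random.rand(640, 1).astype('float32')
  # 640 examples / batch_size 32 -> 20 batches; batch 0 is skipped by default.
  model.fit(features, labels, batch_size=32, epochs=1, callbacks=[timer],
            verbose=0)
  print('examples/sec:', timer.get_examples_per_sec(batch_size=32))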
class BertBenchmarkBase(PerfZeroBenchmark):
"""Base class to hold methods common to test classes."""
local_flags = None
def __init__(self, output_dir=None, tpu=None, **kwargs):
super(BertBenchmarkBase, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
self.num_gpus = 8
self.timer_callback = None
def _setup(self):
"""Sets up and resets flags before each test."""
super(BertBenchmarkBase, self)._setup()
self.timer_callback = BenchmarkTimerCallback()
def _report_benchmark(self, stats, wall_time_sec, min_accuracy, max_accuracy):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from BERT models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds.
min_accuracy: Minimum classification accuracy constraint to verify
correctness of the model.
max_accuracy: Maximum classification accuracy constraint to verify
correctness of the model.
"""
metrics = [{
'name': 'training_loss',
'value': stats['train_loss'],
}]
if self.timer_callback:
metrics.append({
'name':
'exp_per_second',
'value':
self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size *
FLAGS.steps_per_loop)
})
else:
metrics.append({
'name': 'exp_per_second',
'value': 0.0,
})
if self.timer_callback and 'start_time_sec' in stats:
metrics.append({
'name': 'startup_time',
'value': self.timer_callback.get_startup_time(stats['start_time_sec'])
})
if 'eval_metrics' in stats:
metrics.append({
'name': 'eval_accuracy',
'value': stats['eval_metrics'],
'min_value': min_accuracy,
'max_value': max_accuracy,
})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=stats['total_training_steps'],
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
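For orientation, a hypothetical `stats` dict accepted by `_report_benchmark` above might look like the following; the keys are inferred from the code and the values are purely illustrative.

example_stats = {
    'train_loss': 0.35,              # reported as the training_loss metric
    'eval_metrics': 0.86,            # optional; range-checked as eval_accuracy
    'start_time_sec': 1596240000.0,  # optional; used to compute startup_time
    'total_training_steps': 100,     # reported as iters
}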
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes benchmark testing for bert pretraining."""
# pylint: disable=line-too-long
from __future__ import print_function
import json
import os
import time
from typing import Optional
from absl import flags
from absl import logging
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark import benchmark_wrappers
from official.benchmark import bert_benchmark_utils
from official.benchmark import owner_utils
from official.nlp.bert import run_pretraining
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
# Pretraining masked language modeling accuracy range:
MIN_MLM_ACCURACY = 0.635
MAX_MLM_ACCURACY = 0.645
# Pretrain next sentence prediction accuracy range:
MIN_NSP_ACCURACY = 0.94
MAX_NSP_ACCURACY = 0.96
BERT_PRETRAIN_FILES_SEQ128 = 'gs://mlcompass-data/bert/pretraining_data/seq_128/wikipedia.tfrecord*,gs://mlcompass-data/bert/pretraining_data/seq_128/books.tfrecord*'
BERT_BASE_CONFIG_FILE = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_config.json'
FLAGS = flags.FLAGS
class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
"""Benchmark accuracy tests for BERT Pretraining."""
def __init__(self,
output_dir: Optional[str] = None,
tpu: Optional[str] = None,
**kwargs):
"""Inits BertPretrainAccuracyBenchmark class.
Args:
output_dir: Directory where outputs, e.g. log files, are written.
tpu: TPU name to use in a TPU benchmark.
**kwargs: Additional keyword arguments.
"""
super(BertPretrainAccuracyBenchmark, self).__init__(
output_dir=output_dir, tpu=tpu, **kwargs)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self, summary_path: str, report_accuracy: bool):
"""Runs and reports the benchmark given the provided configuration."""
distribution = distribution_utils.get_distribution_strategy(
distribution_strategy='tpu', tpu_address=self.tpu)
logging.info('Flags: %s', flags_core.get_nondefault_flags_as_str())
start_time_sec = time.time()
run_pretraining.run_bert_pretrain(
strategy=distribution, custom_callbacks=self.timer_callback)
wall_time_sec = time.time() - start_time_sec
with tf.io.gfile.GFile(summary_path, 'rb') as reader:
summary = json.loads(reader.read().decode('utf-8'))
self._report_benchmark(summary, start_time_sec, wall_time_sec,
report_accuracy)
def _report_benchmark(self, summary, start_time_sec, wall_time_sec,
report_accuracy):
metrics = [{
'name': 'train_loss',
'value': summary['train_loss'],
}, {
'name':
'exp_per_second',
'value':
self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size *
FLAGS.steps_per_loop)
}, {
'name': 'startup_time',
'value': self.timer_callback.get_startup_time(start_time_sec)
}]
if report_accuracy:
metrics.extend([{
'name': 'masked_lm_accuracy',
'value': summary['masked_lm_accuracy'],
'min_value': MIN_MLM_ACCURACY,
'max_value': MAX_MLM_ACCURACY,
}, {
'name': 'next_sentence_accuracy',
'value': summary['next_sentence_accuracy'],
'min_value': MIN_NSP_ACCURACY,
'max_value': MAX_NSP_ACCURACY,
}])
self.report_benchmark(
iters=summary['total_training_steps'],
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_core.get_nondefault_flags_as_str()})
def _specify_common_flags(self):
FLAGS.bert_config_file = BERT_BASE_CONFIG_FILE
FLAGS.train_batch_size = 512
FLAGS.learning_rate = 1e-4
FLAGS.warmup_steps = 10000
FLAGS.steps_per_loop = 10000
FLAGS.distribution_strategy = 'tpu'
FLAGS.input_files = BERT_PRETRAIN_FILES_SEQ128
FLAGS.max_seq_length = 128
FLAGS.max_predictions_per_seq = 20
FLAGS.dtype = 'bf16'
@owner_utils.Owner('tf-model-garden')
def benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps(self):
"""Test bert pretraining with 8x8 TPU for 500k steps."""
# This is used for accuracy test.
self._setup()
self._specify_common_flags()
FLAGS.num_steps_per_epoch = 500000
FLAGS.num_train_epochs = 1
FLAGS.model_dir = self._get_model_dir(
'benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Set train_summary_interval to -1 to disable training summary, because
# writing summary to gcs may fail and summaries are not needed for this
# accuracy benchmark test.
FLAGS.train_summary_interval = -1
self._run_and_report_benchmark(summary_path=summary_path,
report_accuracy=True)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_2x2_tpu_bf16_seq128_10k_steps(self):
"""Test bert pretraining with 2x2 TPU for 10000 steps."""
self._setup()
self._specify_common_flags()
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.train_batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_2x2_tpu_bf16_seq128_10k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path, report_accuracy=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_2x2_tpu_bf16_seq128_10k_steps_mlir(self):
"""Test bert pretraining with 2x2 TPU with MLIR for 10000 steps."""
self._setup()
self._specify_common_flags()
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.train_batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_2x2_tpu_bf16_seq128_10k_steps_mlir')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
tf.config.experimental.enable_mlir_bridge()
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path, report_accuracy=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_4x4_tpu_bf16_seq128_10k_steps(self):
"""Test bert pretraining with 4x4 TPU for 10000 steps."""
self._setup()
self._specify_common_flags()
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_4x4_tpu_bf16_seq128_10k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path, report_accuracy=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_4x4_tpu_bf16_seq128_10k_steps_mlir(self):
"""Test bert pretraining with 4x4 TPU with MLIR for 10000 steps."""
self._setup()
self._specify_common_flags()
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_4x4_tpu_bf16_seq128_10k_steps_mlir')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
tf.config.experimental.enable_mlir_bridge()
# Disable accuracy check.
self._run_and_report_benchmark(
summary_path=summary_path, report_accuracy=False)
@owner_utils.Owner('tf-model-garden')
def benchmark_perf_8x8_tpu_bf16_seq128_10k_steps(self):
"""Test bert pretraining with 8x8 TPU for 10000 steps."""
self._setup()
self._specify_common_flags()
FLAGS.num_steps_per_epoch = 5000
FLAGS.num_train_epochs = 2
FLAGS.model_dir = self._get_model_dir(
'benchmark_perf_8x8_tpu_bf16_seq128_10k_steps')
summary_path = os.path.join(FLAGS.model_dir,
'summaries/training_summary.txt')
# Disable accuracy check.
self._run_and_report_benchmark(summary_path=summary_path,
report_accuracy=False)
if __name__ == '__main__':
tf.test.main()
[
{
"description": "The ID of the benchmark run, where this metric should tie to.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The name of the metric, which should be descriptive. E.g. training_loss, accuracy.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The unit of the metric. E.g. MB per sec.",
"mode": "NULLABLE",
"name": "unit",
"type": "STRING"
},
{
"description": "The value of the metric.",
"mode": "NULLABLE",
"name": "value",
"type": "FLOAT"
},
{
"description": "The timestamp when the metric is recorded.",
"mode": "REQUIRED",
"name": "timestamp",
"type": "TIMESTAMP"
},
{
"description": "The global step when this metric is recorded.",
"mode": "NULLABLE",
"name": "global_step",
"type": "INTEGER"
},
{
"description": "Free format metadata for the extra information about the metric.",
"mode": "REPEATED",
"name": "extras",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "name",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
]
}
]
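For reference, a hypothetical row conforming to the metric schema above could look like the following Python dict (all values are made up):

example_metric_row = {
    'run_id': '0f8fad5b-d9cb-469f-a165-70867728950e',
    'name': 'exp_per_second',
    'unit': 'examples per second',
    'value': 1234.5,
    'timestamp': '2020-08-01 00:00:00 UTC',
    'global_step': 10000,
    'extras': [{'name': 'flags', 'value': '--train_batch_size=32'}],
}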
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "model_id",
"type": "STRING"
},
{
"description": "The name of the model, E.g ResNet50, LeNet-5 etc.",
"mode": "REQUIRED",
"name": "model_name",
"type": "STRING"
},
{
"description": "The date when the test of the model is started",
"mode": "REQUIRED",
"name": "run_date",
"type": "TIMESTAMP"
},
{
"description": "The unique name for a test by the combination of key parameters, eg batch size, num of GPU, etc. It is hardware independent.",
"mode": "NULLABLE",
"name": "test_id",
"type": "STRING"
},
{
"description": "The tensorflow version information.",
"fields": [
{
"description": "Version of the tensorflow. E.g. 1.7.0-rc0",
"mode": "REQUIRED",
"name": "version",
"type": "STRING"
},
{
"description": "Git Hash of the tensorflow",
"mode": "NULLABLE",
"name": "git_hash",
"type": "STRING"
},
{
"description": "The channel of the tensorflow binary, eg, nightly, RC, final, custom.",
"mode": "NULLABLE",
"name": "channel",
"type": "STRING"
},
{
"description": "Identify anything special about the build, eg CUDA 10, NCCL, MKL, etc.",
"mode": "NULLABLE",
"name": "build_type",
"type": "STRING"
}
],
"mode": "REQUIRED",
"name": "tensorflow_version",
"type": "RECORD"
},
{
"description": "The arbitrary attribute of the model.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
},
{
"description": "Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "environment_variable",
"type": "RECORD"
},
{
"description": "TF Environment variables when the benchmark run is executed.",
"fields": [
{
"description": "The name of the variable.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the variable.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "tensorflow_environment_variables",
"type": "RECORD"
},
{
"description": "The list of parameters run with the model. It could contain hyperparameters or others.",
"fields": [
{
"description": "The name of the parameter.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The string value of the parameter.",
"mode": "NULLABLE",
"name": "string_value",
"type": "STRING"
},
{
"description": "The bool value of the parameter.",
"mode": "NULLABLE",
"name": "bool_value",
"type": "STRING"
},
{
"description": "The int/long value of the parameter.",
"mode": "NULLABLE",
"name": "long_value",
"type": "INTEGER"
},
{
"description": "The double/float value of parameter.",
"mode": "NULLABLE",
"name": "float_value",
"type": "FLOAT"
}
],
"mode": "REPEATED",
"name": "run_parameters",
"type": "RECORD"
},
{
"description": "The dataset that run with the benchmark.",
"mode": "NULLABLE",
"name": "dataset",
"type": "RECORD",
"fields": [
{
"description": "The name of the dataset that the model is trained/validated with. E.g ImageNet, mnist.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the dataset.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"description": "Used to differentiate from AWS, GCE or DGX-1 at a high level",
"mode": "NULLABLE",
"name": "test_environment",
"type": "STRING"
},
{
"description": "The machine configuration of the benchmark run.",
"mode": "NULLABLE",
"name": "machine_config",
"type": "RECORD",
"fields": [
{
"description": "The platform information of the benchmark run.",
"mode": "NULLABLE",
"name": "platform_info",
"type": "RECORD",
"fields": [
{
"description": "Eg: 64bit.",
"mode": "NULLABLE",
"name": "bits",
"type": "STRING"
},
{
"description": "Eg: ELF.",
"mode": "NULLABLE",
"name": "linkage",
"type": "STRING"
},
{
"description": "Eg: i386.",
"mode": "NULLABLE",
"name": "machine",
"type": "STRING"
},
{
"description": "Eg: 3.13.0-76-generic.",
"mode": "NULLABLE",
"name": "release",
"type": "STRING"
},
{
"description": "Eg: Linux.",
"mode": "NULLABLE",
"name": "system",
"type": "STRING"
},
{
"description": "Eg: #120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016.",
"mode": "NULLABLE",
"name": "version",
"type": "STRING"
}
]
},
{
"description": "The CPU information of the benchmark run.",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "num_cores",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "num_cores_allowed",
"type": "INTEGER"
},
{
"description" : "How fast are those CPUs.",
"mode": "NULLABLE",
"name": "mhz_per_cpu",
"type": "FLOAT"
},
{
"description" : "Additional CPU info, Eg: Intel Ivybridge with HyperThreading (24 cores).",
"mode": "NULLABLE",
"name": "cpu_info",
"type": "STRING"
},
{
"description" : "What kind of cpu scaling is enabled on the host. Eg performance, ondemand, conservative, mixed.",
"mode": "NULLABLE",
"name": "cpu_governor",
"type": "STRING"
},
{
"description": "Cache size of the CPUs.",
"mode": "NULLABLE",
"name": "cache_size",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "level",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "size",
"type": "INTEGER"
}
]
}
]
},
{
"mode": "NULLABLE",
"name": "gpu_info",
"type": "RECORD",
"fields": [
{
"mode": "NULLABLE",
"name": "count",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "model",
"type": "STRING"
},
{
"mode": "NULLABLE",
"name": "cuda_version",
"type": "STRING"
}
]
},
{
"description": "The cloud instance inforation if the benchmark run is executed on cloud",
"mode": "NULLABLE",
"name": "cloud_info",
"type": "RECORD",
"fields": [
{
"description": "The instance type, E.g. n1-standard-4.",
"mode": "NULLABLE",
"name": "instance_type",
"type": "STRING"
},
{
"description": "The arbitrary attribute of the cloud info.",
"fields": [
{
"description": "The name of the attribute.",
"mode": "REQUIRED",
"name": "name",
"type": "STRING"
},
{
"description": "The value of the attribute.",
"mode": "NULLABLE",
"name": "value",
"type": "STRING"
}
],
"mode": "REPEATED",
"name": "attribute",
"type": "RECORD"
}
]
},
{
"mode": "NULLABLE",
"name": "memory_total",
"type": "INTEGER"
},
{
"mode": "NULLABLE",
"name": "memory_available",
"type": "STRING"
}
]
}
]
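Similarly, a hypothetical (and deliberately partial) benchmark_run row for the schema above might look like this; every value is illustrative.

example_benchmark_run_row = {
    'model_id': '0f8fad5b-d9cb-469f-a165-70867728950e',
    'model_name': 'bert',
    'run_date': '2020-08-01 00:00:00 UTC',
    'test_id': 'benchmark_8_gpu_mrpc',
    'tensorflow_version': {'version': '2.3.0', 'channel': 'nightly'},
    'run_parameters': [{'name': 'train_batch_size', 'long_value': 32}],
    'machine_config': {'gpu_info': {'count': 8, 'model': 'V100',
                                    'cuda_version': '10.1'}},
    'test_environment': 'GCE',
}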
[
{
"description": "The UUID of the run for the benchmark.",
"mode": "REQUIRED",
"name": "run_id",
"type": "STRING"
},
{
"description": "The status of the run for the benchmark. Eg, running, failed, success",
"mode": "REQUIRED",
"name": "status",
"type": "STRING"
}
]
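And a hypothetical run status row for the schema above (values illustrative):

example_run_status_row = {
    'run_id': '0f8fad5b-d9cb-469f-a165-70867728950e',
    'status': 'success',
}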
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
from official.utils.flags import core as flags_core
class KerasBenchmark(PerfZeroBenchmark):
"""Base benchmark class with methods to simplify testing."""
def __init__(self,
output_dir=None,
default_flags=None,
flag_methods=None,
tpu=None):
super(KerasBenchmark, self).__init__(
output_dir=output_dir,
default_flags=default_flags,
flag_methods=flag_methods,
tpu=tpu)
def _report_benchmark(self,
stats,
wall_time_sec,
top_1_max=None,
top_1_min=None,
log_steps=None,
total_batch_size=None,
warmup=1,
start_time_sec=None):
"""Report benchmark results by writing to local protobuf file.
Args:
stats: dict returned from keras models with known entries.
wall_time_sec: the duration of the benchmark execution in seconds.
top_1_max: highest passing level for top_1 accuracy.
top_1_min: lowest passing level for top_1 accuracy.
log_steps: interval (in steps) at which entries in
stats['step_timestamp_log'] were recorded.
total_batch_size: Global batch size.
warmup: number of entries in stats['step_timestamp_log'] to ignore.
start_time_sec: the start time of the program in seconds since the epoch.
"""
metrics = []
if 'accuracy_top_1' in stats:
metrics.append({'name': 'accuracy_top_1',
'value': stats['accuracy_top_1'],
'min_value': top_1_min,
'max_value': top_1_max})
metrics.append({'name': 'top_1_train_accuracy',
'value': stats['training_accuracy_top_1']})
if (warmup and 'step_timestamp_log' in stats and
len(stats['step_timestamp_log']) > warmup):
# The first entry in time_log is the start of step 1. The remaining entries
# are the recorded end times of each logged step.
time_log = stats['step_timestamp_log']
elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
num_examples = (
total_batch_size * log_steps * (len(time_log) - warmup - 1))
examples_per_sec = num_examples / elapsed
metrics.append({'name': 'exp_per_second',
'value': examples_per_sec})
if 'avg_exp_per_second' in stats:
metrics.append({'name': 'avg_exp_per_second',
'value': stats['avg_exp_per_second']})
if start_time_sec and 'step_timestamp_log' in stats:
time_log = stats['step_timestamp_log']
# time_log[0] is recorded at the beginning of the first step.
startup_time = time_log[0].timestamp - start_time_sec
metrics.append({'name': 'startup_time', 'value': startup_time})
flags_str = flags_core.get_nondefault_flags_as_str()
self.report_benchmark(
iters=-1,
wall_time=wall_time_sec,
metrics=metrics,
extras={'flags': flags_str})
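To make the throughput computation in `_report_benchmark` above concrete, here is a worked example under assumed values (all numbers illustrative):

# total_batch_size = 128, log_steps = 100, warmup = 1
# stats['step_timestamp_log'] has 12 entries
# elapsed        = time_log[-1].timestamp - time_log[1].timestamp
# num_examples   = 128 * 100 * (12 - 1 - 1) = 128000
# exp_per_second = 128000 / elapsed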
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Executes Keras benchmarks and accuracy tests."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
from absl import flags
import tensorflow as tf # pylint: disable=g-bad-import-order
from official.benchmark import keras_benchmark
from official.benchmark import benchmark_wrappers
from official.benchmark.models import resnet_cifar_main
MIN_TOP_1_ACCURACY = 0.929
MAX_TOP_1_ACCURACY = 0.938
FLAGS = flags.FLAGS
CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
"""Accuracy tests for ResNet56 Keras CIFAR-10."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
"""A benchmark class.
Args:
output_dir: directory where outputs such as log files are written.
root_data_dir: directory under which to look for the dataset.
**kwargs: arbitrary named arguments. This is needed to make the
constructor forward compatible in case PerfZero provides more
named arguments before updating the constructor.
"""
self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasAccuracy, self).__init__(
output_dir=output_dir, flag_methods=flag_methods)
def _setup(self):
super(Resnet56KerasAccuracy, self)._setup()
FLAGS.use_tensor_lr = False
def benchmark_graph_1_gpu(self):
"""Test keras based model with Keras fit and distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_1_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
def benchmark_cpu(self):
"""Test keras based model on CPU."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat(self):
"""Test keras based model on CPU without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_run_eagerly(self):
"""Test keras based model on CPU w/forced eager and no dist_strat."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat(self):
"""Test keras based model with eager and no dist strat."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test keras based model w/forced eager and no dist_strat."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_graph_1_gpu_no_dist_strat(self):
"""Test keras based model with Keras fit but not distribution strategies."""
self._setup()
FLAGS.distribution_strategy = 'off'
FLAGS.num_gpus = 1
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test keras based model with eager and distribution strategies."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
self._run_and_report_benchmark()
def benchmark_graph_2_gpu(self):
"""Test keras based model with Keras fit and distribution strategies."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.data_dir = self.data_dir
FLAGS.batch_size = 128
FLAGS.train_epochs = 182
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.dtype = 'fp32'
self._run_and_report_benchmark()
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasAccuracy, self)._report_benchmark(
stats,
wall_time_sec,
top_1_min=MIN_TOP_1_ACCURACY,
top_1_max=MAX_TOP_1_ACCURACY,
total_batch_size=FLAGS.batch_size,
log_steps=100)
class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
"""Short performance tests for ResNet56 via Keras and CIFAR-10."""
def __init__(self, output_dir=None, default_flags=None):
flag_methods = [resnet_cifar_main.define_cifar_flags]
super(Resnet56KerasBenchmarkBase, self).__init__(
output_dir=output_dir,
flag_methods=flag_methods,
default_flags=default_flags)
@benchmark_wrappers.enable_runtime_flags
def _run_and_report_benchmark(self):
start_time_sec = time.time()
stats = resnet_cifar_main.run(FLAGS)
wall_time_sec = time.time() - start_time_sec
super(Resnet56KerasBenchmarkBase, self)._report_benchmark(
stats,
wall_time_sec,
total_batch_size=FLAGS.batch_size,
log_steps=FLAGS.log_steps)
def benchmark_1_gpu(self):
"""Test 1 gpu."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_xla(self):
"""Test 1 gpu with xla enabled."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.enable_xla = True
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_graph_1_gpu(self):
"""Test 1 gpu graph."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'one_device'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat(self):
"""Test 1 gpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_graph_1_gpu_no_dist_strat(self):
"""Test 1 gpu graph mode without distribution strategies."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.enable_eager = False
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
FLAGS.batch_size = 128
self._run_and_report_benchmark()
def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
"""Test 1 gpu without distribution strategy and forced eager."""
self._setup()
FLAGS.num_gpus = 1
FLAGS.batch_size = 128
FLAGS.model_dir = self._get_model_dir(
'benchmark_1_gpu_no_dist_strat_run_eagerly')
FLAGS.dtype = 'fp32'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.distribution_strategy = 'off'
self._run_and_report_benchmark()
def benchmark_2_gpu(self):
"""Test 2 gpu."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = True
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
def benchmark_graph_2_gpu(self):
"""Test 2 gpu graph mode."""
self._setup()
FLAGS.num_gpus = 2
FLAGS.enable_eager = False
FLAGS.run_eagerly = False
FLAGS.distribution_strategy = 'mirrored'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
FLAGS.batch_size = 128 * 2 # 2 GPUs
self._run_and_report_benchmark()
def benchmark_cpu(self):
"""Test cpu."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_graph_cpu(self):
"""Test cpu graph mode."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = False
FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat_run_eagerly(self):
"""Test cpu without distribution strategy and forced eager."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.distribution_strategy = 'off'
FLAGS.enable_eager = True
FLAGS.run_eagerly = True
FLAGS.model_dir = self._get_model_dir(
'benchmark_cpu_no_dist_strat_run_eagerly')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_cpu_no_dist_strat(self):
"""Test cpu without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = True
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
def benchmark_graph_cpu_no_dist_strat(self):
"""Test cpu graph mode without distribution strategies."""
self._setup()
FLAGS.num_gpus = 0
FLAGS.enable_eager = False
FLAGS.distribution_strategy = 'off'
FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu_no_dist_strat')
FLAGS.batch_size = 128
FLAGS.data_format = 'channels_last'
self._run_and_report_benchmark()
class Resnet56KerasBenchmarkSynth(Resnet56KerasBenchmarkBase):
"""Synthetic benchmarks for ResNet56 and Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
default_flags = {}
default_flags['skip_eval'] = True
default_flags['use_synthetic_data'] = True
default_flags['train_steps'] = 110
default_flags['log_steps'] = 10
default_flags['use_tensor_lr'] = False
super(Resnet56KerasBenchmarkSynth, self).__init__(
output_dir=output_dir, default_flags=default_flags)
class Resnet56KerasBenchmarkReal(Resnet56KerasBenchmarkBase):
"""Real data benchmarks for ResNet56 and Keras."""
def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
default_flags = {}
default_flags['skip_eval'] = True
default_flags['data_dir'] = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
default_flags['train_steps'] = 110
default_flags['log_steps'] = 10
default_flags['use_tensor_lr'] = False
super(Resnet56KerasBenchmarkReal, self).__init__(
output_dir=output_dir, default_flags=default_flags)
if __name__ == '__main__':
tf.test.main()
# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Provides utilities to Cifar-10 dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from absl import logging
import tensorflow as tf
from official.vision.image_classification.resnet import imagenet_preprocessing
HEIGHT = 32
WIDTH = 32
NUM_CHANNELS = 3
_DEFAULT_IMAGE_BYTES = HEIGHT * WIDTH * NUM_CHANNELS
# The record is the image plus a one-byte label
_RECORD_BYTES = _DEFAULT_IMAGE_BYTES + 1
# TODO(tobyboyd): Change to best practice 45K(train)/5K(val)/10K(test) splits.
NUM_IMAGES = {
'train': 50000,
'validation': 10000,
}
_NUM_DATA_FILES = 5
NUM_CLASSES = 10
def parse_record(raw_record, is_training, dtype):
"""Parses a record containing a training example of an image.
The input record is parsed into a label and image, and the image is passed
through preprocessing steps (cropping, flipping, and so on).
This method converts the label to one hot to fit the loss function.
Args:
raw_record: scalar Tensor tf.string containing a serialized
Example protocol buffer.
is_training: A boolean denoting whether the input is for training.
dtype: Data type to use for input images.
Returns:
Tuple with processed image tensor and one-hot-encoded label tensor.
"""
# Convert bytes to a vector of uint8 that is record_bytes long.
record_vector = tf.io.decode_raw(raw_record, tf.uint8)
# The first byte represents the label, which we convert from uint8 to int32
# and then to one-hot.
label = tf.cast(record_vector[0], tf.int32)
# The remaining bytes after the label represent the image, which we reshape
# from [depth * height * width] to [depth, height, width].
depth_major = tf.reshape(record_vector[1:_RECORD_BYTES],
[NUM_CHANNELS, HEIGHT, WIDTH])
# Convert from [depth, height, width] to [height, width, depth], and cast as
# float32.
image = tf.cast(tf.transpose(a=depth_major, perm=[1, 2, 0]), tf.float32)
image = preprocess_image(image, is_training)
image = tf.cast(image, dtype)
return image, label
def preprocess_image(image, is_training):
"""Preprocess a single image of layout [height, width, depth]."""
if is_training:
# Resize the image to add four extra pixels on each side.
image = tf.image.resize_with_crop_or_pad(
image, HEIGHT + 8, WIDTH + 8)
# Randomly crop a [HEIGHT, WIDTH] section of the image.
image = tf.image.random_crop(image, [HEIGHT, WIDTH, NUM_CHANNELS])
# Randomly flip the image horizontally.
image = tf.image.random_flip_left_right(image)
# Subtract off the mean and divide by the variance of the pixels.
image = tf.image.per_image_standardization(image)
return image
def get_filenames(is_training, data_dir):
"""Returns a list of filenames."""
assert tf.io.gfile.exists(data_dir), (
'Run cifar10_download_and_extract.py first to download and extract the '
'CIFAR-10 data.')
if is_training:
return [
os.path.join(data_dir, 'data_batch_%d.bin' % i)
for i in range(1, _NUM_DATA_FILES + 1)
]
else:
return [os.path.join(data_dir, 'test_batch.bin')]
def input_fn(is_training,
data_dir,
batch_size,
dtype=tf.float32,
datasets_num_private_threads=None,
parse_record_fn=parse_record,
input_context=None,
drop_remainder=False):
"""Input function which provides batches for train or eval.
Args:
is_training: A boolean denoting whether the input is for training.
data_dir: The directory containing the input data.
batch_size: The number of samples per batch.
dtype: Data type to use for images/features
datasets_num_private_threads: Number of private threads for tf.data.
parse_record_fn: Function to use for parsing the records.
input_context: A `tf.distribute.InputContext` object passed in by
`tf.distribute.Strategy`.
drop_remainder: A boolean indicating whether to drop the remainder of the
batches. If True, the batch dimension will be static.
Returns:
A dataset that can be used for iteration.
"""
filenames = get_filenames(is_training, data_dir)
dataset = tf.data.FixedLengthRecordDataset(filenames, _RECORD_BYTES)
if input_context:
logging.info(
'Sharding the dataset: input_pipeline_id=%d num_input_pipelines=%d',
input_context.input_pipeline_id, input_context.num_input_pipelines)
dataset = dataset.shard(input_context.num_input_pipelines,
input_context.input_pipeline_id)
return imagenet_preprocessing.process_record_dataset(
dataset=dataset,
is_training=is_training,
batch_size=batch_size,
shuffle_buffer=NUM_IMAGES['train'],
parse_record_fn=parse_record_fn,
dtype=dtype,
datasets_num_private_threads=datasets_num_private_threads,
drop_remainder=drop_remainder
)
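A minimal usage sketch of `input_fn` above; the data path below is hypothetical and assumes the extracted CIFAR-10 binaries are already present.

train_dataset = input_fn(
    is_training=True,
    data_dir='/tmp/cifar-10-batches-bin',  # hypothetical local path
    batch_size=128)
images, labels = next(iter(train_dataset))  # one batch of preprocessed images and labels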
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Runs a ResNet model on the Cifar-10 dataset."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app
from absl import flags
from absl import logging
import numpy as np
import tensorflow as tf
from official.benchmark.models import cifar_preprocessing
from official.benchmark.models import resnet_cifar_model
from official.benchmark.models import synthetic_util
from official.utils.flags import core as flags_core
from official.utils.misc import distribution_utils
from official.utils.misc import keras_utils
from official.vision.image_classification.resnet import common
LR_SCHEDULE = [ # (multiplier, epoch to start) tuples
(0.1, 91), (0.01, 136), (0.001, 182)
]
def learning_rate_schedule(current_epoch,
current_batch,
batches_per_epoch,
batch_size):
"""Handles linear scaling rule and LR decay.
Scale learning rate at epoch boundaries provided in LR_SCHEDULE by the
provided scaling factor.
Args:
current_epoch: integer, current epoch indexed from 0.
current_batch: integer, current batch in the current epoch, indexed from 0.
batches_per_epoch: integer, number of steps in an epoch.
    batch_size: integer, total batch size.
Returns:
Adjusted learning rate.
"""
del current_batch, batches_per_epoch # not used
initial_learning_rate = common.BASE_LEARNING_RATE * batch_size / 128
learning_rate = initial_learning_rate
for mult, start_epoch in LR_SCHEDULE:
if current_epoch >= start_epoch:
learning_rate = initial_learning_rate * mult
else:
break
return learning_rate
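# --- Illustrative sketch (not part of the original file) ----------------------
# Worked example of the schedule above: the linear scaling rule multiplies the
# base rate by batch_size / 128, and each LR_SCHEDULE boundary then scales it by
# the listed multiplier (e.g. 10x smaller from epoch 91 onwards).
def _example_lr_values(batch_size=128):
  """Returns the learning rate at a few representative epochs."""
  return {epoch: learning_rate_schedule(epoch, 0, 0, batch_size)
          for epoch in (0, 90, 91, 136, 182)}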
class LearningRateBatchScheduler(tf.keras.callbacks.Callback):
"""Callback to update learning rate on every batch (not epoch boundaries).
  N.B. Only supports Keras optimizers, not TF optimizers.
Attributes:
schedule: a function that takes an epoch index and a batch index as input
(both integer, indexed from 0) and returns a new learning rate as
output (float).
"""
def __init__(self, schedule, batch_size, steps_per_epoch):
super(LearningRateBatchScheduler, self).__init__()
self.schedule = schedule
self.steps_per_epoch = steps_per_epoch
self.batch_size = batch_size
self.epochs = -1
self.prev_lr = -1
def on_epoch_begin(self, epoch, logs=None):
if not hasattr(self.model.optimizer, 'learning_rate'):
raise ValueError('Optimizer must have a "learning_rate" attribute.')
self.epochs += 1
def on_batch_begin(self, batch, logs=None):
"""Executes before step begins."""
lr = self.schedule(self.epochs,
batch,
self.steps_per_epoch,
self.batch_size)
if not isinstance(lr, (float, np.float32, np.float64)):
raise ValueError('The output of the "schedule" function should be float.')
if lr != self.prev_lr:
self.model.optimizer.learning_rate = lr # lr should be a float here
self.prev_lr = lr
logging.debug(
'Epoch %05d Batch %05d: LearningRateBatchScheduler '
'change learning rate to %s.', self.epochs, batch, lr)
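# --- Illustrative sketch (not part of the original file) ----------------------
# A hypothetical wiring example: the scheduler is attached like any other Keras
# callback, so the learning rate is recomputed at the start of every batch.
def _example_fit_with_lr_callback(model, train_dataset, steps_per_epoch,
                                  batch_size=128, epochs=1):
  """Trains a compiled Keras model with per-batch learning-rate updates."""
  lr_callback = LearningRateBatchScheduler(
      schedule=learning_rate_schedule,
      batch_size=batch_size,
      steps_per_epoch=steps_per_epoch)
  return model.fit(train_dataset,
                   epochs=epochs,
                   steps_per_epoch=steps_per_epoch,
                   callbacks=[lr_callback])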
def run(flags_obj):
"""Run ResNet Cifar-10 training and eval loop using native Keras APIs.
Args:
flags_obj: An object containing parsed flag values.
Raises:
ValueError: If fp16 is passed as it is not currently supported.
Returns:
Dictionary of training and eval stats.
"""
keras_utils.set_session_config(
enable_xla=flags_obj.enable_xla)
# Execute flag override logic for better model performance
if flags_obj.tf_gpu_thread_mode:
keras_utils.set_gpu_thread_mode_and_count(
per_gpu_thread_count=flags_obj.per_gpu_thread_count,
gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
num_gpus=flags_obj.num_gpus,
datasets_num_private_threads=flags_obj.datasets_num_private_threads)
common.set_cudnn_batchnorm_mode()
dtype = flags_core.get_tf_dtype(flags_obj)
if dtype == 'fp16':
raise ValueError('dtype fp16 is not supported in Keras. Use the default '
                     'value (fp32).')
data_format = flags_obj.data_format
if data_format is None:
data_format = ('channels_first' if tf.config.list_physical_devices('GPU')
else 'channels_last')
tf.keras.backend.set_image_data_format(data_format)
strategy = distribution_utils.get_distribution_strategy(
distribution_strategy=flags_obj.distribution_strategy,
num_gpus=flags_obj.num_gpus,
all_reduce_alg=flags_obj.all_reduce_alg,
num_packs=flags_obj.num_packs)
if strategy:
    # flags_obj.enable_get_next_as_optional controls whether to enable the
    # get_next_as_optional behavior in DistributedIterator. If true, the last
    # partial batch can be supported.
strategy.extended.experimental_enable_get_next_as_optional = (
flags_obj.enable_get_next_as_optional
)
strategy_scope = distribution_utils.get_strategy_scope(strategy)
if flags_obj.use_synthetic_data:
synthetic_util.set_up_synthetic_data()
input_fn = common.get_synth_input_fn(
height=cifar_preprocessing.HEIGHT,
width=cifar_preprocessing.WIDTH,
num_channels=cifar_preprocessing.NUM_CHANNELS,
num_classes=cifar_preprocessing.NUM_CLASSES,
dtype=flags_core.get_tf_dtype(flags_obj),
drop_remainder=True)
else:
synthetic_util.undo_set_up_synthetic_data()
input_fn = cifar_preprocessing.input_fn
train_input_dataset = input_fn(
is_training=True,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=cifar_preprocessing.parse_record,
datasets_num_private_threads=flags_obj.datasets_num_private_threads,
dtype=dtype,
      # Set drop_remainder to avoid the partial-batch logic in the normalization
      # layer, which triggers tf.where and leads to extra memory copies between
      # host and GPU.
drop_remainder=(not flags_obj.enable_get_next_as_optional))
eval_input_dataset = None
if not flags_obj.skip_eval:
eval_input_dataset = input_fn(
is_training=False,
data_dir=flags_obj.data_dir,
batch_size=flags_obj.batch_size,
parse_record_fn=cifar_preprocessing.parse_record)
steps_per_epoch = (
cifar_preprocessing.NUM_IMAGES['train'] // flags_obj.batch_size)
lr_schedule = 0.1
if flags_obj.use_tensor_lr:
initial_learning_rate = common.BASE_LEARNING_RATE * flags_obj.batch_size / 128
lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
boundaries=list(p[1] * steps_per_epoch for p in LR_SCHEDULE),
values=[initial_learning_rate] +
list(p[0] * initial_learning_rate for p in LR_SCHEDULE))
with strategy_scope:
optimizer = common.get_optimizer(lr_schedule)
model = resnet_cifar_model.resnet56(classes=cifar_preprocessing.NUM_CLASSES)
model.compile(
loss='sparse_categorical_crossentropy',
optimizer=optimizer,
metrics=(['sparse_categorical_accuracy']
if flags_obj.report_accuracy_metrics else None),
run_eagerly=flags_obj.run_eagerly)
train_epochs = flags_obj.train_epochs
callbacks = common.get_callbacks()
if not flags_obj.use_tensor_lr:
lr_callback = LearningRateBatchScheduler(
schedule=learning_rate_schedule,
batch_size=flags_obj.batch_size,
steps_per_epoch=steps_per_epoch)
callbacks.append(lr_callback)
  # If multiple epochs, ignore the train_steps flag.
if train_epochs <= 1 and flags_obj.train_steps:
steps_per_epoch = min(flags_obj.train_steps, steps_per_epoch)
train_epochs = 1
num_eval_steps = (cifar_preprocessing.NUM_IMAGES['validation'] //
flags_obj.batch_size)
validation_data = eval_input_dataset
if flags_obj.skip_eval:
if flags_obj.set_learning_phase_to_train:
# TODO(haoyuzhang): Understand slowdown of setting learning phase when
# not using distribution strategy.
tf.keras.backend.set_learning_phase(1)
num_eval_steps = None
validation_data = None
if not strategy and flags_obj.explicit_gpu_placement:
# TODO(b/135607227): Add device scope automatically in Keras training loop
    # when not using distribution strategy.
no_dist_strat_device = tf.device('/device:GPU:0')
no_dist_strat_device.__enter__()
history = model.fit(train_input_dataset,
epochs=train_epochs,
steps_per_epoch=steps_per_epoch,
callbacks=callbacks,
validation_steps=num_eval_steps,
validation_data=validation_data,
validation_freq=flags_obj.epochs_between_evals,
verbose=2)
eval_output = None
if not flags_obj.skip_eval:
eval_output = model.evaluate(eval_input_dataset,
steps=num_eval_steps,
verbose=2)
if not strategy and flags_obj.explicit_gpu_placement:
no_dist_strat_device.__exit__()
stats = common.build_stats(history, eval_output, callbacks)
return stats
def define_cifar_flags():
common.define_keras_flags(dynamic_loss_scale=False)
flags_core.set_defaults(data_dir='/tmp/cifar10_data/cifar-10-batches-bin',
model_dir='/tmp/cifar10_model',
epochs_between_evals=10,
batch_size=128)
def main(_):
return run(flags.FLAGS)
if __name__ == '__main__':
logging.set_verbosity(logging.INFO)
define_cifar_flags()
app.run(main)
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""ResNet56 model for Keras adapted from tf.keras.applications.ResNet50.
# Reference:
- [Deep Residual Learning for Image Recognition](
https://arxiv.org/abs/1512.03385)
Adapted from code contributed by BigMoyan.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import functools
import tensorflow as tf
from tensorflow.python.keras import backend
from tensorflow.python.keras import initializers
from tensorflow.python.keras import layers
from tensorflow.python.keras import regularizers
BATCH_NORM_DECAY = 0.997
BATCH_NORM_EPSILON = 1e-5
L2_WEIGHT_DECAY = 2e-4
def identity_building_block(input_tensor,
kernel_size,
filters,
stage,
block,
training=None):
"""The identity block is the block that has no conv layer at shortcut.
Arguments:
input_tensor: input tensor
    kernel_size: default 3, the kernel size of
        the conv layers in the main path
    filters: list of integers, the filters of the conv layers in the main path
stage: integer, current stage label, used for generating layer names
block: current block label, used for generating layer names
    training: Only used if training a Keras model with Estimator. In other
      scenarios it is handled automatically.
Returns:
Output tensor for the block.
"""
filters1, filters2 = filters
if backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(filters1, kernel_size,
padding='same', use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
name=conv_name_base + '2a')(input_tensor)
x = layers.BatchNormalization(
axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(x, training=training)
x = layers.Activation('relu')(x)
x = layers.Conv2D(filters2, kernel_size,
padding='same', use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
name=conv_name_base + '2b')(x)
x = layers.BatchNormalization(
axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(x, training=training)
x = layers.add([x, input_tensor])
x = layers.Activation('relu')(x)
return x
def conv_building_block(input_tensor,
kernel_size,
filters,
stage,
block,
strides=(2, 2),
training=None):
"""A block that has a conv layer at shortcut.
Arguments:
input_tensor: input tensor
    kernel_size: default 3, the kernel size of
        the conv layers in the main path
    filters: list of integers, the filters of the conv layers in the main path
stage: integer, current stage label, used for generating layer names
block: current block label, used for generating layer names
strides: Strides for the first conv layer in the block.
    training: Only used if training a Keras model with Estimator. In other
      scenarios it is handled automatically.
Returns:
Output tensor for the block.
  Note that from stage 3, the first conv layer in the main path has
  strides=(2, 2), and the shortcut has strides=(2, 2) as well.
"""
filters1, filters2 = filters
if tf.keras.backend.image_data_format() == 'channels_last':
bn_axis = 3
else:
bn_axis = 1
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = layers.Conv2D(filters1, kernel_size, strides=strides,
padding='same', use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
name=conv_name_base + '2a')(input_tensor)
x = layers.BatchNormalization(
axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2a')(x, training=training)
x = layers.Activation('relu')(x)
x = layers.Conv2D(filters2, kernel_size, padding='same', use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
name=conv_name_base + '2b')(x)
x = layers.BatchNormalization(
axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '2b')(x, training=training)
shortcut = layers.Conv2D(filters2, (1, 1), strides=strides, use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
name=conv_name_base + '1')(input_tensor)
shortcut = layers.BatchNormalization(
axis=bn_axis, momentum=BATCH_NORM_DECAY, epsilon=BATCH_NORM_EPSILON,
name=bn_name_base + '1')(shortcut, training=training)
x = layers.add([x, shortcut])
x = layers.Activation('relu')(x)
return x
def resnet_block(input_tensor,
size,
kernel_size,
filters,
stage,
conv_strides=(2, 2),
training=None):
"""A block which applies conv followed by multiple identity blocks.
Arguments:
input_tensor: input tensor
size: integer, number of constituent conv/identity building blocks.
A conv block is applied once, followed by (size - 1) identity blocks.
    kernel_size: default 3, the kernel size of
        the conv layers in the main path
    filters: list of integers, the filters of the conv layers in the main path
stage: integer, current stage label, used for generating layer names
conv_strides: Strides for the first conv layer in the block.
    training: Only used if training a Keras model with Estimator. In other
      scenarios it is handled automatically.
Returns:
Output tensor after applying conv and identity blocks.
"""
x = conv_building_block(input_tensor, kernel_size, filters, stage=stage,
strides=conv_strides, block='block_0',
training=training)
for i in range(size - 1):
x = identity_building_block(x, kernel_size, filters, stage=stage,
block='block_%d' % (i + 1), training=training)
return x
def resnet(num_blocks, classes=10, training=None):
"""Instantiates the ResNet architecture.
Arguments:
    num_blocks: integer, the number of conv/identity building blocks in each
      stage. The ResNet contains 3 stages, each consisting of one conv block
      followed by (num_blocks - 1) identity blocks. Each conv/identity block
      has 2 convolutional layers. With the input convolutional layer and the
      final dense layer, this brings the total depth of the network to
      (6 * num_blocks + 2).
    classes: optional number of classes to classify images into.
    training: Only used if training a Keras model with Estimator. In other
      scenarios it is handled automatically.
Returns:
A Keras model instance.
"""
input_shape = (32, 32, 3)
img_input = layers.Input(shape=input_shape)
if backend.image_data_format() == 'channels_first':
x = layers.Lambda(lambda x: backend.permute_dimensions(x, (0, 3, 1, 2)),
name='transpose')(img_input)
bn_axis = 1
  else:  # channels_last
x = img_input
bn_axis = 3
x = layers.ZeroPadding2D(padding=(1, 1), name='conv1_pad')(x)
x = layers.Conv2D(16, (3, 3),
strides=(1, 1),
padding='valid', use_bias=False,
kernel_initializer='he_normal',
kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
name='conv1')(x)
x = layers.BatchNormalization(axis=bn_axis,
momentum=BATCH_NORM_DECAY,
epsilon=BATCH_NORM_EPSILON,
name='bn_conv1',)(x, training=training)
x = layers.Activation('relu')(x)
x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[16, 16],
stage=2, conv_strides=(1, 1), training=training)
x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[32, 32],
stage=3, conv_strides=(2, 2), training=training)
x = resnet_block(x, size=num_blocks, kernel_size=3, filters=[64, 64],
stage=4, conv_strides=(2, 2), training=training)
rm_axes = [1, 2] if backend.image_data_format() == 'channels_last' else [2, 3]
x = layers.Lambda(lambda x: backend.mean(x, rm_axes), name='reduce_mean')(x)
x = layers.Dense(classes,
activation='softmax',
kernel_initializer=initializers.RandomNormal(stddev=0.01),
kernel_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
bias_regularizer=regularizers.l2(L2_WEIGHT_DECAY),
name='fc10')(x)
inputs = img_input
# Create model.
model = tf.keras.models.Model(inputs, x, name='resnet56')
return model
resnet20 = functools.partial(resnet, num_blocks=3)
resnet32 = functools.partial(resnet, num_blocks=5)
resnet56 = functools.partial(resnet, num_blocks=9)
resnet110 = functools.partial(resnet, num_blocks=18)
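# --- Illustrative sketch (not part of the original file) ----------------------
# The depth formula is 6 * num_blocks + 2 (two convs per building block, three
# stages, plus the input conv and the final dense layer), so num_blocks=9 gives
# ResNet-56. A hypothetical smoke test:
def _example_build_resnet56():
  """Builds the CIFAR-10 ResNet-56 and prints its layer summary."""
  model = resnet56(classes=10)
  model.summary()
  return model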
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Test the keras ResNet model with ImageNet data on TPU."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import tensorflow as tf
from official.benchmark.models import resnet_imagenet_main
from official.utils.testing import integration
from official.vision.image_classification.resnet import imagenet_preprocessing
class KerasImagenetTest(tf.test.TestCase, parameterized.TestCase):
"""Unit tests for Keras Models with ImageNet."""
_extra_flags_dict = {
"resnet": [
"-batch_size", "4",
"-train_steps", "1",
"-use_synthetic_data", "true"
"-model", "resnet50_v1.5",
"-optimizer", "resnet50_default",
],
"resnet_polynomial_decay": [
"-batch_size", "4",
"-train_steps", "1",
"-use_synthetic_data", "true",
"-model", "resnet50_v1.5",
"-optimizer", "resnet50_default",
"-pruning_method", "polynomial_decay",
],
}
_tempdir = None
@classmethod
def setUpClass(cls): # pylint: disable=invalid-name
super(KerasImagenetTest, cls).setUpClass()
resnet_imagenet_main.define_imagenet_keras_flags()
def setUp(self):
super(KerasImagenetTest, self).setUp()
imagenet_preprocessing.NUM_IMAGES["validation"] = 4
self.policy = \
tf.keras.mixed_precision.experimental.global_policy()
def tearDown(self):
super(KerasImagenetTest, self).tearDown()
tf.io.gfile.rmtree(self.get_temp_dir())
tf.keras.mixed_precision.experimental.set_policy(self.policy)
@parameterized.parameters([
"resnet",
# "resnet_polynomial_decay" b/151854314
])
def test_end_to_end_tpu(self, flags_key):
"""Test Keras model with TPU distribution strategy."""
extra_flags = [
"-distribution_strategy", "tpu",
"-data_format", "channels_last",
"-enable_checkpoint_and_export", "1",
]
extra_flags = extra_flags + self._extra_flags_dict[flags_key]
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
@parameterized.parameters(["resnet"])
def test_end_to_end_tpu_bf16(self, flags_key):
"""Test Keras model with TPU and bfloat16 activation."""
extra_flags = [
"-distribution_strategy", "tpu",
"-data_format", "channels_last",
"-dtype", "bf16",
]
extra_flags = extra_flags + self._extra_flags_dict[flags_key]
integration.run_synthetic(
main=resnet_imagenet_main.run,
tmp_root=self.get_temp_dir(),
extra_flags=extra_flags
)
if __name__ == "__main__":
tf.test.main()