Merge pull request #1 from tensorflow/master

new pull

Merge pull request #1 from tensorflow/master
new pull
f16a7b5b · vedanshu · GitHub · 8e9296ff · 8f58f396 · f16a7b5b
Unverified Commit f16a7b5b authored May 04, 2021 by vedanshu Committed by GitHub May 04, 2021
20 changed files
--- a/.github/bot_config.yml
+++ b/.github/bot_config.yml
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# Configuration for the repository's GitHub bot.
+
+# A list of assignees
+assignees:
+   - saikumarchalla
+   - ravikyram
--- a/.github/scripts/pylint.sh
+++ b/.github/scripts/pylint.sh
+#!/bin/bash
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Pylint wrapper extracted from main TensorFlow, sharing same exceptions.
+# Specify --incremental to only check files touched since last commit on master,
+# otherwise will recursively check current directory (full repo takes long!).
+
+set -euo pipefail
+
+# Download latest configs from main TensorFlow repo.
+wget -q -O /tmp/pylintrc https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/tools/ci_build/pylintrc
+
+SCRIPT_DIR=/tmp
+
+# Print the number of CPUs on stdout.
+# Counts the "processor" lines of /proc/cpuinfo when that file exists and
+# otherwise asks getconf for _NPROCESSORS_ONLN.
+# Returns 1, after writing an error to stderr, if no count could be obtained.
+num_cpus() {
+  if [[ -f /proc/cpuinfo ]]; then
+    N_CPUS=$(grep -c ^processor /proc/cpuinfo)
+  else
+    # Fallback for systems without /proc/cpuinfo.
+    N_CPUS=$(getconf _NPROCESSORS_ONLN)
+  fi
+  if [[ -z "${N_CPUS}" ]]; then
+    # This standalone script defines no `die` helper, so report the problem
+    # and fail explicitly instead of calling an unknown command.
+    echo "ERROR: Unable to determine the number of CPUs" >&2
+    return 1
+  fi
+
+  echo "${N_CPUS}"
+}
+
+# Print the paths of files that differ between the working tree and the merge
+# base of master and the currently checked-out commit.
+get_changed_files_in_last_non_merge_git_commit() {
+  # Use HEAD rather than `git branch --show-current`: the latter prints nothing
+  # on a detached HEAD (e.g. a CI checkout of a specific commit SHA), which
+  # would leave `git merge-base` with a single argument and make it fail.
+  git diff --name-only "$(git merge-base master HEAD)"
+}
+
+# List Python files changed in the last non-merge git commit that still exist,
+# i.e., not removed.
+# Usage: get_py_files_to_check [--incremental]
+get_py_files_to_check() {
+  if [[ "$1" == "--incremental" ]]; then
+    CHANGED_PY_FILES=$(get_changed_files_in_last_non_merge_git_commit | \
+                       grep '.*\.py$')
+
+    # Do not include files removed in the last non-merge commit.
+    PY_FILES=""
+    for PY_FILE in ${CHANGED_PY_FILES}; do
+      if [[ -f "${PY_FILE}" ]]; then
+        PY_FILES="${PY_FILES} ${PY_FILE}"
+      fi
+    done
+
+    echo "${PY_FILES}"
+  else
+    find . -name '*.py'
+  fi
+}
+
+# Run pylint over Python sources and report the enforced message types.
+# Usage: do_pylint [--incremental]
+#   --incremental  lint only the files printed by
+#                  get_py_files_to_check --incremental
+# Returns 0 when there is nothing to lint or no enforced message is found.
+# Returns 1 on invalid arguments, when pylint or its rc file is unavailable,
+# or when enforced messages are found.
+do_pylint() {
+  if [[ $# == 1 ]] && [[ "$1" == "--incremental" ]]; then
+    PYTHON_SRC_FILES=$(get_py_files_to_check --incremental)
+
+    if [[ -z "${PYTHON_SRC_FILES}" ]]; then
+      echo "do_pylint will NOT run due to --incremental flag and due to the "\
+"absence of Python code changes in the last commit."
+      return 0
+    fi
+  elif [[ $# != 0 ]]; then
+    echo "Invalid syntax for invoking do_pylint"
+    echo "Usage: do_pylint [--incremental]"
+    return 1
+  else
+    PYTHON_SRC_FILES=$(get_py_files_to_check)
+  fi
+
+  # No Python files were found at all, so there is nothing to lint.
+  if [[ -z ${PYTHON_SRC_FILES} ]]; then
+    echo "do_pylint found no Python files to check. Returning."
+    return 0
+  fi
+
+  # Now that we know we have to do work, check if `pylint` is installed
+  PYLINT_BIN="python3.8 -m pylint"
+
+  echo ""
+  echo "check whether pylint is available or not."
+  echo ""
+  # Test the command inside the `if` itself: under `set -e` a failing
+  # standalone command would abort the script before `$?` could be inspected.
+  if ${PYLINT_BIN} --version; then
+    echo ""
+    echo "pylint available, proceeding with pylint sanity check."
+    echo ""
+  else
+    echo ""
+    echo "pylint not available."
+    echo ""
+    return 1
+  fi
+
+  # Configure pylint using the following file
+  PYLINTRC_FILE="${SCRIPT_DIR}/pylintrc"
+
+  if [[ ! -f "${PYLINTRC_FILE}" ]]; then
+    # This standalone script defines no `die` helper; report and fail here.
+    echo "ERROR: Cannot find pylint rc file at ${PYLINTRC_FILE}" >&2
+    return 1
+  fi
+
+  # Run pylint in parallel, after some disk setup
+  NUM_SRC_FILES=$(echo ${PYTHON_SRC_FILES} | wc -w)
+  NUM_CPUS=$(num_cpus)
+
+  echo "Running pylint on ${NUM_SRC_FILES} files with ${NUM_CPUS} "\
+"parallel jobs..."
+  echo ""
+
+  PYLINT_START_TIME=$(date +'%s')
+  # Use the files mktemp creates directly; appending a suffix to its output
+  # would leave the files it created behind, unused.
+  OUTPUT_FILE="$(mktemp)"
+  ERRORS_FILE="$(mktemp)"
+
+  # grep exits non-zero when nothing matches, so errexit is suspended here.
+  set +e
+  # When running, filter to only contain the error code lines. Removes module
+  # header, removes lines of context that show up from some lines.
+  # Also, don't redirect stderr as this would hide pylint fatal errors.
+  # PYTHON_SRC_FILES is deliberately unquoted so it splits into one argument
+  # per file.
+  ${PYLINT_BIN} --rcfile="${PYLINTRC_FILE}" --output-format=parseable \
+      --jobs=${NUM_CPUS} ${PYTHON_SRC_FILES} | grep '\[[CEFW]' > "${OUTPUT_FILE}"
+  PYLINT_END_TIME=$(date +'%s')
+
+  echo ""
+  echo "pylint took $((PYLINT_END_TIME - PYLINT_START_TIME)) s"
+  echo ""
+
+  # Report only what we care about
+  # Ref https://pylint.readthedocs.io/en/latest/technical_reference/features.html
+  # E: all errors
+  # W0311 bad-indentation
+  # W0312 mixed-indentation
+  # C0330 bad-continuation
+  # C0301 line-too-long
+  # C0326 bad-whitespace
+  # W0611 unused-import
+  # W0622 redefined-builtin
+  grep -E '(\[E|\[W0311|\[W0312|\[C0330|\[C0301|\[C0326|\[W0611|\[W0622)' "${OUTPUT_FILE}" > "${ERRORS_FILE}"
+
+  # Determine counts of errors
+  N_FORBID_ERRORS=$(wc -l "${ERRORS_FILE}" | cut -d' ' -f1)
+  set -e
+
+  # Now, print the errors we should fix
+  echo ""
+  if [[ ${N_FORBID_ERRORS} != 0 ]]; then
+    echo "Found ${N_FORBID_ERRORS} pylint errors:"
+    cat "${ERRORS_FILE}"
+  fi
+
+  # The logs have been printed; remove the temporary files.
+  rm -f "${OUTPUT_FILE}" "${ERRORS_FILE}"
+
+  echo ""
+  if [[ ${N_FORBID_ERRORS} != 0 ]]; then
+    echo "FAIL: Found ${N_FORBID_ERRORS} errors"
+    return 1
+  else
+    echo "PASS: Found no errors"
+  fi
+}
+
+do_pylint "$@"
+
--- a/.github/stale.yml
+++ b/.github/stale.yml
+    # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ============================================================================
+    #
+    # Configuration for automatically marking and closing stale issues.
+
+# Number of days of inactivity before an Issue or Pull Request becomes stale
+daysUntilStale: 7
+# Number of days of inactivity before a stale Issue or Pull Request is closed
+daysUntilClose: 7
+# Only issues or pull requests with all of these labels are checked if stale. Defaults to `[]` (disabled)
+onlyLabels:
+ - stat:awaiting response
+# Comment to post when marking as stale. Set to `false` to disable
+markComment: >
+  This issue has been automatically marked as stale because it has not had
+  recent activity. It will be closed if no further activity occurs. Thank you.
+# Comment to post when removing the stale label. Set to `false` to disable
+unmarkComment: false
+closeComment: >
+  Closing as stale. Please reopen if you'd like to work on this further.
+limitPerRun: 30
+# Limit to only `issues` or `pulls`
+only: issues
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
+name: CI
+on: pull_request
+
+jobs:
+  pylint:
+    runs-on: ubuntu-latest
+    
+    steps:
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      
+      - name: Install pylint 2.4.4
+        run: |
+          python -m pip install --upgrade pip
+          pip install pylint==2.4.4
+        
+      - name: Checkout code
+        uses: actions/checkout@v2
+        with:
+          ref: ${{ github.event.pull_request.head.sha }}
+          fetch-depth: 0
+      
+      - name: Fetch master for diff
+        run: git fetch origin master:master
+       
+      - name: Run pylint script
+        run: bash ./.github/scripts/pylint.sh --incremental
--- a/CODEOWNERS
+++ b/CODEOWNERS
 * @tensorflow/tf-garden-team @tensorflow/tf-model-garden-team
 /official/ @rachellj218 @saberkun @jaeyounkim
-/official/nlp/ @saberkun @chenGitHuber @lehougoogle @rachellj218
-/official/vision/ @pengchongjin @xianzhidu @yeqingli @arashwan @saberkun @rachellj218
-/research/adv_imagenet_models/ @alexeykurakin
-/research/adversarial_crypto/ @dave-andersen
-/research/adversarial_logit_pairing/ @alexeykurakin
+/official/nlp/ @saberkun @chenGitHuber @lehougoogle @rachellj218 @jaeyounkim
+/official/vision/ @xianzhidu @yeqingli @arashwan @saberkun @rachellj218 @jaeyounkim
+/official/vision/beta/projects/assemblenet/ @mryoo
+/official/vision/beta/projects/deepmac_maskrcnn/ @vighneshbirodkar
+/official/vision/beta/projects/simclr/ @luotigerlsx @chentingpc @saxenasaurabh
 /research/adversarial_text/ @rsepassi @a-dai
 /research/attention_ocr/ @xavigibert
 /research/audioset/ @plakal @dpwe
 /research/autoaugment/* @barretzoph
-/research/autoencoders/ @snurkabill
-/research/brain_coder/ @danabo
-/research/cognitive_mapping_and_planning/ @s-gupta
-/research/compression/ @nmjohn
+/research/cognitive_planning/ @s-gupta
 /research/cvt_text/ @clarkkev @lmthang
-/research/deep_contextual_bandits/ @rikel
 /research/deep_speech/ @yhliang2018
 /research/deeplab/ @aquariusjay @yknzhu @gpapan
 /research/delf/ @andrefaraujo
-/research/domain_adaptation/ @bousmalis @dmrd
 /research/efficient-hrl/ @ofirnachum
-/research/feelvos/ @pvoigtlaender @yuningchai @aquariusjay
-/research/fivo/ @dieterichlawson
-/research/global_objectives/ @mackeya-google
-/research/im2txt/ @cshallue
-/research/inception/ @shlens @vincentvanhoucke
-/research/keypointnet/ @mnorouzi
-/research/learned_optimizer/ @olganw @nirum
-/research/learning_to_remember_rare_events/ @lukaszkaiser @ofirnachum
-/research/learning_unsupervised_learning/ @lukemetz @nirum
-/research/lexnet_nc/ @vered1986 @waterson
 /research/lfads/ @jazcollins @sussillo
-/research/lm_1b/ @oriolvinyals @panyx0718
-/research/lm_commonsense/ @thtrieu
 /research/lstm_object_detection/ @yinxiaoli @yongzhe2160
 /research/marco/ @vincentvanhoucke
-/research/maskgan/ @liamb315 @a-dai
-/research/namignizer/ @knathanieltucker
-/research/neural_gpu/ @lukaszkaiser
-/research/neural_programmer/ @arvind2505
-/research/next_frame_prediction/ @panyx0718
 /research/object_detection/ @jch1 @tombstone @pkulzc
 /research/pcl_rl/ @ofirnachum
-/research/ptn/ @xcyan @arkanath @hellojas @honglaklee
-/research/qa_kg/ @yuyuz
-/research/real_nvp/ @laurent-dinh
 /research/rebar/ @gjtucker
-/research/sentiment_analysis/ @sculd
-/research/seq2species/ @apbusia @depristo
-/research/skip_thoughts/ @cshallue
+/research/seq_flow_lite/ @thunderfyc
 /research/slim/ @sguada @marksandler2
-/research/steve/ @buckman-google
-/research/street/ @theraysmith
-/research/struct2depth/ @aneliaangelova
-/research/swivel/ @waterson
-/research/tcn/ @coreylynch @sermanet
-/research/textsum/ @panyx0718 @peterjliu
-/research/transformer/ @daviddao
 /research/vid2depth/ @rezama
-/research/video_prediction/ @cbfinn
--- a/README.md
+++ b/README.md
@@ -14,17 +14,6 @@ can take full advantage of TensorFlow for their research and product development

 ## [Announcements](https://github.com/tensorflow/models/wiki/Announcements)

-| Date | News |
-|------|------|
-| July 10, 2020 | TensorFlow 2 meets the [Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection) ([Blog](https://blog.tensorflow.org/2020/07/tensorflow-2-meets-object-detection-api.html)) |
-| June 30, 2020 | [SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization](https://github.com/tensorflow/models/tree/master/official/vision/detection#train-a-spinenet-49-based-mask-r-cnn) released ([Tweet](https://twitter.com/GoogleAI/status/1278016712978264064)) |
-| June 17, 2020 | [Context R-CNN: Long Term Temporal Context for Per-Camera Object Detection](https://github.com/tensorflow/models/tree/master/research/object_detection#june-17th-2020) released ([Tweet](https://twitter.com/GoogleAI/status/1276571419422253057)) |
-| May 21, 2020 | [Unifying Deep Local and Global Features for Image Search (DELG)](https://github.com/tensorflow/models/tree/master/research/delf#delg) code released |
-| May 19, 2020 | [MobileDets: Searching for Object Detection Architectures for Mobile Accelerators](https://github.com/tensorflow/models/tree/master/research/object_detection#may-19th-2020) released |
-| May 7, 2020 | [MnasFPN with MobileNet-V2 backbone](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md#mobile-models) released for object detection |
-| May 1, 2020 | [DELF: DEep Local Features](https://github.com/tensorflow/models/tree/master/research/delf) updated to support TensorFlow 2.1 |
-| March 31, 2020 | [Introducing the Model Garden for TensorFlow 2](https://blog.tensorflow.org/2020/03/introducing-model-garden-for-tensorflow-2.html) ([Tweet](https://twitter.com/TensorFlow/status/1245029834633297921)) |
-
 ## Contributions

 [![help wanted:paper implementation](https://img.shields.io/github/issues/tensorflow/models/help%20wanted%3Apaper%20implementation)](https://github.com/tensorflow/models/labels/help%20wanted%3Apaper%20implementation)

--- a/community/README.md
+++ b/community/README.md
@@ -19,6 +19,15 @@ This repository provides a curated list of the GitHub repositories with machine
 | [ResNet 101](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/resnet101) | [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385) | • Int8 Inference<br/>• FP32 Inference | [Intel](https://github.com/IntelAI) |
 | [ResNet 50](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/resnet50) | [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385) | • Int8 Inference<br/>• FP32 Inference | [Intel](https://github.com/IntelAI) |
 | [ResNet 50v1.5](https://github.com/IntelAI/models/tree/master/benchmarks/image_recognition/tensorflow/resnet50v1_5) | [Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385) | • Int8 Inference<br/>• FP32 Inference<br/>• FP32 Training | [Intel](https://github.com/IntelAI) |
+| [EfficientNet](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Classification/ConvNets/efficientnet) | [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/pdf/1905.11946.pdf) | • Automatic mixed precision<br/>• Horovod Multi-GPU training (NCCL)<br/>• Multi-node training on a Pyxis/Enroot Slurm cluster<br/>• XLA | [NVIDIA](https://github.com/NVIDIA) |
+
+### Object Detection
+
+| Model | Paper | Features | Maintainer |
+|-------|-------|----------|------------|
+| [R-FCN](https://github.com/IntelAI/models/tree/master/benchmarks/object_detection/tensorflow/rfcn) | [R-FCN: Object Detection<br/>via Region-based Fully Convolutional Networks](https://arxiv.org/pdf/1605.06409) | • Int8 Inference<br/>• FP32 Inference | [Intel](https://github.com/IntelAI) |
+| [SSD-MobileNet](https://github.com/IntelAI/models/tree/master/benchmarks/object_detection/tensorflow/ssd-mobilenet) | [MobileNets: Efficient Convolutional Neural Networks<br/>for Mobile Vision Applications](https://arxiv.org/pdf/1704.04861) | • Int8 Inference<br/>• FP32 Inference | [Intel](https://github.com/IntelAI) |
+| [SSD-ResNet34](https://github.com/IntelAI/models/tree/master/benchmarks/object_detection/tensorflow/ssd-resnet34) | [SSD: Single Shot MultiBox Detector](https://arxiv.org/pdf/1512.02325) | • Int8 Inference<br/>• FP32 Inference<br/>• FP32 Training | [Intel](https://github.com/IntelAI) |

 ### Segmentation

@@ -27,6 +36,25 @@ This repository provides a curated list of the GitHub repositories with machine
 | [Mask R-CNN](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN) | [Mask R-CNN](https://arxiv.org/abs/1703.06870) | • Automatic Mixed Precision<br/>• Multi-GPU training support with Horovod<br/>• TensorRT | [NVIDIA](https://github.com/NVIDIA) |
 | [U-Net Medical Image Segmentation](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/UNet_Medical) | [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) | • Automatic Mixed Precision<br/>• Multi-GPU training support with Horovod<br/>• TensorRT | [NVIDIA](https://github.com/NVIDIA) |

+## Natural Language Processing
+
+| Model | Paper | Features | Maintainer |
+|-------|-------|----------|------------|
+| [BERT](https://github.com/IntelAI/models/tree/master/benchmarks/language_modeling/tensorflow/bert_large) | [BERT: Pre-training of Deep Bidirectional Transformers<br/>for Language Understanding](https://arxiv.org/pdf/1810.04805) | • FP32 Inference<br/>• FP32 Training | [Intel](https://github.com/IntelAI) |
+| [BERT](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/LanguageModeling/BERT) | [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805) | • Horovod Multi-GPU<br/>• Multi-node with Horovod and Pyxis/Enroot Slurm cluster<br/>• XLA<br/>• Automatic mixed precision<br/>• LAMB | [NVIDIA](https://github.com/NVIDIA) |
+| [ELECTRA](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/LanguageModeling/ELECTRA) | [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/forum?id=r1xMH1BtvB) | • Automatic Mixed Precision<br/>• Multi-GPU training support with Horovod<br/>• Multi-node training on a Pyxis/Enroot Slurm cluster | [NVIDIA](https://github.com/NVIDIA) |
+| [GNMT](https://github.com/IntelAI/models/tree/master/benchmarks/language_translation/tensorflow/mlperf_gnmt) | [Google’s Neural Machine Translation System:<br/>Bridging the Gap between Human and Machine Translation](https://arxiv.org/pdf/1609.08144) | • FP32 Inference | [Intel](https://github.com/IntelAI) |
+| [Transformer-LT (Official)](https://github.com/IntelAI/models/tree/master/benchmarks/language_translation/tensorflow/transformer_lt_official) | [Attention Is All You Need](https://arxiv.org/pdf/1706.03762) | • FP32 Inference | [Intel](https://github.com/IntelAI) |
+| [Transformer-LT (MLPerf)](https://github.com/IntelAI/models/tree/master/benchmarks/language_translation/tensorflow/transformer_mlperf) | [Attention Is All You Need](https://arxiv.org/pdf/1706.03762) | • FP32 Training | [Intel](https://github.com/IntelAI) |
+
+## Recommendation Systems
+
+| Model | Paper | Features | Maintainer |
+|-------|-------|----------|------------|
+| [Wide & Deep](https://github.com/IntelAI/models/tree/master/benchmarks/recommendation/tensorflow/wide_deep_large_ds) | [Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792) | • FP32 Inference<br/>• FP32 Training | [Intel](https://github.com/IntelAI) |
+| [Wide & Deep](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/WideAndDeep) | [Wide & Deep Learning for Recommender Systems](https://arxiv.org/pdf/1606.07792) | • Automatic mixed precision<br/>• Multi-GPU training support with Horovod<br/>• XLA | [NVIDIA](https://github.com/NVIDIA) |
+| [DLRM](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Recommendation/DLRM) | [Deep Learning Recommendation Model for Personalization and Recommendation Systems](https://arxiv.org/pdf/1906.00091.pdf) | • Automatic Mixed Precision<br/>• Hybrid-parallel multiGPU training using Horovod all2all<br/>• Multinode training for Pyxis/Enroot Slurm clusters<br/>• XLA<br/>• Criteo dataset preprocessing with Spark on GPU | [NVIDIA](https://github.com/NVIDIA) |
+
 ## Contributions

 If you want to contribute, please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
--- a/official/README-TPU.md
+++ b/official/README-TPU.md
@@ -23,3 +23,7 @@
    be used to classify ImageNet's dataset of 1000 classes.
    See [Training ResNet on Cloud TPU](https://cloud.google.com/tpu/docs/tutorials/resnet-2.x) tutorial and [Tensorboard.dev metrics](https://tensorboard.dev/experiment/CxlDK8YMRrSpYEGtBRpOhg).
 *   [retinanet](vision/detection): A fast and powerful object detector. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/b8NRnWU3TqG6Rw0UxueU6Q).
+*   [shapemask](vision/detection): An object detection and instance segmentation model using shape priors. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/ZbXgVoc6Rf6mBRlPj0JpLA).
+
+## Recommendation
+*   [ncf](recommendation): Neural Collaborative Filtering. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/0k3gKjZlR1ewkVTRyLB6IQ).
--- a/official/README.md
+++ b/official/README.md
@@ -19,7 +19,7 @@ In the near future, we will add:

 * State-of-the-art language understanding models.
 * State-of-the-art image classification models.
-* State-of-the-art objection detection and instance segmentation models.
+* State-of-the-art object detection and instance segmentation models.

 ## Table of Contents

@@ -41,6 +41,7 @@ In the near future, we will add:
 |-------|-------------------|
 | [MNIST](vision/image_classification) | A basic model to classify digits from the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) |
 | [ResNet](vision/image_classification) | [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) |
+| [ResNet-RS](vision/beta/MODEL_GARDEN.md) | [Revisiting ResNets: Improved Training and Scaling Strategies](https://arxiv.org/abs/2103.07579) |
 | [EfficientNet](vision/image_classification) | [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) |

 #### Object Detection and Segmentation
@@ -61,6 +62,7 @@ In the near future, we will add:
 | [NHNet (News Headline generation model)](nlp/nhnet) | [Generating Representative Headlines for News Stories](https://arxiv.org/abs/2001.09386) |
 | [Transformer](nlp/transformer) | [Attention Is All You Need](https://arxiv.org/abs/1706.03762) |
 | [XLNet](nlp/xlnet) | [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) |
+| [MobileBERT](nlp/projects/mobilebert) | [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) |

 ### Recommendation

@@ -98,17 +100,30 @@ pip3 install tf-nightly

 #### Method 1: Install the TensorFlow Model Garden pip package

-**tf-models-nightly** is the nightly Model Garden package
-created daily automatically. pip will install all models
-and dependencies automatically.
+**tf-models-official** is the stable Model Garden package.
+pip will install all models and dependencies automatically.

 ```shell
-pip install tf-models-nightly
+pip install tf-models-official
+```
+
+If you are using NLP packages, please also install **tensorflow-text**:
+
+```shell
+pip install tensorflow-text
 ```

 Please check out our [example](colab/fine_tuning_bert.ipynb)
 to learn how to use a PIP package.

+Note that **tf-models-official** may not include the latest changes in this
+GitHub repo. To include the latest changes, you may install
+**tf-models-nightly**, which is the nightly Model Garden package created
+automatically every day.
+
+```shell
+pip install tf-models-nightly
+```
+
 #### Method 2: Clone the source

 1. Clone the GitHub repository:
@@ -136,6 +151,27 @@ os.environ['PYTHONPATH'] += ":/path/to/models"
 pip3 install --user -r official/requirements.txt
 ```

+Finally, if you are using NLP packages, please also install
+**tensorflow-text-nightly**:
+
+```shell
+pip3 install tensorflow-text-nightly
+```
+
 ## Contributions

 If you want to contribute, please review the [contribution guidelines](https://github.com/tensorflow/models/wiki/How-to-contribute).
+
+## Citing TF Official Model Garden
+
+To cite this repository:
+
+```
+@software{tfmodels2020github,
+  author = {Chen Chen and Xianzhi Du and Le Hou and Jaeyoun Kim and Jing Li and
+  Yeqing Li and Abdullah Rashwan and Fan Yang and Hongkun Yu},
+  title = {TensorFlow Official Model Garden},
+  url = {https://github.com/tensorflow/models/tree/master/official},
+  year = {2020},
+}
+```
--- a/official/benchmark/benchmark_wrappers.py
+++ b/official/benchmark/benchmark_wrappers.py
-# Lint as: python3
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utils to annotate and trace benchmarks."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl import flags
-from absl import logging
-from absl.testing import flagsaver
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_multi_string(
-    'benchmark_method_flags', None,
-    'Optional list of runtime flags of the form key=value. Specify '
-    'multiple times to specify different flags. These will override the FLAGS '
-    'object directly after hardcoded settings in individual benchmark methods '
-    'before they call _run_and_report benchmark. Example if we set '
-    '--benchmark_method_flags=train_steps=10 and a benchmark method hardcodes '
-    'FLAGS.train_steps=10000 and later calls _run_and_report_benchmark, '
-    'it\'ll only run for 10 steps. This is useful for '
-    'debugging/profiling workflows.')
-
-
-def enable_runtime_flags(decorated_func):
-  """Sets attributes from --benchmark_method_flags for method execution.
-
-  @enable_runtime_flags decorator temporarily adds flags passed in via
-  --benchmark_method_flags and runs the decorated function in that context.
-
-  A user can set --benchmark_method_flags=train_steps=5 to run the benchmark
-  method in the snippet below with FLAGS.train_steps=5 for debugging (without
-  modifying the benchmark code).
-
-  class ModelBenchmark():
-
-    @benchmark_wrappers.enable_runtime_flags
-    def _run_and_report_benchmark(self):
-      # run benchmark ...
-      # report benchmark results ...
-
-    def benchmark_method(self):
-      FLAGS.train_steps = 1000
-      ...
-      self._run_and_report_benchmark()
-
-  Args:
-    decorated_func: The method that runs the benchmark after previous setup
-      execution that set some flags.
-
-  Returns:
-    new_func: The same method which executes in a temporary context where flag
-      overrides from --benchmark_method_flags are active.
-  """
-
-  def runner(*args, **kwargs):
-    """Creates a temporary context to activate --benchmark_method_flags."""
-    if FLAGS.benchmark_method_flags:
-      saved_flag_values = flagsaver.save_flag_values()
-      for key_value in FLAGS.benchmark_method_flags:
-        key, value = key_value.split('=', 1)
-        try:
-          numeric_float = float(value)
-          numeric_int = int(numeric_float)
-          if abs(numeric_int) == abs(numeric_float):
-            flag_value = numeric_int
-          else:
-            flag_value = numeric_float
-        except ValueError:
-          flag_value = value
-        logging.info('Setting --%s=%s', key, flag_value)
-        setattr(FLAGS, key, flag_value)
-    else:
-      saved_flag_values = None
-    try:
-      result = decorated_func(*args, **kwargs)
-      return result
-    finally:
-      if saved_flag_values:
-        flagsaver.restore_flag_values(saved_flag_values)
-
-  return runner
--- a/official/benchmark/bert_benchmark.py
+++ b/official/benchmark/bert_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes BERT benchmarks and accuracy tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import functools
-import json
-import math
-import os
-import time
-
-# pylint: disable=g-bad-import-order
-from absl import flags
-from absl.testing import flagsaver
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.benchmark import bert_benchmark_utils as benchmark_utils
-from official.benchmark import owner_utils
-from official.nlp.bert import configs
-from official.nlp.bert import run_classifier
-from official.utils.misc import distribution_utils
-from official.benchmark import benchmark_wrappers
-
-# pylint: disable=line-too-long
-PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_model.ckpt'
-CLASSIFIER_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_train.tf_record'
-CLASSIFIER_EVAL_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_eval.tf_record'
-CLASSIFIER_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/classification/mrpc_meta_data'
-MODEL_CONFIG_FILE_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_config.json'
-# pylint: enable=line-too-long
-
-TMP_DIR = os.getenv('TMPDIR')
-FLAGS = flags.FLAGS
-
-
-class BertClassifyBenchmarkBase(benchmark_utils.BertBenchmarkBase):
-  """Common plumbing shared by the BERT classification benchmark classes."""
-
-  def __init__(self, output_dir=None, tpu=None):
-    super(BertClassifyBenchmarkBase, self).__init__(output_dir, tpu=tpu)
-    # Optional caps on training length. While they are falsy,
-    # `_run_bert_classifier` derives the values from flags and input metadata.
-    self.num_steps_per_epoch = None
-    self.num_epochs = None
-    FLAGS.steps_per_loop = 1
-
-  @flagsaver.flagsaver
-  def _run_bert_classifier(self, callbacks=None, use_ds=True):
-    """Runs the BERT classifier configured through FLAGS.
-
-    Args:
-      callbacks: Forwarded to `run_classifier.run_bert_classifier` as
-        `custom_callbacks`.
-      use_ds: Only consulted when no TPU is configured; selects the
-        'mirrored' strategy when true and 'off' otherwise.
-
-    Returns:
-      The second element of the pair returned by
-      `run_classifier.run_bert_classifier`.
-    """
-    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as meta_file:
-      input_meta_data = json.loads(meta_file.read().decode('utf-8'))
-
-    bert_config = configs.BertConfig.from_json_file(FLAGS.bert_config_file)
-
-    # Instance-level overrides win over the flag / metadata derived values.
-    epochs = self.num_epochs or FLAGS.num_train_epochs
-    steps_per_epoch = self.num_steps_per_epoch or int(
-        input_meta_data['train_data_size'] / FLAGS.train_batch_size)
-    warmup_steps = int(epochs * steps_per_epoch * 0.1)
-    eval_steps = int(
-        math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))
-
-    # A configured TPU always takes precedence over the GPU strategies.
-    if self.tpu:
-      strategy_kwargs = dict(
-          distribution_strategy='tpu', tpu_address=self.tpu)
-    else:
-      strategy_kwargs = dict(
-          distribution_strategy='mirrored' if use_ds else 'off',
-          num_gpus=self.num_gpus)
-    strategy = distribution_utils.get_distribution_strategy(**strategy_kwargs)
-
-    max_seq_length = input_meta_data['max_seq_length']
-
-    def _dataset_fn(data_path, batch_size, is_training):
-      # Same call for both splits; only path, batch size and mode differ.
-      return run_classifier.get_dataset_fn(
-          data_path, max_seq_length, batch_size, is_training=is_training)
-
-    train_input_fn = _dataset_fn(
-        FLAGS.train_data_path, FLAGS.train_batch_size, True)
-    eval_input_fn = _dataset_fn(
-        FLAGS.eval_data_path, FLAGS.eval_batch_size, False)
-
-    _unused_first, training_summary = run_classifier.run_bert_classifier(
-        strategy,
-        bert_config,
-        input_meta_data,
-        FLAGS.model_dir,
-        epochs,
-        steps_per_epoch,
-        FLAGS.steps_per_loop,
-        eval_steps,
-        warmup_steps,
-        FLAGS.learning_rate,
-        FLAGS.init_checkpoint,
-        train_input_fn,
-        eval_input_fn,
-        training_callbacks=False,
-        custom_callbacks=callbacks)
-    return training_summary
-
-
-class BertClassifyBenchmarkReal(BertClassifyBenchmarkBase):
-  """Short benchmark performance tests for BERT model.
-
-  Tests BERT classification performance in different GPU, TPU configurations.
-  The naming convention of below test cases follow
-  `benchmark_(number of gpus)_gpu_(dataset type)` for GPUs and
-  `benchmark_(topology)_tpu_(dataset type)` for TPUs.
-  """
-
-  def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
-    """Initialises the benchmark with the fixed MRPC inputs.
-
-    Args:
-      output_dir: Passed to the parent initialiser.
-      tpu: Passed to the parent initialiser.
-      **kwargs: Accepted for call compatibility; not forwarded.
-    """
-    super(BertClassifyBenchmarkReal, self).__init__(
-        output_dir=output_dir, tpu=tpu)
-
-    self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
-    self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
-    self.bert_config_file = MODEL_CONFIG_FILE_PATH
-    self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
-
-    # Since we only care about performance metrics, we limit
-    # the number of training steps and epochs to prevent unnecessarily
-    # long tests.
-    self.num_steps_per_epoch = 100
-    self.num_epochs = 1
-
-  def _configure_flags(self, test_name, batch_size=None):
-    """Points FLAGS at the MRPC inputs for one benchmark.
-
-    Args:
-      test_name: Name handed to `_get_model_dir` to obtain the model directory.
-      batch_size: When given, used for both the train and eval batch size.
-        When None the batch-size flags are left untouched.
-
-    Returns:
-      Path of the training summary file inside the model directory.
-    """
-    FLAGS.model_dir = self._get_model_dir(test_name)
-    FLAGS.train_data_path = self.train_data_path
-    FLAGS.eval_data_path = self.eval_data_path
-    FLAGS.input_meta_data_path = self.input_meta_data_path
-    FLAGS.bert_config_file = self.bert_config_file
-    if batch_size is not None:
-      FLAGS.train_batch_size = batch_size
-      FLAGS.eval_batch_size = batch_size
-    return os.path.join(FLAGS.model_dir, 'summaries/training_summary.txt')
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                training_summary_path,
-                                min_accuracy=0,
-                                max_accuracy=1,
-                                use_ds=True):
-    """Starts BERT performance benchmark test.
-
-    Args:
-      training_summary_path: Unused here; the summary comes straight from
-        `_run_bert_classifier`. Kept so existing callers keep working.
-      min_accuracy: Lower accuracy bound forwarded to `_report_benchmark`.
-      max_accuracy: Upper accuracy bound forwarded to `_report_benchmark`.
-      use_ds: Forwarded to `_run_bert_classifier`.
-    """
-    start_time_sec = time.time()
-    summary = self._run_bert_classifier(
-        callbacks=[self.timer_callback], use_ds=use_ds)
-    wall_time_sec = time.time() - start_time_sec
-
-    # Since we do not load from any pretrained checkpoints, we ignore all
-    # accuracy metrics.
-    summary.pop('eval_metrics', None)
-    summary['start_time_sec'] = start_time_sec
-
-    super(BertClassifyBenchmarkReal, self)._report_benchmark(
-        stats=summary,
-        wall_time_sec=wall_time_sec,
-        min_accuracy=min_accuracy,
-        max_accuracy=max_accuracy)
-
-  def benchmark_1_gpu_mrpc(self):
-    """Test BERT model performance with 1 GPU."""
-    self._setup()
-    self.num_gpus = 1
-    summary_path = self._configure_flags('benchmark_1_gpu_mrpc', batch_size=4)
-    self._run_and_report_benchmark(summary_path)
-
-  def benchmark_1_gpu_mrpc_xla(self):
-    """Test BERT model performance with 1 GPU and XLA enabled."""
-    self._setup()
-    self.num_gpus = 1
-    summary_path = self._configure_flags(
-        'benchmark_1_gpu_mrpc_xla', batch_size=4)
-    FLAGS.enable_xla = True
-    self._run_and_report_benchmark(summary_path)
-
-  def benchmark_1_gpu_mrpc_no_dist_strat(self):
-    """Test BERT model performance with 1 GPU, no distribution strategy."""
-    self._setup()
-    self.num_gpus = 1
-    summary_path = self._configure_flags(
-        'benchmark_1_gpu_mrpc_no_dist_strat', batch_size=4)
-    self._run_and_report_benchmark(summary_path, use_ds=False)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_8_gpu_mrpc(self):
-    """Test BERT model performance with 8 GPUs."""
-    self._setup()
-    # Stated explicitly so the test does not depend on the inherited default.
-    self.num_gpus = 8
-    # Batch sizes are intentionally left at their flag values.
-    summary_path = self._configure_flags('benchmark_8_gpu_mrpc')
-    self._run_and_report_benchmark(summary_path)
-
-  def benchmark_1_gpu_amp_mrpc_no_dist_strat(self):
-    """Performance for 1 GPU no DS with automatic mixed precision."""
-    self._setup()
-    self.num_gpus = 1
-    summary_path = self._configure_flags(
-        'benchmark_1_gpu_amp_mrpc_no_dist_strat', batch_size=4)
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    self._run_and_report_benchmark(summary_path, use_ds=False)
-
-  def benchmark_8_gpu_amp_mrpc(self):
-    """Test BERT model performance with 8 GPUs with automatic mixed precision."""
-    self._setup()
-    self.num_gpus = 8
-    summary_path = self._configure_flags(
-        'benchmark_8_gpu_amp_mrpc', batch_size=32)
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    # use_ds must stay at its default here: use_ds=False makes
-    # `_run_bert_classifier` request the 'off' strategy, which contradicts a
-    # benchmark that is meant to spread work over 8 GPUs.
-    self._run_and_report_benchmark(summary_path)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_2x2_tpu_mrpc(self):
-    """Test BERT model performance with 2x2 TPU."""
-    self._setup()
-    FLAGS.steps_per_loop = 50
-    summary_path = self._configure_flags(
-        'benchmark_2x2_tpu_mrpc', batch_size=32)
-    # use_ds is ignored by `_run_bert_classifier` whenever self.tpu is set.
-    self._run_and_report_benchmark(summary_path, use_ds=False)
-
-
-class BertClassifyAccuracy(BertClassifyBenchmarkBase):
-  """Short accuracy test for BERT model.
-
-  Tests BERT classification task model accuracy. Test names follow the
-  `benchmark_(number of gpus)_gpu_(dataset type)` format.
-  """
-
-  def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
-    # Record the fixed inputs first, then run the parent initialiser.
-    # **kwargs is accepted but not forwarded.
-    self.pretrained_checkpoint_path = PRETRAINED_CHECKPOINT_PATH
-    self.input_meta_data_path = CLASSIFIER_INPUT_META_DATA_PATH
-    self.bert_config_file = MODEL_CONFIG_FILE_PATH
-    self.eval_data_path = CLASSIFIER_EVAL_DATA_PATH
-    self.train_data_path = CLASSIFIER_TRAIN_DATA_PATH
-
-    super(BertClassifyAccuracy, self).__init__(output_dir=output_dir, tpu=tpu)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                training_summary_path,
-                                min_accuracy=0.84,
-                                max_accuracy=0.88):
-    """Runs the classifier and reports the result with accuracy bounds.
-
-    Args:
-      training_summary_path: Not read by this method.
-      min_accuracy: Lower bound forwarded to `_report_benchmark`.
-      max_accuracy: Upper bound forwarded to `_report_benchmark`.
-    """
-    began_at = time.time()
-    run_summary = self._run_bert_classifier(callbacks=[self.timer_callback])
-    elapsed_sec = time.time() - began_at
-
-    super(BertClassifyAccuracy, self)._report_benchmark(
-        stats=run_summary,
-        wall_time_sec=elapsed_sec,
-        min_accuracy=min_accuracy,
-        max_accuracy=max_accuracy)
-
-  def _setup(self):
-    """Runs the parent setup, then points FLAGS at this test's inputs."""
-    super(BertClassifyAccuracy, self)._setup()
-    FLAGS.init_checkpoint = self.pretrained_checkpoint_path
-    FLAGS.bert_config_file = self.bert_config_file
-    FLAGS.input_meta_data_path = self.input_meta_data_path
-    FLAGS.eval_data_path = self.eval_data_path
-    FLAGS.train_data_path = self.train_data_path
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_8_gpu_mrpc(self):
-    """Run BERT model accuracy test with 8 GPUs.
-
-    Due to comparatively small cardinality of MRPC dataset, training
-    accuracy metric has high variance between trainings. Therefore a wide
-    range of accuracy is allowed (84% to 88%).
-    """
-    self._setup()
-    model_dir = self._get_model_dir('benchmark_8_gpu_mrpc')
-    FLAGS.model_dir = model_dir
-
-    self._run_and_report_benchmark(
-        os.path.join(FLAGS.model_dir, 'summaries/training_summary.txt'))
-
-  def benchmark_8_gpu_mrpc_xla(self):
-    """Run BERT model accuracy test with 8 GPUs with XLA."""
-    self._setup()
-    model_dir = self._get_model_dir('benchmark_8_gpu_mrpc_xla')
-    FLAGS.model_dir = model_dir
-    FLAGS.enable_xla = True
-
-    self._run_and_report_benchmark(
-        os.path.join(FLAGS.model_dir, 'summaries/training_summary.txt'))
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_2x2_tpu_mrpc(self):
-    """Run BERT model accuracy test on 2x2 TPU."""
-    self._setup()
-    FLAGS.steps_per_loop = 50
-    model_dir = self._get_model_dir('benchmark_2x2_tpu_mrpc')
-    FLAGS.model_dir = model_dir
-
-    self._run_and_report_benchmark(
-        os.path.join(FLAGS.model_dir, 'summaries/training_summary.txt'))
-
-
-# Hand control to the TensorFlow test runner when executed as a script.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/bert_benchmark_utils.py
+++ b/official/benchmark/bert_benchmark_utils.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Utility functions or classes shared between BERT benchmarks."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-# pylint: disable=g-bad-import-order
-import numpy as np
-from absl import flags
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.utils.flags import core as flags_core
-from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
-
-# absl flag registry; `_report_benchmark` below reads batch-size and
-# steps-per-loop values from it.
-FLAGS = flags.FLAGS
-
-
-class BenchmarkTimerCallback(tf.keras.callbacks.Callback):
-  """Callback that records time it takes to run each batch."""
-
-  def __init__(self, num_batches_to_skip=10):
-    """Creates empty start/stop timestamp tables.
-
-    Args:
-      num_batches_to_skip: Accepted for backward compatibility only. The
-        value is not stored; the number of skipped batches is chosen per
-        call through `get_examples_per_sec`.
-    """
-    super(BenchmarkTimerCallback, self).__init__()
-    # Both tables map a batch index to a `time.time()` timestamp.
-    self.batch_start_times = {}
-    self.batch_stop_times = {}
-
-  def on_batch_begin(self, batch, logs=None):
-    """Records the start time of `batch`."""
-    self.batch_start_times[batch] = time.time()
-
-  def on_batch_end(self, batch, logs=None):
-    """Records the stop time for `batch` or for the latest started batch."""
-    # If there are multiple steps_per_loop, the end batch index will not be the
-    # same as the starting index. Use the last starting index instead.
-    if batch not in self.batch_start_times:
-      if not self.batch_start_times:
-        # No begin was ever recorded, so there is nothing to pair this end
-        # with; max() on the empty table would raise ValueError.
-        return
-      batch = max(self.batch_start_times)
-
-    self.batch_stop_times[batch] = time.time()
-
-  def get_examples_per_sec(self, batch_size, num_batches_to_skip=1):
-    """Returns `batch_size` divided by the mean recorded batch duration.
-
-    Args:
-      batch_size: Number of examples processed between a begin/end pair.
-      num_batches_to_skip: Batches with an index below this value are left
-        out of the average.
-
-    Returns:
-      Examples per second, or 0.0 when no batch qualifies (instead of the
-      NaN and runtime warning produced by averaging an empty list).
-    """
-    batch_durations = [
-        self.batch_stop_times[batch] - started
-        for batch, started in self.batch_start_times.items()
-        if batch in self.batch_stop_times and batch >= num_batches_to_skip
-    ]
-    if not batch_durations:
-      return 0.0
-    return batch_size / np.mean(batch_durations)
-
-  def get_startup_time(self, program_start_time):
-    """Returns the delay between `program_start_time` and batch 0 starting.
-
-    Raises:
-      KeyError: If batch 0 was never started.
-    """
-    return self.batch_start_times[0] - program_start_time
-
-
-class BertBenchmarkBase(PerfZeroBenchmark):
-  """Base class holding helpers shared by the BERT benchmark classes."""
-  local_flags = None
-
-  def __init__(self, output_dir=None, tpu=None, **kwargs):
-    super(BertBenchmarkBase, self).__init__(
-        output_dir=output_dir, tpu=tpu, **kwargs)
-    # Created in `_setup`; stays None until then.
-    self.timer_callback = None
-    self.num_gpus = 8
-
-  def _setup(self):
-    """Runs the parent setup and installs a fresh timer callback."""
-    super(BertBenchmarkBase, self)._setup()
-    self.timer_callback = BenchmarkTimerCallback()
-
-  def _report_benchmark(self, stats, wall_time_sec, min_accuracy, max_accuracy):
-    """Builds the metric list from `stats` and reports it.
-
-    Args:
-      stats: dict returned from BERT models. 'train_loss' and
-        'total_training_steps' are required; 'start_time_sec' and
-        'eval_metrics' are used when present.
-      wall_time_sec: the duration of the benchmark execution in seconds.
-      min_accuracy: Minimum classification accuracy constraint to verify
-        correctness of the model.
-      max_accuracy: Maximum classification accuracy constraint to verify
-        correctness of the model.
-    """
-    timer = self.timer_callback
-
-    metrics = [{
-        'name': 'training_loss',
-        'value': stats['train_loss'],
-    }]
-
-    # Throughput is reported as 0.0 when no timer callback is installed.
-    metrics.append({
-        'name': 'exp_per_second',
-        'value': (timer.get_examples_per_sec(
-            FLAGS.train_batch_size * FLAGS.steps_per_loop) if timer else 0.0),
-    })
-
-    if timer and 'start_time_sec' in stats:
-      startup_time = timer.get_startup_time(stats['start_time_sec'])
-      metrics.append({'name': 'startup_time', 'value': startup_time})
-
-    if 'eval_metrics' in stats:
-      metrics.append({
-          'name': 'eval_accuracy',
-          'value': stats['eval_metrics'],
-          'min_value': min_accuracy,
-          'max_value': max_accuracy,
-      })
-
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(
-        iters=stats['total_training_steps'],
-        wall_time=wall_time_sec,
-        metrics=metrics,
-        extras={'flags': flags_str})
--- a/official/benchmark/bert_pretrain_benchmark.py
+++ b/official/benchmark/bert_pretrain_benchmark.py
-# Lint as: python3
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes benchmark testing for bert pretraining."""
-# pylint: disable=line-too-long
-from __future__ import print_function
-
-import json
-import os
-import time
-from typing import Optional
-
-from absl import flags
-from absl import logging
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.benchmark import benchmark_wrappers
-from official.benchmark import bert_benchmark_utils
-from official.benchmark import owner_utils
-from official.nlp.bert import run_pretraining
-from official.utils.flags import core as flags_core
-from official.utils.misc import distribution_utils
-
-# Pretrain masked language modeling accuracy range:
-MIN_MLM_ACCURACY = 0.635
-MAX_MLM_ACCURACY = 0.645
-
-# Pretrain next sentence prediction accuracy range:
-MIN_NSP_ACCURACY = 0.94
-MAX_NSP_ACCURACY = 0.96
-
-# Comma-separated file patterns assigned to FLAGS.input_files.
-BERT_PRETRAIN_FILES_SEQ128 = 'gs://mlcompass-data/bert/pretraining_data/seq_128/wikipedia.tfrecord*,gs://mlcompass-data/bert/pretraining_data/seq_128/books.tfrecord*'
-# Model configuration assigned to FLAGS.bert_config_file.
-BERT_BASE_CONFIG_FILE = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12/bert_config.json'
-
-# absl flag registry that the benchmarks mutate before each run.
-FLAGS = flags.FLAGS
-
-
-class BertPretrainAccuracyBenchmark(bert_benchmark_utils.BertBenchmarkBase):
-  """Benchmark accuracy tests for BERT Pretraining."""
-
-  def __init__(self,
-               output_dir: Optional[str] = None,
-               tpu: Optional[str] = None,
-               **kwargs):
-    """Inits BertPretrainAccuracyBenchmark class.
-
-    Args:
-      output_dir: Directory where to output e.g. log files
-      tpu: TPU name to use in a TPU benchmark.
-      **kwargs: Additional keyword arguments.
-    """
-    super(BertPretrainAccuracyBenchmark, self).__init__(
-        output_dir=output_dir, tpu=tpu, **kwargs)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self, summary_path: str, report_accuracy: bool):
-    """Runs and reports the benchmark given the provided configuration.
-
-    Args:
-      summary_path: File the training summary is read from after the run.
-      report_accuracy: Whether the accuracy metrics are added to the report.
-    """
-    distribution = distribution_utils.get_distribution_strategy(
-        distribution_strategy='tpu', tpu_address=self.tpu)
-    logging.info('Flags: %s', flags_core.get_nondefault_flags_as_str())
-    start_time_sec = time.time()
-    # Passed as a list, matching how the classifier and SQuAD benchmarks hand
-    # over their timer callback.
-    run_pretraining.run_bert_pretrain(
-        strategy=distribution, custom_callbacks=[self.timer_callback])
-    wall_time_sec = time.time() - start_time_sec
-
-    with tf.io.gfile.GFile(summary_path, 'rb') as reader:
-      summary = json.loads(reader.read().decode('utf-8'))
-    self._report_benchmark(summary, start_time_sec, wall_time_sec,
-                           report_accuracy)
-
-  def _report_benchmark(self, summary, start_time_sec, wall_time_sec,
-                        report_accuracy):
-    """Reports loss, throughput, startup time and optionally accuracy.
-
-    NOTE: this override takes different parameters than the parent class's
-    `_report_benchmark`; it is only called from `_run_and_report_benchmark`.
-
-    Args:
-      summary: dict loaded from the training summary file.
-      start_time_sec: Timestamp taken right before training started.
-      wall_time_sec: Duration of the training call in seconds.
-      report_accuracy: When true, the masked-LM and next-sentence accuracies
-        are reported together with their allowed ranges.
-    """
-    metrics = [{
-        'name': 'train_loss',
-        'value': summary['train_loss'],
-    }, {
-        'name':
-            'exp_per_second',
-        'value':
-            self.timer_callback.get_examples_per_sec(FLAGS.train_batch_size *
-                                                     FLAGS.steps_per_loop)
-    }, {
-        'name': 'startup_time',
-        'value': self.timer_callback.get_startup_time(start_time_sec)
-    }]
-    if report_accuracy:
-      metrics.extend([{
-          'name': 'masked_lm_accuracy',
-          'value': summary['masked_lm_accuracy'],
-          'min_value': MIN_MLM_ACCURACY,
-          'max_value': MAX_MLM_ACCURACY,
-      }, {
-          'name': 'next_sentence_accuracy',
-          'value': summary['next_sentence_accuracy'],
-          'min_value': MIN_NSP_ACCURACY,
-          'max_value': MAX_NSP_ACCURACY,
-      }])
-    self.report_benchmark(
-        iters=summary['total_training_steps'],
-        wall_time=wall_time_sec,
-        metrics=metrics,
-        extras={'flags': flags_core.get_nondefault_flags_as_str()})
-
-  def _specify_common_flags(self):
-    """Sets the flag values shared by every benchmark in this class."""
-    FLAGS.bert_config_file = BERT_BASE_CONFIG_FILE
-    FLAGS.train_batch_size = 512
-    FLAGS.learning_rate = 1e-4
-    FLAGS.warmup_steps = 10000
-    FLAGS.steps_per_loop = 10000
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.input_files = BERT_PRETRAIN_FILES_SEQ128
-    FLAGS.max_seq_length = 128
-    FLAGS.max_predictions_per_seq = 20
-    FLAGS.dtype = 'bf16'
-
-  def _configure_run(self, test_name: str, num_steps_per_epoch: int,
-                     num_train_epochs: int) -> str:
-    """Resets flags, applies the shared config and sizes one run.
-
-    Args:
-      test_name: Name handed to `_get_model_dir` to obtain the model directory.
-      num_steps_per_epoch: Value for FLAGS.num_steps_per_epoch.
-      num_train_epochs: Value for FLAGS.num_train_epochs.
-
-    Returns:
-      Path of the training summary file inside the model directory.
-    """
-    self._setup()
-    self._specify_common_flags()
-    FLAGS.num_steps_per_epoch = num_steps_per_epoch
-    FLAGS.num_train_epochs = num_train_epochs
-    FLAGS.model_dir = self._get_model_dir(test_name)
-    return os.path.join(FLAGS.model_dir, 'summaries/training_summary.txt')
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps(self):
-    """Test bert pretraining with 8x8 TPU for 500k steps."""
-    # This is used for accuracy test.
-    summary_path = self._configure_run(
-        'benchmark_accuracy_8x8_tpu_bf16_seq128_500k_steps',
-        num_steps_per_epoch=500000,
-        num_train_epochs=1)
-    # Set train_summary_interval to -1 to disable training summary, because
-    # writing summary to gcs may fail and summaries are not needed for this
-    # accuracy benchmark test.
-    FLAGS.train_summary_interval = -1
-    self._run_and_report_benchmark(summary_path=summary_path,
-                                   report_accuracy=True)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_perf_4x4_tpu_bf16_seq128_10k_steps(self):
-    """Test bert pretraining with 4x4 TPU for 10000 steps."""
-    summary_path = self._configure_run(
-        'benchmark_perf_4x4_tpu_bf16_seq128_10k_steps',
-        num_steps_per_epoch=5000,
-        num_train_epochs=2)
-    # Disable accuracy check.
-    self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=False)
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_perf_8x8_tpu_bf16_seq128_10k_steps(self):
-    """Test bert pretraining with 8x8 TPU for 10000 steps."""
-    summary_path = self._configure_run(
-        'benchmark_perf_8x8_tpu_bf16_seq128_10k_steps',
-        num_steps_per_epoch=5000,
-        num_train_epochs=2)
-    # Disable accuracy check.
-    self._run_and_report_benchmark(
-        summary_path=summary_path, report_accuracy=False)
-
-
-# Hand control to the TensorFlow test runner when executed as a script.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/bert_squad_benchmark.py
+++ b/official/benchmark/bert_squad_benchmark.py
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes BERT SQuAD benchmarks and accuracy tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import json
-import os
-import time
-
-# pylint: disable=g-bad-import-order
-from absl import flags
-from absl import logging
-from absl.testing import flagsaver
-import tensorflow as tf
-# pylint: enable=g-bad-import-order
-
-from official.benchmark import bert_benchmark_utils as benchmark_utils
-from official.benchmark import owner_utils
-from official.nlp.bert import run_squad
-from official.utils.misc import distribution_utils
-from official.utils.misc import keras_utils
-from official.benchmark import benchmark_wrappers
-
-
-# Remote (gs://) inputs used by the SQuAD benchmarks in this module.
-# pylint: disable=line-too-long
-PRETRAINED_CHECKPOINT_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_model.ckpt'
-# Assigned to FLAGS.train_data_path, FLAGS.predict_file and FLAGS.vocab_file.
-SQUAD_TRAIN_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_train.tf_record'
-SQUAD_PREDICT_FILE = 'gs://tf-perfzero-data/bert/squad/dev-v1.1.json'
-SQUAD_VOCAB_FILE = 'gs://tf-perfzero-data/bert/squad/vocab.txt'
-# Alternative input metadata files; the performance benchmarks choose between
-# the medium and long variants based on batch size and eager mode.
-SQUAD_MEDIUM_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_medium_meta_data'
-SQUAD_LONG_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_long_meta_data'
-SQUAD_FULL_INPUT_META_DATA_PATH = 'gs://tf-perfzero-data/bert/squad/squad_full_meta_data'
-# Model configuration assigned to FLAGS.bert_config_file.
-MODEL_CONFIG_FILE_PATH = 'gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16/bert_config.json'
-# pylint: enable=line-too-long
-
-# Default output directory for the benchmark classes; None when the TMPDIR
-# environment variable is not set.
-TMP_DIR = os.getenv('TMPDIR')
-# absl flag registry that the benchmarks mutate before each run.
-FLAGS = flags.FLAGS
-
-
-class BertSquadBenchmarkBase(benchmark_utils.BertBenchmarkBase):
-  """Common plumbing shared by the SQuAD benchmark classes in this module."""
-
-  def __init__(self, output_dir=None, tpu=None):
-    super(BertSquadBenchmarkBase, self).__init__(output_dir=output_dir, tpu=tpu)
-
-  @staticmethod
-  def _load_json(path):
-    """Reads `path` through tf.io.gfile and parses it as UTF-8 JSON."""
-    with tf.io.gfile.GFile(path, 'rb') as json_file:
-      return json.loads(json_file.read().decode('utf-8'))
-
-  def _read_training_summary_from_file(self):
-    """Loads the training summary stored under FLAGS.model_dir."""
-    return self._load_json(
-        os.path.join(FLAGS.model_dir, 'summaries/training_summary.txt'))
-
-  def _read_input_meta_data_from_file(self):
-    """Loads the input metadata named by FLAGS.input_meta_data_path."""
-    return self._load_json(FLAGS.input_meta_data_path)
-
-  def _get_distribution_strategy(self, ds_type='mirrored'):
-    """Builds the distribution strategy for a run.
-
-    Args:
-      ds_type: String, the distribution strategy type to be used. Can be
-        'mirrored', 'multi_worker_mirrored', 'tpu' and 'off'. Ignored in
-        favour of 'tpu' whenever a TPU address is configured.
-
-    Returns:
-      Whatever `distribution_utils.get_distribution_strategy` returns for the
-      chosen type (presumably a `tf.distribute.Strategy` - TODO confirm).
-    """
-    wants_tpu = self.tpu or ds_type == 'tpu'
-    if wants_tpu:
-      return distribution_utils.get_distribution_strategy(
-          distribution_strategy='tpu', tpu_address=self.tpu)
-
-    if ds_type == 'multi_worker_mirrored':
-      # Configures cluster spec for multi-worker distribution strategy; the
-      # return value is not needed.
-      distribution_utils.configure_cluster(FLAGS.worker_hosts,
-                                           FLAGS.task_index)
-
-    return distribution_utils.get_distribution_strategy(
-        distribution_strategy=ds_type,
-        num_gpus=self.num_gpus,
-        all_reduce_alg=FLAGS.all_reduce_alg)
-
-  def _init_gpu_and_data_threads(self):
-    """Applies the GPU thread settings, but only when a thread mode is set."""
-    if not FLAGS.tf_gpu_thread_mode:
-      return
-    keras_utils.set_gpu_thread_mode_and_count(
-        per_gpu_thread_count=FLAGS.per_gpu_thread_count,
-        gpu_thread_mode=FLAGS.tf_gpu_thread_mode,
-        num_gpus=self.num_gpus,
-        datasets_num_private_threads=FLAGS.datasets_num_private_threads)
-
-  @flagsaver.flagsaver
-  def _train_squad(self, run_eagerly=False, ds_type='mirrored'):
-    """Runs BERT SQuAD training. Uses mirrored strategy by default."""
-    self._init_gpu_and_data_threads()
-    meta_data = self._read_input_meta_data_from_file()
-    run_squad.train_squad(
-        strategy=self._get_distribution_strategy(ds_type),
-        input_meta_data=meta_data,
-        run_eagerly=run_eagerly,
-        custom_callbacks=[self.timer_callback])
-
-  @flagsaver.flagsaver
-  def _evaluate_squad(self, ds_type='mirrored'):
-    """Runs BERT SQuAD evaluation. Uses mirrored strategy by default."""
-    self._init_gpu_and_data_threads()
-    meta_data = self._read_input_meta_data_from_file()
-    strategy = self._get_distribution_strategy(ds_type)
-
-    if meta_data.get('version_2_with_negative', False):
-      logging.error('In memory evaluation result for SQuAD v2 is not accurate')
-    squad_metrics = run_squad.eval_squad(strategy=strategy,
-                                         input_meta_data=meta_data)
-    # The F1 score is the evaluation metric that gets reported.
-    self.eval_metrics = squad_metrics['final_f1']
-
-
-class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
-  """Short benchmark performance tests for BERT SQuAD model.
-
-  Tests BERT SQuAD performance in different GPU configurations.
-  The naming convention of below test cases follow
-  `benchmark_(number of gpus)_gpu` format for GPUs and
-  `benchmark_(topology)_tpu` format for TPUs.
-  """
-
-  def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
-    # Extra keyword arguments are accepted but not forwarded to the parent.
-    super(BertSquadBenchmarkReal, self).__init__(
-        output_dir=output_dir,
-        tpu=tpu)
-
-  def _setup(self):
-    """Runs the parent setup, then applies the SQuAD-specific flag values."""
-    super(BertSquadBenchmarkReal, self)._setup()
-    # Fixed inputs for every performance run.
-    FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
-    FLAGS.vocab_file = SQUAD_VOCAB_FILE
-    FLAGS.predict_file = SQUAD_PREDICT_FILE
-    FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
-    # Training length and loop granularity.
-    FLAGS.steps_per_loop = 100
-    FLAGS.num_train_epochs = 1
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                run_eagerly=False,
-                                ds_type='mirrored'):
-    """Trains once, then reports the metrics from the training summary."""
-    # Eager runs and batch sizes of at most 4 use the medium metadata file;
-    # everything else uses the long one.
-    use_medium = run_eagerly or FLAGS.train_batch_size <= 4
-    FLAGS.input_meta_data_path = (
-        SQUAD_MEDIUM_INPUT_META_DATA_PATH
-        if use_medium else SQUAD_LONG_INPUT_META_DATA_PATH)
-
-    began_at = time.time()
-    self._train_squad(run_eagerly=run_eagerly, ds_type=ds_type)
-    elapsed_sec = time.time() - began_at
-
-    training_summary = self._read_training_summary_from_file()
-    training_summary['start_time_sec'] = began_at
-
-    super(BertSquadBenchmarkReal, self)._report_benchmark(
-        stats=training_summary,
-        wall_time_sec=elapsed_sec,
-        min_accuracy=0,
-        max_accuracy=1)
-
-  def benchmark_1_gpu(self):
-    """Tests BERT SQuAD model performance with 1 GPU, batch size 4."""
-    self._setup()
-    self.num_gpus = 1
-    FLAGS.train_batch_size = 4
-    model_dir = self._get_model_dir('benchmark_1_gpu_squad')
-    FLAGS.model_dir = model_dir
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_eager(self):
-    """Tests BERT SQuAD model performance with 1 GPU in eager mode."""
-    self._setup()
-    self.num_gpus = 1
-    FLAGS.train_batch_size = 2
-    model_dir = self._get_model_dir('benchmark_1_gpu_squad_eager')
-    FLAGS.model_dir = model_dir
-    self._run_and_report_benchmark(run_eagerly=True)
-
-  def benchmark_1_gpu_xla(self):
-    """Benchmarks BERT SQuAD training on 1 GPU with XLA enabled."""
-    self._setup()
-
-    FLAGS.enable_xla = True
-    # Batch size 3 is used because XLA runs out of memory at batch size 4.
-    FLAGS.train_batch_size = 3
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla_squad')
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Benchmarks BERT SQuAD training on 1 GPU with ds_type='off'."""
-    self._setup()
-
-    FLAGS.train_batch_size = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat_squad')
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark(ds_type='off')
-
-  def benchmark_1_gpu_eager_no_dist_strat(self):
-    """Benchmarks BERT SQuAD on 1 GPU, run eagerly, with ds_type='off'."""
-    self._setup()
-
-    FLAGS.train_batch_size = 4
-    model_dir = self._get_model_dir(
-        'benchmark_1_gpu_eager_no_dist_strat_squad')
-    FLAGS.model_dir = model_dir
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark(run_eagerly=True, ds_type='off')
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_8_gpu(self):
-    """Benchmarks BERT SQuAD training on 8 GPUs with batch size 24."""
-    self._setup()
-
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.train_batch_size = 24
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
-    self.num_gpus = 8
-
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_fp16_eager(self):
-    """Benchmarks BERT SQuAD on 1 GPU with FP16, run eagerly."""
-    self._setup()
-
-    # Mixed precision settings.
-    FLAGS.loss_scale = 'dynamic'
-    FLAGS.dtype = 'fp16'
-    FLAGS.train_batch_size = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_fp16_eager')
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark(run_eagerly=True)
-
-  def benchmark_1_gpu_fp16(self):
-    """Benchmarks BERT SQuAD on 1 GPU with FP16 and dynamic loss scale."""
-    self._setup()
-
-    # Mixed precision settings.
-    FLAGS.loss_scale = 'dynamic'
-    FLAGS.dtype = 'fp16'
-    FLAGS.train_batch_size = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_fp16')
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_xla_fp16(self):
-    """Benchmarks BERT SQuAD on 1 GPU with XLA enabled and FP16."""
-    self._setup()
-
-    # Mixed precision settings.
-    FLAGS.loss_scale = 'dynamic'
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_xla = True
-    FLAGS.train_batch_size = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla_squad_fp16')
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_fp16(self):
-    """Benchmarks BERT SQuAD on 8 GPUs with FP16 and batch size 32."""
-    self._setup()
-
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    # Mixed precision settings.
-    FLAGS.loss_scale = 'dynamic'
-    FLAGS.dtype = 'fp16'
-    FLAGS.train_batch_size = 32
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_fp16')
-    self.num_gpus = 8
-
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_xla_fp16(self):
-    """Tests BERT SQuAD model performance with 8 GPUs with XLA and FP16."""
-
-    self._setup()
-    self.num_gpus = 8
-    # Use a directory of its own: the previous name was identical to the one
-    # used by benchmark_8_gpu_fp16, so both benchmarks shared a model dir.
-    # The name follows the 1-GPU variant ('benchmark_1_gpu_xla_squad_fp16').
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_xla_squad_fp16')
-    FLAGS.train_batch_size = 32
-    FLAGS.enable_xla = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.loss_scale = 'dynamic'
-
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_amp(self):
-    """Benchmarks BERT SQuAD on 1 GPU with automatic mixed precision."""
-    self._setup()
-
-    # FP16 through the 'graph_rewrite' implementation.
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.dtype = 'fp16'
-    FLAGS.train_batch_size = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp_squad')
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_amp(self):
-    """Benchmarks BERT SQuAD on 8 GPUs with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    # FP16 through the 'graph_rewrite' implementation.
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.dtype = 'fp16'
-    FLAGS.train_batch_size = 32
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp_squad')
-    self.num_gpus = 8
-
-    self._run_and_report_benchmark()
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_2x2_tpu(self):
-    """Benchmarks BERT SQuAD training on a 2x2 TPU with batch size 48."""
-    self._setup()
-
-    # Checkpoint and tokenization settings.
-    FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
-    FLAGS.do_lower_case = True
-    # Training schedule.
-    FLAGS.steps_per_loop = 100
-    FLAGS.num_train_epochs = 1
-    FLAGS.learning_rate = 8e-5
-    FLAGS.mode = 'train'
-    # Batch sizes.
-    FLAGS.predict_batch_size = 48
-    FLAGS.train_batch_size = 48
-    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu')
-
-    self._run_and_report_benchmark()
-
-
-class BertSquadAccuracy(BertSquadBenchmarkBase):
-  """Short accuracy test for BERT SQuAD model.
-
-  Every test trains and then evaluates the model. Test names follow the
-  `benchmark_(number of gpus)_gpu` format for GPUs and the
-  `benchmark_(topology)_tpu` format for TPUs.
-  """
-
-  def __init__(self, output_dir=None, tpu=None, **kwargs):
-    """Initializes the benchmark.
-
-    Args:
-      output_dir: directory forwarded to the parent constructor.
-      tpu: TPU argument forwarded to the parent constructor.
-      **kwargs: accepted for call compatibility; not forwarded.
-    """
-    super(BertSquadAccuracy, self).__init__(
-        output_dir=output_dir,
-        tpu=tpu)
-
-  def _setup(self):
-    """Runs the parent setup, then sets the SQuAD data and training flags."""
-    super(BertSquadAccuracy, self)._setup()
-    # Training schedule.
-    FLAGS.steps_per_loop = 100
-    FLAGS.num_train_epochs = 2
-    # Model configuration and starting checkpoint.
-    FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
-    FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
-    # Input files.
-    FLAGS.input_meta_data_path = SQUAD_FULL_INPUT_META_DATA_PATH
-    FLAGS.vocab_file = SQUAD_VOCAB_FILE
-    FLAGS.predict_file = SQUAD_PREDICT_FILE
-    FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                run_eagerly=False,
-                                ds_type='mirrored'):
-    """Trains and evaluates the SQuAD model, then reports the results.
-
-    Args:
-      run_eagerly: forwarded to `_train_squad`.
-      ds_type: distribution strategy type forwarded to `_train_squad` and
-        `_evaluate_squad`.
-    """
-    start_time_sec = time.time()
-    self._train_squad(run_eagerly=run_eagerly, ds_type=ds_type)
-    self._evaluate_squad(ds_type=ds_type)
-    # Wall time covers both training and evaluation.
-    wall_time_sec = time.time() - start_time_sec
-
-    stats = self._read_training_summary_from_file()
-    stats['eval_metrics'] = self.eval_metrics
-    stats['start_time_sec'] = start_time_sec
-
-    super(BertSquadAccuracy, self)._report_benchmark(
-        stats=stats,
-        wall_time_sec=wall_time_sec,
-        min_accuracy=0.900,
-        max_accuracy=0.920)
-
-  def benchmark_1_gpu_eager(self):
-    """Tests BERT SQuAD accuracy on 1 GPU, run eagerly, with ds_type='off'."""
-    self._setup()
-
-    FLAGS.train_batch_size = 4
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_squad_eager')
-    self.num_gpus = 1
-
-    self._run_and_report_benchmark(run_eagerly=True, ds_type='off')
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_8_gpu(self):
-    """Tests BERT SQuAD accuracy on 8 GPUs with batch size 24."""
-    self._setup()
-
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.train_batch_size = 24
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad')
-    self.num_gpus = 8
-
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_fp16(self):
-    """Tests BERT SQuAD accuracy on 8 GPUs with FP16."""
-    self._setup()
-
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    # Mixed precision settings.
-    FLAGS.loss_scale = 'dynamic'
-    FLAGS.dtype = 'fp16'
-    FLAGS.train_batch_size = 32
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_fp16')
-    self.num_gpus = 8
-
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_xla(self):
-    """Tests BERT SQuAD accuracy on 8 GPUs with XLA enabled."""
-    self._setup()
-
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.enable_xla = True
-    FLAGS.train_batch_size = 32
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_squad_xla')
-    self.num_gpus = 8
-
-    self._run_and_report_benchmark()
-
-  @owner_utils.Owner('tf-model-garden')
-  def benchmark_2x2_tpu(self):
-    """Tests BERT SQuAD accuracy on a 2x2 TPU with batch size 48."""
-    self._setup()
-
-    FLAGS.train_batch_size = 48
-    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu')
-
-    self._run_and_report_benchmark()
-
-
-class BertSquadMultiWorkerAccuracy(BertSquadBenchmarkBase):
-  """BERT SQuAD distributed accuracy tests with multiple workers."""
-
-  def __init__(self, output_dir=None, tpu=None, **kwargs):
-    """Initializes the benchmark.
-
-    Args:
-      output_dir: directory forwarded to the parent constructor.
-      tpu: TPU argument forwarded to the parent constructor.
-      **kwargs: accepted for call compatibility; not forwarded.
-    """
-    super(BertSquadMultiWorkerAccuracy, self).__init__(
-        output_dir=output_dir, tpu=tpu)
-
-  def _setup(self):
-    """Sets up the benchmark and SQuAD flags."""
-    super(BertSquadMultiWorkerAccuracy, self)._setup()
-    FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
-    FLAGS.predict_file = SQUAD_PREDICT_FILE
-    FLAGS.vocab_file = SQUAD_VOCAB_FILE
-    FLAGS.input_meta_data_path = SQUAD_FULL_INPUT_META_DATA_PATH
-    FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
-    FLAGS.init_checkpoint = PRETRAINED_CHECKPOINT_PATH
-    FLAGS.num_train_epochs = 2
-    FLAGS.steps_per_loop = 100
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                use_ds=True,
-                                run_eagerly=False):
-    """Runs the benchmark and reports various metrics.
-
-    Args:
-      use_ds: unused in this method; kept so existing callers keep working.
-      run_eagerly: forwarded to `_train_squad`.
-    """
-    start_time_sec = time.time()
-    self._train_squad(run_eagerly=run_eagerly,
-                      ds_type='multi_worker_mirrored')
-    self._evaluate_squad(ds_type='multi_worker_mirrored')
-    # Wall time covers both training and evaluation.
-    wall_time_sec = time.time() - start_time_sec
-
-    summary = self._read_training_summary_from_file()
-    summary['eval_metrics'] = self.eval_metrics
-    # Record the start time in the summary, as the other SQuAD benchmark
-    # classes in this module do before reporting.
-    summary['start_time_sec'] = start_time_sec
-
-    super(BertSquadMultiWorkerAccuracy, self)._report_benchmark(
-        stats=summary,
-        wall_time_sec=wall_time_sec,
-        min_accuracy=0.900,
-        max_accuracy=0.920)
-
-  def _benchmark_common(self, num_workers, all_reduce_alg):
-    """Common to all benchmarks in this class.
-
-    Args:
-      num_workers: number of workers; scales the train batch size and is part
-        of the model directory name.
-      all_reduce_alg: value assigned to FLAGS.all_reduce_alg; also part of
-        the model directory name.
-    """
-    self._setup()
-
-    num_gpus = 8
-    FLAGS.num_gpus = num_gpus
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_xla = False
-    FLAGS.distribution_strategy = 'multi_worker_mirrored'
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 32
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_8_gpu_{}_worker_fp16_{}_tweaked'.format(
-            num_workers, all_reduce_alg))
-    # Global batch size: 4 examples per GPU across all workers.
-    FLAGS.train_batch_size = 4 * num_gpus * num_workers
-    FLAGS.all_reduce_alg = all_reduce_alg
-
-    self._run_and_report_benchmark()
-
-  def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
-    """8 GPUs per worker, 2 workers, fp16, ring all-reduce."""
-    self._benchmark_common(num_workers=2, all_reduce_alg='ring')
-
-  def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
-    """8 GPUs per worker, 2 workers, fp16, nccl all-reduce."""
-    self._benchmark_common(num_workers=2, all_reduce_alg='nccl')
-
-  def benchmark_8_gpu_8_workers_fp16_ring_tweaked(self):
-    """8 GPUs per worker, 8 workers, fp16, ring all-reduce."""
-    self._benchmark_common(num_workers=8, all_reduce_alg='ring')
-
-  def benchmark_8_gpu_8_workers_fp16_nccl_tweaked(self):
-    """8 GPUs per worker, 8 workers, fp16, nccl all-reduce."""
-    self._benchmark_common(num_workers=8, all_reduce_alg='nccl')
-
-
-class BertSquadMultiWorkerBenchmark(BertSquadBenchmarkBase):
-  """BERT SQuAD distributed benchmark tests with multiple workers."""
-
-  def __init__(self, output_dir=TMP_DIR, tpu=None, **kwargs):
-    """Initializes the benchmark.
-
-    Args:
-      output_dir: directory forwarded to the parent constructor.
-      tpu: TPU argument forwarded to the parent constructor.
-      **kwargs: accepted for call compatibility; not forwarded.
-    """
-    super(BertSquadMultiWorkerBenchmark, self).__init__(
-        output_dir=output_dir,
-        tpu=tpu)
-
-  def _setup(self):
-    """Runs the parent setup, then sets the SQuAD data and training flags."""
-    super(BertSquadMultiWorkerBenchmark, self)._setup()
-    # Training schedule.
-    FLAGS.steps_per_loop = 100
-    FLAGS.num_train_epochs = 1
-    # Model configuration and input files.
-    FLAGS.bert_config_file = MODEL_CONFIG_FILE_PATH
-    FLAGS.input_meta_data_path = SQUAD_FULL_INPUT_META_DATA_PATH
-    FLAGS.vocab_file = SQUAD_VOCAB_FILE
-    FLAGS.predict_file = SQUAD_PREDICT_FILE
-    FLAGS.train_data_path = SQUAD_TRAIN_DATA_PATH
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                use_ds=True,
-                                run_eagerly=False):
-    """Trains the SQuAD model once and reports the training summary.
-
-    Args:
-      use_ds: not read by this method.
-      run_eagerly: forwarded to `_train_squad`.
-    """
-    # Batch sizes of at most 4 * 8 use the long meta data file, larger ones
-    # use the full one.
-    FLAGS.input_meta_data_path = (
-        SQUAD_LONG_INPUT_META_DATA_PATH
-        if FLAGS.train_batch_size <= 4 * 8
-        else SQUAD_FULL_INPUT_META_DATA_PATH)
-
-    start_time_sec = time.time()
-    self._train_squad(
-        run_eagerly=run_eagerly, ds_type='multi_worker_mirrored')
-    wall_time_sec = time.time() - start_time_sec
-
-    stats = self._read_training_summary_from_file()
-    stats['start_time_sec'] = start_time_sec
-
-    # The accuracy bounds are passed as 0 and 1 for this performance run.
-    super(BertSquadMultiWorkerBenchmark, self)._report_benchmark(
-        stats=stats,
-        wall_time_sec=wall_time_sec,
-        min_accuracy=0,
-        max_accuracy=1)
-
-  def _benchmark_common(self, num_workers, all_reduce_alg):
-    """Sets the flags shared by every benchmark in this class and runs it.
-
-    Args:
-      num_workers: number of workers; scales the train batch size and is part
-        of the model directory name.
-      all_reduce_alg: value assigned to FLAGS.all_reduce_alg; also part of
-        the model directory name.
-    """
-    self._setup()
-
-    gpus_per_worker = 8
-    model_dir_name = 'benchmark_8_gpu_{}_worker_fp16_{}_tweaked'.format(
-        num_workers, all_reduce_alg)
-
-    FLAGS.all_reduce_alg = all_reduce_alg
-    FLAGS.train_batch_size = 4 * gpus_per_worker * num_workers
-    FLAGS.model_dir = self._get_model_dir(model_dir_name)
-    FLAGS.datasets_num_private_threads = 32
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.distribution_strategy = 'multi_worker_mirrored'
-    FLAGS.enable_xla = False
-    FLAGS.dtype = 'fp16'
-    FLAGS.num_gpus = gpus_per_worker
-
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_1_worker_fp16_ring_tweaked(self):
-    """1 worker with 8 GPUs, fp16, ring all-reduce."""
-    self._benchmark_common(all_reduce_alg='ring', num_workers=1)
-
-  def benchmark_8_gpu_1_worker_fp16_nccl_tweaked(self):
-    """1 worker with 8 GPUs, fp16, nccl all-reduce."""
-    self._benchmark_common(all_reduce_alg='nccl', num_workers=1)
-
-  def benchmark_8_gpu_2_workers_fp16_ring_tweaked(self):
-    """2 workers with 8 GPUs each, fp16, ring all-reduce."""
-    self._benchmark_common(all_reduce_alg='ring', num_workers=2)
-
-  def benchmark_8_gpu_2_workers_fp16_nccl_tweaked(self):
-    """2 workers with 8 GPUs each, fp16, nccl all-reduce."""
-    self._benchmark_common(all_reduce_alg='nccl', num_workers=2)
-
-  def benchmark_8_gpu_8_workers_fp16_ring_tweaked(self):
-    """8 workers with 8 GPUs each, fp16, ring all-reduce."""
-    self._benchmark_common(all_reduce_alg='ring', num_workers=8)
-
-  def benchmark_8_gpu_8_workers_fp16_nccl_tweaked(self):
-    """8 workers with 8 GPUs each, fp16, nccl all-reduce."""
-    self._benchmark_common(all_reduce_alg='nccl', num_workers=8)
-
-
-# Hand control to tf.test.main() when this module is executed as a script.
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/datastore/schema/benchmark_metric.json
+++ b/official/benchmark/datastore/schema/benchmark_metric.json
-[
-  {
-    "description": "The ID of the benchmark run, where this metric should tie to.",
-    "mode": "REQUIRED",
-    "name": "run_id",
-    "type": "STRING"
-  },
-  {
-    "description": "The name of the metric, which should be descriptive. E.g. training_loss, accuracy.",
-    "mode": "REQUIRED",
-    "name": "name",
-    "type": "STRING"
-  },
-  {
-    "description": "The unit of the metric. E.g. MB per sec.",
-    "mode": "NULLABLE",
-    "name": "unit",
-    "type": "STRING"
-  },
-  {
-    "description": "The value of the metric.",
-    "mode": "NULLABLE",
-    "name": "value",
-    "type": "FLOAT"
-  },
-  {
-    "description": "The timestamp when the metric is recorded.",
-    "mode": "REQUIRED",
-    "name": "timestamp",
-    "type": "TIMESTAMP"
-  },
-  {
-    "description": "The global step when this metric is recorded.",
-    "mode": "NULLABLE",
-    "name": "global_step",
-    "type": "INTEGER"
-  },
-  {
-    "description": "Free format metadata for the extra information about the metric.",
-    "mode": "REPEATED",
-    "name": "extras",
-    "type": "RECORD",
-    "fields": [
-      {
-        "mode": "NULLABLE",
-        "name": "name",
-        "type": "STRING"
-      },
-      {
-        "mode": "NULLABLE",
-        "name": "value",
-        "type": "STRING"
-      }
-    ]
-  }
-]
--- a/official/benchmark/datastore/schema/benchmark_run.json
+++ b/official/benchmark/datastore/schema/benchmark_run.json
-[
-  {
-    "description": "The UUID of the run for the benchmark.",
-    "mode": "REQUIRED",
-    "name": "model_id",
-    "type": "STRING"
-  },
-  {
-    "description": "The name of the model, E.g. ResNet50, LeNet-5 etc.",
-    "mode": "REQUIRED",
-    "name": "model_name",
-    "type": "STRING"
-  },
-  {
-    "description": "The date when the test of the model is started",
-    "mode": "REQUIRED",
-    "name": "run_date",
-    "type": "TIMESTAMP"
-  },
-  {
-    "description": "The unique name for a test by the combination of key parameters, eg batch size, num of GPU, etc. It is hardware independent.",
-    "mode": "NULLABLE",
-    "name": "test_id",
-    "type": "STRING"
-  },
-  {
-    "description": "The tensorflow version information.",
-    "fields": [
-      {
-        "description": "Version of the tensorflow. E.g. 1.7.0-rc0",
-        "mode": "REQUIRED",
-        "name": "version",
-        "type": "STRING"
-      },
-      {
-        "description": "Git Hash of the tensorflow",
-        "mode": "NULLABLE",
-        "name": "git_hash",
-        "type": "STRING"
-      },
-      {
-        "description": "The channel of the tensorflow binary, eg, nightly, RC, final, custom.",
-        "mode": "NULLABLE",
-        "name": "channel",
-        "type": "STRING"
-      },
-      {
-        "description": "Identify anything special about the build, eg CUDA 10, NCCL, MKL, etc.",
-        "mode": "NULLABLE",
-        "name": "build_type",
-        "type": "STRING"
-      }
-    ],
-    "mode": "REQUIRED",
-    "name": "tensorflow_version",
-    "type": "RECORD"
-  },
-  {
-    "description": "The arbitrary attribute of the model.",
-    "fields": [
-      {
-        "description": "The name of the attribute.",
-        "mode": "REQUIRED",
-        "name": "name",
-        "type": "STRING"
-      },
-      {
-        "description": "The value of the attribute.",
-        "mode": "NULLABLE",
-        "name": "value",
-        "type": "STRING"
-      }
-    ],
-    "mode": "REPEATED",
-    "name": "attribute",
-    "type": "RECORD"
-  },
-  {
-    "description": "Environment variables when the benchmark run is executed.",
-    "fields": [
-      {
-        "description": "The name of the variable.",
-        "mode": "REQUIRED",
-        "name": "name",
-        "type": "STRING"
-      },
-      {
-        "description": "The value of the variable.",
-        "mode": "NULLABLE",
-        "name": "value",
-        "type": "STRING"
-      }
-    ],
-    "mode": "REPEATED",
-    "name": "environment_variable",
-    "type": "RECORD"
-  },
-  {
-    "description": "TF Environment variables when the benchmark run is executed.",
-    "fields": [
-      {
-        "description": "The name of the variable.",
-        "mode": "REQUIRED",
-        "name": "name",
-        "type": "STRING"
-      },
-      {
-        "description": "The value of the variable.",
-        "mode": "NULLABLE",
-        "name": "value",
-        "type": "STRING"
-      }
-    ],
-    "mode": "REPEATED",
-    "name": "tensorflow_environment_variables",
-    "type": "RECORD"
-  },
-  {
-    "description": "The list of parameters run with the model. It could contain hyperparameters or others.",
-    "fields": [
-      {
-        "description": "The name of the parameter.",
-        "mode": "REQUIRED",
-        "name": "name",
-        "type": "STRING"
-      },
-      {
-        "description": "The string value of the parameter.",
-        "mode": "NULLABLE",
-        "name": "string_value",
-        "type": "STRING"
-      },
-      {
-        "description": "The bool value of the parameter.",
-        "mode": "NULLABLE",
-        "name": "bool_value",
-        "type": "STRING"
-      },
-      {
-        "description": "The int/long value of the parameter.",
-        "mode": "NULLABLE",
-        "name": "long_value",
-        "type": "INTEGER"
-      },
-      {
-        "description": "The double/float value of parameter.",
-        "mode": "NULLABLE",
-        "name": "float_value",
-        "type": "FLOAT"
-      }
-    ],
-    "mode": "REPEATED",
-    "name": "run_parameters",
-    "type": "RECORD"
-  },
-  {
-    "description": "The dataset that the benchmark runs with.",
-    "mode": "NULLABLE",
-    "name": "dataset",
-    "type": "RECORD",
-    "fields": [
-      {
-        "description": "The name of the dataset that the model is trained/validated with. E.g. ImageNet, mnist.",
-        "mode": "REQUIRED",
-        "name": "name",
-        "type": "STRING"
-      },
-      {
-        "description": "The arbitrary attribute of the dataset.",
-        "fields": [
-          {
-            "description": "The name of the attribute.",
-            "mode": "REQUIRED",
-            "name": "name",
-            "type": "STRING"
-          },
-          {
-            "description": "The value of the attribute.",
-            "mode": "NULLABLE",
-            "name": "value",
-            "type": "STRING"
-          }
-        ],
-        "mode": "REPEATED",
-        "name": "attribute",
-        "type": "RECORD"
-      }
-    ]
-  },
-  {
-    "description": "Used to differentiate from AWS, GCE or DGX-1 at a high level",
-    "mode": "NULLABLE",
-    "name": "test_environment",
-    "type": "STRING"
-  },
-  {
-    "description": "The machine configuration of the benchmark run.",
-    "mode": "NULLABLE",
-    "name": "machine_config",
-    "type": "RECORD",
-    "fields": [
-      {
-        "description": "The platform information of the benchmark run.",
-        "mode": "NULLABLE",
-        "name": "platform_info",
-        "type": "RECORD",
-        "fields": [
-          {
-            "description": "Eg: 64bit.",
-            "mode": "NULLABLE",
-            "name": "bits",
-            "type": "STRING"
-          },
-          {
-            "description": "Eg: ELF.",
-            "mode": "NULLABLE",
-            "name": "linkage",
-            "type": "STRING"
-          },
-          {
-            "description": "Eg: i386.",
-            "mode": "NULLABLE",
-            "name": "machine",
-            "type": "STRING"
-          },
-          {
-            "description": "Eg: 3.13.0-76-generic.",
-            "mode": "NULLABLE",
-            "name": "release",
-            "type": "STRING"
-          },
-          {
-            "description": "Eg: Linux.",
-            "mode": "NULLABLE",
-            "name": "system",
-            "type": "STRING"
-          },
-          {
-            "description": "Eg: #120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016.",
-            "mode": "NULLABLE",
-            "name": "version",
-            "type": "STRING"
-          }
-        ]
-      },
-      {
-        "description": "The CPU information of the benchmark run.",
-        "mode": "NULLABLE",
-        "name": "cpu_info",
-        "type": "RECORD",
-        "fields": [
-          {
-            "mode": "NULLABLE",
-            "name": "num_cores",
-            "type": "INTEGER"
-          },
-          {
-            "mode": "NULLABLE",
-            "name": "num_cores_allowed",
-            "type": "INTEGER"
-          },
-          {
-            "description" : "How fast are those CPUs.",
-            "mode": "NULLABLE",
-            "name": "mhz_per_cpu",
-            "type": "FLOAT"
-          },
-          {
-            "description" : "Additional CPU info, Eg: Intel Ivybridge with HyperThreading (24 cores).",
-            "mode": "NULLABLE",
-            "name": "cpu_info",
-            "type": "STRING"
-          },
-          {
-            "description" : "What kind of cpu scaling is enabled on the host. Eg performance, ondemand, conservative, mixed.",
-            "mode": "NULLABLE",
-            "name": "cpu_governor",
-            "type": "STRING"
-          },
-          {
-            "description": "Cache size of the CPUs.",
-            "mode": "NULLABLE",
-            "name": "cache_size",
-            "type": "RECORD",
-            "fields": [
-              {
-                "mode": "NULLABLE",
-                "name": "level",
-                "type": "STRING"
-              },
-              {
-                "mode": "NULLABLE",
-                "name": "size",
-                "type": "INTEGER"
-              }
-            ]
-          }
-        ]
-      },
-      {
-        "mode": "NULLABLE",
-        "name": "gpu_info",
-        "type": "RECORD",
-        "fields": [
-          {
-            "mode": "NULLABLE",
-            "name": "count",
-            "type": "INTEGER"
-          },
-          {
-            "mode": "NULLABLE",
-            "name": "model",
-            "type": "STRING"
-          },
-          {
-            "mode": "NULLABLE",
-            "name": "cuda_version",
-            "type": "STRING"
-          }
-        ]
-      },
-      {
-        "description": "The cloud instance information if the benchmark run is executed on cloud",
-        "mode": "NULLABLE",
-        "name": "cloud_info",
-        "type": "RECORD",
-        "fields": [
-          {
-            "description": "The instance type, E.g. n1-standard-4.",
-            "mode": "NULLABLE",
-            "name": "instance_type",
-            "type": "STRING"
-          },
-          {
-            "description": "The arbitrary attribute of the cloud info.",
-            "fields": [
-              {
-                "description": "The name of the attribute.",
-                "mode": "REQUIRED",
-                "name": "name",
-                "type": "STRING"
-              },
-              {
-                "description": "The value of the attribute.",
-                "mode": "NULLABLE",
-                "name": "value",
-                "type": "STRING"
-              }
-            ],
-            "mode": "REPEATED",
-            "name": "attribute",
-            "type": "RECORD"
-          }
-        ]
-      },
-      {
-        "mode": "NULLABLE",
-        "name": "memory_total",
-        "type": "INTEGER"
-      },
-      {
-        "mode": "NULLABLE",
-        "name": "memory_available",
-        "type": "STRING"
-      }
-    ]
-  }
-]
--- a/official/benchmark/datastore/schema/benchmark_run_status.json
+++ b/official/benchmark/datastore/schema/benchmark_run_status.json
-[
-  {
-    "description": "The UUID of the run for the benchmark.",
-    "mode": "REQUIRED",
-    "name": "run_id",
-    "type": "STRING"
-  },
-  {
-    "description": "The status of the run for the benchmark. Eg, running, failed, success",
-    "mode": "REQUIRED",
-    "name": "status",
-    "type": "STRING"
-  }
-]
\ No newline at end of file
--- a/official/benchmark/keras_benchmark.py
+++ b/official/benchmark/keras_benchmark.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes Keras benchmarks and accuracy tests."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import tensorflow as tf
-from official.benchmark.perfzero_benchmark import PerfZeroBenchmark
-from official.utils.flags import core as flags_core
-
-
-class KerasBenchmark(PerfZeroBenchmark):
-  """Base benchmark class with methods to simplify testing."""
-
-  def __init__(self,
-               output_dir=None,
-               default_flags=None,
-               flag_methods=None,
-               tpu=None):
-    """Forwards all constructor arguments to PerfZeroBenchmark."""
-    super(KerasBenchmark, self).__init__(
-        output_dir=output_dir,
-        default_flags=default_flags,
-        flag_methods=flag_methods,
-        tpu=tpu)
-
-  def _report_benchmark(self,
-                        stats,
-                        wall_time_sec,
-                        top_1_max=None,
-                        top_1_min=None,
-                        log_steps=None,
-                        total_batch_size=None,
-                        warmup=1,
-                        start_time_sec=None):
-    """Report benchmark results by writing to local protobuf file.
-
-    Args:
-      stats: dict returned from keras models with known entries.
-      wall_time_sec: the duration of the benchmark execution in seconds
-      top_1_max: highest passing level for top_1 accuracy.
-      top_1_min: lowest passing level for top_1 accuracy.
-      log_steps: How often the log was created for stats['step_timestamp_log'].
-      total_batch_size: Global batch-size.
-      warmup: number of entries in stats['step_timestamp_log'] to ignore.
-      start_time_sec: the start time of the program in seconds since epoch
-    """
-
-    metrics = []
-    if 'accuracy_top_1' in stats:
-      metrics.append({'name': 'accuracy_top_1',
-                      'value': stats['accuracy_top_1'],
-                      'min_value': top_1_min,
-                      'max_value': top_1_max})
-      metrics.append({'name': 'top_1_train_accuracy',
-                      'value': stats['training_accuracy_top_1']})
-
-    # At least two entries past the warmup offset are required: with exactly
-    # warmup + 1 entries the measured window is empty and the elapsed time is
-    # zero, which previously raised ZeroDivisionError.
-    if (warmup and 'step_timestamp_log' in stats and
-        len(stats['step_timestamp_log']) > warmup + 1):
-      # first entry in the time_log is start of step 1. The rest of the
-      # entries are the end of each step recorded
-      time_log = stats['step_timestamp_log']
-      elapsed = time_log[-1].timestamp - time_log[warmup].timestamp
-      num_examples = (
-          total_batch_size * log_steps * (len(time_log) - warmup - 1))
-      # Skip the metric instead of dividing by a non-positive elapsed time.
-      if elapsed > 0:
-        examples_per_sec = num_examples / elapsed
-        metrics.append({'name': 'exp_per_second',
-                        'value': examples_per_sec})
-
-    if 'avg_exp_per_second' in stats:
-      metrics.append({'name': 'avg_exp_per_second',
-                      'value': stats['avg_exp_per_second']})
-
-    # An empty timestamp log has no first entry to measure startup from.
-    if (start_time_sec and 'step_timestamp_log' in stats and
-        stats['step_timestamp_log']):
-      time_log = stats['step_timestamp_log']
-      # time_log[0] is recorded at the beginning of the first step.
-      startup_time = time_log[0].timestamp - start_time_sec
-      metrics.append({'name': 'startup_time', 'value': startup_time})
-
-    flags_str = flags_core.get_nondefault_flags_as_str()
-    self.report_benchmark(
-        iters=-1,
-        wall_time=wall_time_sec,
-        metrics=metrics,
-        extras={'flags': flags_str})
--- a/official/benchmark/keras_cifar_benchmark.py
+++ b/official/benchmark/keras_cifar_benchmark.py
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes Keras benchmarks and accuracy tests."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import time
-from absl import flags
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.benchmark import keras_benchmark
-from official.benchmark import benchmark_wrappers
-from official.benchmark.models import resnet_cifar_main
-
-MIN_TOP_1_ACCURACY = 0.929
-MAX_TOP_1_ACCURACY = 0.938
-
-FLAGS = flags.FLAGS
-CIFAR_DATA_DIR_NAME = 'cifar-10-batches-bin'
-
-
-class Resnet56KerasAccuracy(keras_benchmark.KerasBenchmark):
-  """Accuracy tests for ResNet56 Keras CIFAR-10."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """A benchmark class.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-
-    self.data_dir = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
-    flag_methods = [resnet_cifar_main.define_cifar_flags]
-
-    super(Resnet56KerasAccuracy, self).__init__(
-        output_dir=output_dir, flag_methods=flag_methods)
-
-  def _setup(self):
-    super(Resnet56KerasAccuracy, self)._setup()
-    FLAGS.use_tensor_lr = False
-
-  def benchmark_graph_1_gpu(self):
-    """Test keras based model with Keras fit and distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
-    FLAGS.dtype = 'fp32'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu(self):
-    """Test keras based model with eager and distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu(self):
-    """Test keras based model on CPU."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_dist_strat(self):
-    """Test keras based model on CPU without distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_dist_strat_run_eagerly(self):
-    """Test keras based model on CPU w/forced eager and no dist_strat."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_cpu_no_dist_strat_run_eagerly')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Test keras based model with eager and no dist strat."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
-    """Test keras based model w/forced eager and no dist_strat."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_1_gpu_no_dist_strat(self):
-    """Test keras based model with Keras fit but not distribution strategies."""
-    self._setup()
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.num_gpus = 1
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
-    FLAGS.dtype = 'fp32'
-    self._run_and_report_benchmark()
-
-  def benchmark_2_gpu(self):
-    """Test keras based model with eager and distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 2
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_2_gpu(self):
-    """Test keras based model with Keras fit and distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 2
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128
-    FLAGS.train_epochs = 182
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
-    FLAGS.dtype = 'fp32'
-    self._run_and_report_benchmark()
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = resnet_cifar_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    super(Resnet56KerasAccuracy, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        top_1_min=MIN_TOP_1_ACCURACY,
-        top_1_max=MAX_TOP_1_ACCURACY,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=100)
-
-
-class Resnet56KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
-  """Short performance tests for ResNet56 via Keras and CIFAR-10."""
-
-  def __init__(self, output_dir=None, default_flags=None):
-    flag_methods = [resnet_cifar_main.define_cifar_flags]
-
-    super(Resnet56KerasBenchmarkBase, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=default_flags)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = resnet_cifar_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    super(Resnet56KerasBenchmarkBase, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-  def benchmark_1_gpu(self):
-    """Test 1 gpu."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_xla(self):
-    """Test 1 gpu with xla enabled."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = False
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_xla')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_1_gpu(self):
-    """Test 1 gpu graph."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = False
-    FLAGS.run_eagerly = False
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Test 1 gpu without distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_1_gpu_no_dist_strat(self):
-    """Test 1 gpu graph mode without distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = False
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_1_gpu_no_dist_strat')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
-    """Test 1 gpu without distribution strategy and forced eager."""
-    self._setup()
-    FLAGS.num_gpus = 1
-    FLAGS.batch_size = 128
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    self._run_and_report_benchmark()
-
-  def benchmark_2_gpu(self):
-    """Test 2 gpu."""
-    self._setup()
-    FLAGS.num_gpus = 2
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = False
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_2_gpu')
-    FLAGS.batch_size = 128 * 2  # 2 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_2_gpu(self):
-    """Test 2 gpu graph mode."""
-    self._setup()
-    FLAGS.num_gpus = 2
-    FLAGS.enable_eager = False
-    FLAGS.run_eagerly = False
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_2_gpu')
-    FLAGS.batch_size = 128 * 2  # 2 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu(self):
-    """Test cpu."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.enable_eager = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_cpu')
-    FLAGS.batch_size = 128
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_cpu(self):
-    """Test cpu graph mode."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.enable_eager = False
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu')
-    FLAGS.batch_size = 128
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_dist_strat_run_eagerly(self):
-    """Test cpu without distribution strategy and forced eager."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_cpu_no_dist_strat_run_eagerly')
-    FLAGS.batch_size = 128
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-  def benchmark_cpu_no_dist_strat(self):
-    """Test cpu without distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir('benchmark_cpu_no_dist_strat')
-    FLAGS.batch_size = 128
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-  def benchmark_graph_cpu_no_dist_strat(self):
-    """Test cpu graph mode without distribution strategies."""
-    self._setup()
-    FLAGS.num_gpus = 0
-    FLAGS.enable_eager = False
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir('benchmark_graph_cpu_no_dist_strat')
-    FLAGS.batch_size = 128
-    FLAGS.data_format = 'channels_last'
-    self._run_and_report_benchmark()
-
-
-class Resnet56KerasBenchmarkSynth(Resnet56KerasBenchmarkBase):
-  """Synthetic benchmarks for ResNet56 and Keras."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    default_flags = {}
-    default_flags['skip_eval'] = True
-    default_flags['use_synthetic_data'] = True
-    default_flags['train_steps'] = 110
-    default_flags['log_steps'] = 10
-    default_flags['use_tensor_lr'] = False
-
-    super(Resnet56KerasBenchmarkSynth, self).__init__(
-        output_dir=output_dir, default_flags=default_flags)
-
-
-class Resnet56KerasBenchmarkReal(Resnet56KerasBenchmarkBase):
-  """Real data benchmarks for ResNet56 and Keras."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    default_flags = {}
-    default_flags['skip_eval'] = True
-    default_flags['data_dir'] = os.path.join(root_data_dir, CIFAR_DATA_DIR_NAME)
-    default_flags['train_steps'] = 110
-    default_flags['log_steps'] = 10
-    default_flags['use_tensor_lr'] = False
-
-    super(Resnet56KerasBenchmarkReal, self).__init__(
-        output_dir=output_dir, default_flags=default_flags)
-
-
-if __name__ == '__main__':
-  tf.test.main()
--- a/official/benchmark/keras_imagenet_benchmark.py
+++ b/official/benchmark/keras_imagenet_benchmark.py
-# Lint as: python3
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Executes Keras benchmarks and accuracy tests."""
-# pylint: disable=line-too-long
-from __future__ import print_function
-
-import json
-import os
-import time
-
-from typing import Any, MutableMapping, Optional
-
-from absl import flags
-import tensorflow as tf  # pylint: disable=g-bad-import-order
-
-from official.benchmark import benchmark_wrappers
-from official.benchmark import keras_benchmark
-from official.benchmark.models import resnet_imagenet_main
-from official.vision.image_classification import classifier_trainer
-
-MIN_TOP_1_ACCURACY = 0.76
-MAX_TOP_1_ACCURACY = 0.77
-
-MOBILENET_V1_MIN_TOP_1_ACCURACY = 0.65
-MOBILENET_V1_MAX_TOP_1_ACCURACY = 0.68
-
-# Range of top-1 accracies for model optimization techniques.
-# Each item indicates (MIN_TOP_1_ACCURACY, MAX_TOP_1_ACCURACY).
-MODEL_OPTIMIZATION_TOP_1_ACCURACY = {
-    'RESNET50_FINETUNE_PRUNING': (0.76, 0.77),
-    'MOBILENET_V1_FINETUNE_PRUNING': (0.67, 0.68),
-}
-
-FLAGS = flags.FLAGS
-
-
-def _get_classifier_parameters(
-    num_gpus: int = 0,
-    builder: str = 'records',
-    skip_eval: bool = False,
-    distribution_strategy: str = 'mirrored',
-    per_replica_batch_size: int = 128,
-    epochs: int = 90,
-    steps: int = 0,
-    epochs_between_evals: int = 1,
-    dtype: str = 'float32',
-    enable_xla: bool = False,
-    run_eagerly: bool = False,
-    gpu_thread_mode: Optional[str] = None,
-    dataset_num_private_threads: Optional[int] = None,
-    loss_scale: Optional[str] = None,
-    report_metrics: bool = True,
-    batchnorm_spatial_persistent: bool = False) -> MutableMapping[str, Any]:
-  """Gets classifier trainer's ResNet parameters."""
-  return {
-      'runtime': {
-          'num_gpus': num_gpus,
-          'distribution_strategy': distribution_strategy,
-          'run_eagerly': run_eagerly,
-          'enable_xla': enable_xla,
-          'dataset_num_private_threads': dataset_num_private_threads,
-          'gpu_thread_mode': gpu_thread_mode,
-          'loss_scale': loss_scale,
-          'batchnorm_spatial_persistent': batchnorm_spatial_persistent,
-      },
-      'train_dataset': {
-          'builder': builder,
-          'use_per_replica_batch_size': True,
-          'batch_size': per_replica_batch_size,
-          'image_size': 224,
-          'dtype': dtype,
-      },
-      'validation_dataset': {
-          'builder': builder,
-          'batch_size': per_replica_batch_size,
-          'use_per_replica_batch_size': True,
-          'image_size': 224,
-          'dtype': dtype,
-      },
-      'train': {
-          'epochs': epochs,
-          'steps': steps,
-          'callbacks': {
-              'enable_tensorboard': False,
-              'enable_checkpoint_and_export': False,
-              'enable_time_history': True,
-          },
-          'metrics': ['accuracy'] if report_metrics else [],
-      },
-      'model': {
-          'loss': {
-              'label_smoothing': 0.1,
-          },
-      },
-      'evaluation': {
-          'epochs_between_evals': epochs_between_evals,
-          'skip_eval': skip_eval,
-      },
-  }
-
-
-class Resnet50KerasAccuracy(keras_benchmark.KerasBenchmark):
-  """Benchmark accuracy tests for ResNet50 in Keras."""
-
-  def __init__(self,
-               output_dir: Optional[str] = None,
-               root_data_dir: Optional[str] = None,
-               **kwargs):
-    """A benchmark class.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-
-    flag_methods = [classifier_trainer.define_classifier_flags]
-
-    self.data_dir = os.path.join(root_data_dir, 'imagenet')
-    super(Resnet50KerasAccuracy, self).__init__(
-        output_dir=output_dir, flag_methods=flag_methods)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(
-      self,
-      experiment_name: str,
-      top_1_min: float = MIN_TOP_1_ACCURACY,
-      top_1_max: float = MAX_TOP_1_ACCURACY,
-      num_gpus: int = 0,
-      distribution_strategy: str = 'mirrored',
-      per_replica_batch_size: int = 128,
-      epochs: int = 90,
-      steps: int = 0,
-      epochs_between_evals: int = 1,
-      dtype: str = 'float32',
-      enable_xla: bool = False,
-      run_eagerly: bool = False,
-      gpu_thread_mode: Optional[str] = None,
-      dataset_num_private_threads: Optional[int] = None,
-      loss_scale: Optional[str] = None):
-    """Runs and reports the benchmark given the provided configuration."""
-    FLAGS.model_type = 'resnet'
-    FLAGS.dataset = 'imagenet'
-    FLAGS.mode = 'train_and_eval'
-    FLAGS.data_dir = self.data_dir
-    FLAGS.model_dir = self._get_model_dir(experiment_name)
-    parameters = _get_classifier_parameters(
-        num_gpus=num_gpus,
-        distribution_strategy=distribution_strategy,
-        per_replica_batch_size=per_replica_batch_size,
-        epochs=epochs,
-        steps=steps,
-        epochs_between_evals=epochs_between_evals,
-        dtype=dtype,
-        enable_xla=enable_xla,
-        run_eagerly=run_eagerly,
-        gpu_thread_mode=gpu_thread_mode,
-        dataset_num_private_threads=dataset_num_private_threads,
-        report_metrics=True,
-        loss_scale=loss_scale,
-        batchnorm_spatial_persistent=True)
-    FLAGS.params_override = json.dumps(parameters)
-    total_batch_size = num_gpus * per_replica_batch_size
-
-    start_time_sec = time.time()
-    stats = classifier_trainer.run(flags.FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    super(Resnet50KerasAccuracy, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        top_1_min=top_1_min,
-        top_1_max=top_1_max,
-        total_batch_size=total_batch_size,
-        log_steps=100)
-
-  def benchmark_8_gpu(self):
-    """Tests Keras model with eager, dist_strat and 8 GPUs."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8_gpu',
-        num_gpus=8,
-        per_replica_batch_size=128,
-        epochs=90,
-        epochs_between_evals=10,
-        dtype='float32')
-
-  def benchmark_8_gpu_fp16(self):
-    """Tests Keras model with eager, dist_strat, 8 GPUs, and fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8_gpu_fp16',
-        num_gpus=8,
-        per_replica_batch_size=256,
-        epochs=90,
-        epochs_between_evals=10,
-        dtype='float16')
-
-  def benchmark_xla_8_gpu_fp16(self):
-    """Tests Keras model with XLA, eager, dist_strat, 8 GPUs and fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu_fp16',
-        num_gpus=8,
-        per_replica_batch_size=256,
-        epochs=90,
-        epochs_between_evals=10,
-        dtype='float16',
-        enable_xla=True)
-
-  def benchmark_xla_8_gpu_fp16_dynamic(self):
-    """Tests Keras model with XLA, eager, dist_strat, 8 GPUs, dynamic fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu_fp16_dynamic',
-        top_1_min=0.736,
-        num_gpus=8,
-        per_replica_batch_size=256,
-        epochs=90,
-        epochs_between_evals=10,
-        dtype='float16',
-        loss_scale='dynamic')
-
-  def _get_model_dir(self, folder_name):
-    return os.path.join(self.output_dir, folder_name)
-
-
-class MobilenetV1KerasAccuracy(keras_benchmark.KerasBenchmark):
-  """Benchmark accuracy tests for MobilenetV1 in Keras."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """A benchmark class.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-
-    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
-
-    self.data_dir = os.path.join(root_data_dir, 'imagenet')
-    super(MobilenetV1KerasAccuracy, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags={
-            'model': 'mobilenet',
-            'optimizer': 'mobilenet_default',
-            'initial_learning_rate_per_sample': 0.00039,
-        })
-
-  def benchmark_8_gpu(self):
-    """Test Keras model with eager, dist_strat and 8 GPUs."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.data_dir = self.data_dir
-    FLAGS.batch_size = 128 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    self._run_and_report_benchmark()
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                top_1_min=MOBILENET_V1_MIN_TOP_1_ACCURACY,
-                                top_1_max=MOBILENET_V1_MAX_TOP_1_ACCURACY):
-    start_time_sec = time.time()
-    stats = resnet_imagenet_main.run(flags.FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    super(MobilenetV1KerasAccuracy, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        top_1_min=top_1_min,
-        top_1_max=top_1_max,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=100)
-
-  def _get_model_dir(self, folder_name):
-    return os.path.join(self.output_dir, folder_name)
-
-
-class Resnet50KerasClassifierBenchmarkBase(keras_benchmark.KerasBenchmark):
-  """Resnet50 (classifier_trainer) benchmarks."""
-
-  def __init__(self, output_dir=None, default_flags=None,
-               tpu=None, dataset_builder='records', train_epochs=1,
-               train_steps=110, data_dir=None):
-    flag_methods = [classifier_trainer.define_classifier_flags]
-
-    self.dataset_builder = dataset_builder
-    self.train_epochs = train_epochs
-    self.train_steps = train_steps
-    self.data_dir = data_dir
-
-    super(Resnet50KerasClassifierBenchmarkBase, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=default_flags,
-        tpu=tpu)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(
-      self,
-      experiment_name: str,
-      skip_steps: Optional[int] = None,
-      top_1_min: float = MIN_TOP_1_ACCURACY,
-      top_1_max: float = MAX_TOP_1_ACCURACY,
-      num_gpus: int = 0,
-      num_tpus: int = 0,
-      distribution_strategy: str = 'mirrored',
-      per_replica_batch_size: int = 128,
-      epochs_between_evals: int = 1,
-      dtype: str = 'float32',
-      enable_xla: bool = False,
-      run_eagerly: bool = False,
-      gpu_thread_mode: Optional[str] = None,
-      dataset_num_private_threads: Optional[int] = None,
-      loss_scale: Optional[str] = None):
-    """Runs and reports the benchmark given the provided configuration."""
-    FLAGS.model_type = 'resnet'
-    FLAGS.dataset = 'imagenet'
-    FLAGS.mode = 'train_and_eval'
-    FLAGS.data_dir = self.data_dir
-    FLAGS.model_dir = self._get_model_dir(experiment_name)
-    parameters = _get_classifier_parameters(
-        builder=self.dataset_builder,
-        skip_eval=True,
-        num_gpus=num_gpus,
-        distribution_strategy=distribution_strategy,
-        per_replica_batch_size=per_replica_batch_size,
-        epochs=self.train_epochs,
-        steps=self.train_steps,
-        epochs_between_evals=epochs_between_evals,
-        dtype=dtype,
-        enable_xla=enable_xla,
-        gpu_thread_mode=gpu_thread_mode,
-        dataset_num_private_threads=dataset_num_private_threads,
-        loss_scale=loss_scale,
-        report_metrics=False,
-        batchnorm_spatial_persistent=True)
-    FLAGS.params_override = json.dumps(parameters)
-    if distribution_strategy == 'tpu':
-      total_batch_size = num_tpus * per_replica_batch_size
-    else:
-      total_batch_size = num_gpus * per_replica_batch_size
-
-    start_time_sec = time.time()
-    stats = classifier_trainer.run(flags.FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-    # Number of logged step time entries that are excluded in performance
-    # report. We keep results from last 100 batches, or skip the steps based on
-    # input skip_steps.
-    warmup = (skip_steps or (self.train_steps - 100)) // FLAGS.log_steps
-
-    super(Resnet50KerasClassifierBenchmarkBase, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=total_batch_size,
-        log_steps=FLAGS.log_steps,
-        warmup=warmup,
-        start_time_sec=start_time_sec)
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Tests Keras model with 1 GPU, no distribution strategy."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_1_gpu_no_dist_strat',
-        num_gpus=1,
-        distribution_strategy='off',
-        per_replica_batch_size=128)
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
-    """Tests Keras model with 1 GPU, no distribution strategy, run eagerly."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly',
-        num_gpus=1,
-        run_eagerly=True,
-        distribution_strategy='off',
-        per_replica_batch_size=64)
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
-    """Tests with 1 GPU, no distribution strategy, fp16, run eagerly."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_1_gpu_no_dist_strat_run_eagerly_fp16',
-        num_gpus=1,
-        run_eagerly=True,
-        distribution_strategy='off',
-        dtype='float16',
-        per_replica_batch_size=128)
-
-  def benchmark_1_gpu(self):
-    """Tests Keras model with 1 GPU."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_1_gpu',
-        num_gpus=1,
-        distribution_strategy='one_device',
-        per_replica_batch_size=128)
-
-  def benchmark_xla_1_gpu(self):
-    """Tests Keras model with XLA and 1 GPU."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_1_gpu',
-        num_gpus=1,
-        enable_xla=True,
-        distribution_strategy='one_device',
-        per_replica_batch_size=128)
-
-  def benchmark_1_gpu_fp16(self):
-    """Tests Keras model with 1 GPU and fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_1_gpu_fp16',
-        num_gpus=1,
-        distribution_strategy='one_device',
-        dtype='float16',
-        per_replica_batch_size=256)
-
-  def benchmark_1_gpu_fp16_dynamic(self):
-    """Tests Keras model with 1 GPU, fp16, and dynamic loss scaling."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_1_gpu_fp16_dynamic',
-        num_gpus=1,
-        distribution_strategy='one_device',
-        dtype='float16',
-        per_replica_batch_size=256,
-        loss_scale='dynamic')
-
-  def benchmark_xla_1_gpu_fp16(self):
-    """Tests Keras model with XLA, 1 GPU and fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_1_gpu_fp16',
-        num_gpus=1,
-        enable_xla=True,
-        distribution_strategy='one_device',
-        dtype='float16',
-        per_replica_batch_size=256)
-
-  def benchmark_xla_1_gpu_fp16_tweaked(self):
-    """Tests Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_1_gpu_fp16_tweaked',
-        num_gpus=1,
-        enable_xla=True,
-        distribution_strategy='one_device',
-        dtype='float16',
-        per_replica_batch_size=256,
-        gpu_thread_mode='gpu_private')
-
-  def benchmark_xla_1_gpu_fp16_dynamic(self):
-    """Tests Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_1_gpu_fp16_dynamic',
-        num_gpus=1,
-        enable_xla=True,
-        distribution_strategy='one_device',
-        dtype='float16',
-        per_replica_batch_size=256,
-        loss_scale='dynamic')
-
-  def benchmark_8_gpu(self):
-    """Tests Keras model with 8 GPUs."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8_gpu',
-        num_gpus=8,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=128)
-
-  def benchmark_8_gpu_tweaked(self):
-    """Tests Keras model with manual config tuning and 8 GPUs."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8_gpu_tweaked',
-        num_gpus=8,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=128,
-        dataset_num_private_threads=14)
-
-  def benchmark_xla_8_gpu(self):
-    """Tests Keras model with XLA and 8 GPUs."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu',
-        num_gpus=8,
-        enable_xla=True,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=128)
-
-  def benchmark_xla_8_gpu_tweaked(self):
-    """Tests Keras model with manual config tuning, 8 GPUs, and XLA."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu_tweaked',
-        num_gpus=8,
-        enable_xla=True,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=128,
-        gpu_thread_mode='gpu_private',
-        dataset_num_private_threads=24)
-
-  def benchmark_8_gpu_fp16(self):
-    """Tests Keras model with 8 GPUs and fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8_gpu_fp16',
-        num_gpus=8,
-        dtype='float16',
-        distribution_strategy='mirrored',
-        per_replica_batch_size=256)
-
-  def benchmark_8_gpu_fp16_tweaked(self):
-    """Tests Keras model with 8 GPUs, fp16, and manual config tuning."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8_gpu_fp16_tweaked',
-        num_gpus=8,
-        dtype='float16',
-        distribution_strategy='mirrored',
-        per_replica_batch_size=256,
-        gpu_thread_mode='gpu_private',
-        dataset_num_private_threads=40)
-
-  def benchmark_8_gpu_fp16_dynamic_tweaked(self):
-    """Tests Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8_gpu_fp16_dynamic_tweaked',
-        num_gpus=8,
-        dtype='float16',
-        distribution_strategy='mirrored',
-        per_replica_batch_size=256,
-        loss_scale='dynamic',
-        gpu_thread_mode='gpu_private',
-        dataset_num_private_threads=40)
-
-  def benchmark_xla_8_gpu_fp16(self):
-    """Tests Keras model with XLA, 8 GPUs and fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu_fp16',
-        dtype='float16',
-        num_gpus=8,
-        enable_xla=True,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=256)
-
-  def benchmark_xla_8_gpu_fp16_tweaked(self):
-    """Test Keras model with manual config tuning, XLA, 8 GPUs and fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu_fp16_tweaked',
-        dtype='float16',
-        num_gpus=8,
-        enable_xla=True,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=256,
-        gpu_thread_mode='gpu_private',
-        dataset_num_private_threads=48)
-
-  def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
-    """Tests with manual config tuning, XLA, 8 GPUs and fp16.
-
-    Delay performance measurement for stable performance on 96 vCPU platforms.
-    """
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu_fp16_tweaked_delay_measure',
-        dtype='float16',
-        num_gpus=8,
-        enable_xla=True,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=256,
-        gpu_thread_mode='gpu_private',
-        dataset_num_private_threads=48,
-        steps=310)
-
-  def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
-    """Tests Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_xla_8_gpu_fp16_dynamic_tweaked',
-        dtype='float16',
-        num_gpus=8,
-        enable_xla=True,
-        distribution_strategy='mirrored',
-        per_replica_batch_size=256,
-        gpu_thread_mode='gpu_private',
-        loss_scale='dynamic',
-        dataset_num_private_threads=48)
-
-  def benchmark_2x2_tpu_bf16(self):
-    """Test Keras model with 2x2 TPU, bf16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_2x2_tpu_bf16',
-        dtype='bfloat16',
-        num_tpus=8,
-        distribution_strategy='tpu',
-        per_replica_batch_size=128)
-
-  def benchmark_4x4_tpu_bf16(self):
-    """Test Keras model with 4x4 TPU, bf16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_4x4_tpu_bf16',
-        dtype='bfloat16',
-        num_tpus=32,
-        distribution_strategy='tpu',
-        per_replica_batch_size=128)
-
-  def benchmark_8x8_tpu_bf16(self):
-    """Test Keras model with 8x8 TPU, bf16."""
-    self._setup()
-    self._run_and_report_benchmark(
-        experiment_name='benchmark_8x8_tpu_bf16',
-        dtype='bfloat16',
-        num_tpus=128,
-        distribution_strategy='tpu',
-        per_replica_batch_size=64)
-
-  def fill_report_object(self, stats):
-    super(Resnet50KerasClassifierBenchmarkBase, self).fill_report_object(
-        stats,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-
-class Resnet50KerasBenchmarkBase(keras_benchmark.KerasBenchmark):
-  """Resnet50 benchmarks."""
-
-  def __init__(self, output_dir=None, default_flags=None, tpu=None):
-    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
-
-    super(Resnet50KerasBenchmarkBase, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=default_flags,
-        tpu=tpu)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self, skip_steps=None):
-    start_time_sec = time.time()
-    stats = resnet_imagenet_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-    # Number of logged step time entries that are excluded in performance
-    # report. We keep results from last 100 batches, or skip the steps based on
-    # input skip_steps.
-    warmup = (skip_steps or (FLAGS.train_steps - 100)) // FLAGS.log_steps
-
-    super(Resnet50KerasBenchmarkBase, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps,
-        warmup=warmup,
-        start_time_sec=start_time_sec)
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Test Keras model with 1 GPU, no distribution strategy."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
-    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly')
-    FLAGS.batch_size = 64
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
-    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.explicit_gpu_placement = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
-    FLAGS.batch_size = 64
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
-    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
-    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.explicit_gpu_placement = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu(self):
-    """Test Keras model with 1 GPU."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_amp(self):
-    """Test Keras model with 1 GPU with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
-    FLAGS.batch_size = 256
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu(self):
-    """Test Keras model with XLA and 1 GPU."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
-    FLAGS.batch_size = 128
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_amp(self):
-    """Test Keras model with XLA and 1 GPU with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
-    FLAGS.batch_size = 256
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_fp16(self):
-    """Test Keras model with 1 GPU and fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_fp16_dynamic(self):
-    """Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    FLAGS.loss_scale = 'dynamic'
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_fp16(self):
-    """Test Keras model with XLA, 1 GPU and fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_fp16_tweaked(self):
-    """Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_fp16_dynamic(self):
-    """Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    FLAGS.loss_scale = 'dynamic'
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu(self):
-    """Test Keras model with 8 GPUs."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    FLAGS.batch_size = 128 * 8  # 8 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_amp(self):
-    """Test Keras model with 8 GPUs with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.enable_eager = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_amp')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_tweaked(self):
-    """Test Keras model with manual config tuning and 8 GPUs."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_tweaked')
-    FLAGS.batch_size = 128 * 8  # 8 GPUs
-    FLAGS.datasets_num_private_threads = 14
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu(self):
-    """Test Keras model with XLA and 8 GPUs."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu')
-    FLAGS.batch_size = 128 * 8  # 8 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_amp(self):
-    """Test Keras model with XLA and 8 GPUs with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.enable_eager = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_amp')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_tweaked(self):
-    """Test Keras model with manual config tuning, 8 GPUs, and XLA."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_tweaked')
-    FLAGS.batch_size = 128 * 8
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 24
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_fp16(self):
-    """Test Keras model with 8 GPUs and fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_fp16_tweaked(self):
-    """Test Keras model with 8 GPUs, fp16, and manual config tuning."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_fp16_tweaked')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.dataset_num_private_threads = 40
-    self._run_and_report_benchmark()
-
-  def benchmark_8_gpu_fp16_dynamic_tweaked(self):
-    """Test Keras model with 8 GPUs, fp16, dynamic loss scaling, and tuned."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_8_gpu_fp16_dynamic_tweaked')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.loss_scale = 'dynamic'
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.dataset_num_private_threads = 40
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_fp16(self):
-    """Test Keras model with XLA, 8 GPUs and fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_fp16_tweaked(self):
-    """Test Keras model with manual config tuning, XLA, 8 GPUs and fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_8_gpu_fp16_tweaked')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_fp16_tweaked_delay_measure(self):
-    """Test with manual config tuning, XLA, 8 GPUs and fp16.
-
-    Delay performance measurement for stable performance on 96 vCPU platforms.
-    """
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_xla_8_gpu_fp16_tweaked_delay_measure')
-    FLAGS.batch_size = 256 * 8
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
-    FLAGS.train_steps = 310
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_8_gpu_fp16_dynamic_tweaked(self):
-    """Test Keras model with config tuning, XLA, 8 GPUs and dynamic fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'mirrored'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_xla_8_gpu_fp16_dynamic_tweaked')
-    FLAGS.batch_size = 256 * 8  # 8 GPUs
-    FLAGS.loss_scale = 'dynamic'
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 48
-    self._run_and_report_benchmark()
-
-  def benchmark_2x2_tpu_bf16(self):
-    """Test Keras model with 2x2 TPU, bf16."""
-    self._setup()
-
-    FLAGS.dtype = 'bf16'
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.model_dir = self._get_model_dir('benchmark_2x2_tpu_bf16')
-    FLAGS.batch_size = 1024
-    self._run_and_report_benchmark()
-
-  def benchmark_4x4_tpu_bf16(self):
-    """Test Keras model with 4x4 TPU, bf16."""
-    self._setup()
-
-    FLAGS.dtype = 'bf16'
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.model_dir = self._get_model_dir('benchmark_4x4_tpu_bf16')
-    FLAGS.batch_size = 4096
-    self._run_and_report_benchmark()
-
-  def benchmark_8x8_tpu_bf16(self):
-    """Test Keras model with 8x8 TPU, bf16."""
-    self._setup()
-
-    FLAGS.dtype = 'bf16'
-    FLAGS.distribution_strategy = 'tpu'
-    FLAGS.model_dir = self._get_model_dir('benchmark_8x8_tpu_bf16')
-    FLAGS.batch_size = 8192
-    self._run_and_report_benchmark()
-
-  def fill_report_object(self, stats):
-    super(Resnet50KerasBenchmarkBase, self).fill_report_object(
-        stats,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-
-class Resnet50KerasBenchmarkSynth(Resnet50KerasClassifierBenchmarkBase):
-  """Resnet50 synthetic benchmark tests."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
-    def_flags = {}
-    def_flags['log_steps'] = 10
-
-    super(Resnet50KerasBenchmarkSynth, self).__init__(
-        output_dir=output_dir, default_flags=def_flags, tpu=tpu,
-        dataset_builder='synthetic', train_epochs=1, train_steps=110)
-
-
-class Resnet50KerasBenchmarkReal(Resnet50KerasClassifierBenchmarkBase):
-  """Resnet50 real data benchmark tests."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, tpu=None, **kwargs):
-    data_dir = os.path.join(root_data_dir, 'imagenet')
-    def_flags = {}
-    def_flags['log_steps'] = 10
-
-    super(Resnet50KerasBenchmarkReal, self).__init__(
-        output_dir=output_dir, default_flags=def_flags, tpu=tpu,
-        dataset_builder='records', train_epochs=1, train_steps=110,
-        data_dir=data_dir)
-
-
-class Resnet50KerasBenchmarkRemoteData(Resnet50KerasBenchmarkBase):
-  """Resnet50 real data (stored in remote storage) benchmark tests."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    def_flags = {}
-    def_flags['skip_eval'] = True
-    def_flags['report_accuracy_metrics'] = False
-    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
-    # Defining multiple epochs overrides the train_steps setting in benchmarks.
-    def_flags['train_epochs'] = 2
-    # Cache dataset so performance is stable after the first epoch.
-    def_flags['training_dataset_cache'] = True
-    def_flags['log_steps'] = 100
-    # Note that for single GPU and pure eager tests which are less likely to be
-    # input bound and more stable, these tests will run for shorter time by
-    # overriding FLAGS.train_epochs, train_seteps, log_steps in benchmark
-    # methods, and skip_steps in _run_and_report_benchmark().
-
-    super(Resnet50KerasBenchmarkRemoteData, self).__init__(
-        output_dir=output_dir, default_flags=def_flags)
-
-  def _override_flags_to_run_test_shorter(self):
-    FLAGS.train_epochs = 1
-    FLAGS.train_steps = 300
-    FLAGS.log_steps = 10
-
-  def benchmark_1_gpu_no_dist_strat(self):
-    """Test Keras model with 1 GPU, no distribution strategy."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_no_dist_strat')
-    FLAGS.batch_size = 128
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly(self):
-    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly')
-    FLAGS.batch_size = 64
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked(self):
-    """Test Keras model with 1 GPU, no distribution strategy, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.explicit_gpu_placement = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly_tweaked')
-    FLAGS.batch_size = 64
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16(self):
-    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 128
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked(self):
-    """Test with 1 GPU, no distribution strategy, fp16, run eagerly."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.run_eagerly = True
-    FLAGS.explicit_gpu_placement = True
-    FLAGS.distribution_strategy = 'off'
-    FLAGS.model_dir = self._get_model_dir(
-        'benchmark_1_gpu_no_dist_strat_run_eagerly_fp16_tweaked')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 128
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu(self):
-    """Test Keras model with 1 GPU."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu')
-    FLAGS.batch_size = 128
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_amp(self):
-    """Test Keras model with 1 GPU with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_amp')
-    FLAGS.batch_size = 256
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu(self):
-    """Test Keras model with XLA and 1 GPU."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu')
-    FLAGS.batch_size = 128
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_amp(self):
-    """Test Keras model with XLA and 1 GPU with automatic mixed precision."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.dtype = 'fp16'
-    FLAGS.fp16_implementation = 'graph_rewrite'
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_amp')
-    FLAGS.batch_size = 256
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_fp16(self):
-    """Test Keras model with 1 GPU and fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_1_gpu_fp16_dynamic(self):
-    """Test Keras model with 1 GPU, fp16, and dynamic loss scaling."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_1_gpu_fp16_dynamic')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    FLAGS.loss_scale = 'dynamic'
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_fp16(self):
-    """Test Keras model with XLA, 1 GPU and fp16."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_fp16_tweaked(self):
-    """Test Keras model with XLA, 1 GPU, fp16, and manual config tuning."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_tweaked')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  def benchmark_xla_1_gpu_fp16_dynamic(self):
-    """Test Keras model with XLA, 1 GPU, fp16, and dynamic loss scaling."""
-    self._setup()
-
-    FLAGS.num_gpus = 1
-    FLAGS.enable_eager = True
-    FLAGS.enable_xla = True
-    FLAGS.distribution_strategy = 'one_device'
-    FLAGS.model_dir = self._get_model_dir('benchmark_xla_1_gpu_fp16_dynamic')
-    FLAGS.dtype = 'fp16'
-    FLAGS.batch_size = 256
-    FLAGS.loss_scale = 'dynamic'
-    self._override_flags_to_run_test_shorter()
-    self._run_and_report_benchmark()
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    if FLAGS.num_gpus == 1 or FLAGS.run_eagerly:
-      # For single GPU and pure eager tests which are less likely to be input
-      # bound and more stable, run for shorter time and use the default
-      # skip_steps.
-      skip_steps = None
-    else:
-      # skip the first epoch for performance measurement.
-      skip_steps = 600
-    super(Resnet50KerasBenchmarkRemoteData,
-          self)._run_and_report_benchmark(skip_steps=skip_steps)
-
-
-class TrivialKerasBenchmarkReal(keras_benchmark.KerasBenchmark):
-  """Trivial model with real data benchmark tests."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
-
-    def_flags = {}
-    def_flags['use_trivial_model'] = True
-    def_flags['skip_eval'] = True
-    def_flags['report_accuracy_metrics'] = False
-    def_flags['dtype'] = 'fp16'
-    def_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
-    def_flags['train_steps'] = 600
-    def_flags['log_steps'] = 100
-    def_flags['distribution_strategy'] = 'mirrored'
-
-    super(TrivialKerasBenchmarkReal, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=def_flags)
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self):
-    start_time_sec = time.time()
-    stats = resnet_imagenet_main.run(FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    super(TrivialKerasBenchmarkReal, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-  def benchmark_8_gpu_warmup(self):
-    """Dummy test that runs over an epoch to warmup the machine."""
-    self._setup()
-
-    FLAGS.num_gpus = 8
-    FLAGS.enable_eager = True
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu_warmup')
-    FLAGS.batch_size = 256 * 8
-    FLAGS.train_steps = 700
-    self._run_and_report_benchmark()
-
-  def fill_report_object(self, stats):
-    super(TrivialKerasBenchmarkReal, self).fill_report_object(
-        stats,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=FLAGS.log_steps)
-
-
-class Resnet50MultiWorkerKerasAccuracy(keras_benchmark.KerasBenchmark):
-  """Distributed Resnet50 accuracy tests that run across multiple workers."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Records the dataset location and registers the flag definitions.
-
-    Args:
-      output_dir: directory passed on to the parent benchmark class.
-      root_data_dir: directory that contains the `imagenet` folder.
-      **kwargs: accepted for call compatibility; not used by this constructor.
-    """
-    imagenet_flag_definers = [classifier_trainer.define_imagenet_keras_flags]
-    self.data_dir = os.path.join(root_data_dir, 'imagenet')
-    parent = super(Resnet50MultiWorkerKerasAccuracy, self)
-    parent.__init__(
-        output_dir=output_dir, flag_methods=imagenet_flag_definers)
-
-  def _benchmark_common(self, eager, num_workers, all_reduce_alg):
-    """Sets the flags shared by every benchmark here, then runs the training.
-
-    Args:
-      eager: value assigned to `FLAGS.enable_eager`; also selects the 'eager'
-        or 'graph' part of the model directory name.
-      num_workers: worker count; scales the total batch size and is part of
-        the model directory name.
-      all_reduce_alg: value assigned to `FLAGS.all_reduce_alg`; also part of
-        the model directory name.
-    """
-    self._setup()
-
-    gpus_per_worker = 8
-    execution_mode = 'eager' if eager else 'graph'
-    run_name = 'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
-        execution_mode, num_workers, all_reduce_alg)
-
-    FLAGS.num_gpus = gpus_per_worker
-    FLAGS.data_dir = self.data_dir
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = eager
-    FLAGS.enable_xla = False
-    FLAGS.distribution_strategy = 'multi_worker_mirrored'
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 32
-    FLAGS.model_dir = self._get_model_dir(run_name)
-    FLAGS.batch_size = 256 * gpus_per_worker * num_workers
-    FLAGS.all_reduce_alg = all_reduce_alg
-
-    self._run_and_report_benchmark()
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                top_1_min=MIN_TOP_1_ACCURACY,
-                                top_1_max=MAX_TOP_1_ACCURACY):
-    """Times one `classifier_trainer.run` call and reports the outcome.
-
-    Args:
-      top_1_min: lower top-1 bound forwarded to `_report_benchmark`.
-      top_1_max: upper top-1 bound forwarded to `_report_benchmark`.
-    """
-    began_at = time.time()
-    run_stats = classifier_trainer.run(flags.FLAGS)
-    elapsed_sec = time.time() - began_at
-
-    parent = super(Resnet50MultiWorkerKerasAccuracy, self)
-    parent._report_benchmark(
-        run_stats,
-        elapsed_sec,
-        top_1_min=top_1_min,
-        top_1_max=top_1_max,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=100)
-
-  def _get_model_dir(self, folder_name):
-    """Returns `folder_name` joined onto this benchmark's output directory."""
-    model_dir = os.path.join(self.output_dir, folder_name)
-    return model_dir
-
-  def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
-    """Eager mode, 2 workers with 8 GPUs each, fp16, ring all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=2, all_reduce_alg='ring')
-
-  def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
-    """Eager mode, 2 workers with 8 GPUs each, fp16, nccl all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=2, all_reduce_alg='nccl')
-
-  def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
-    """Eager mode, 8 workers with 8 GPUs each, fp16, ring all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=8, all_reduce_alg='ring')
-
-  def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
-    """Eager mode, 8 workers with 8 GPUs each, fp16, nccl all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=8, all_reduce_alg='nccl')
-
-
-class Resnet50MultiWorkerKerasBenchmark(Resnet50KerasBenchmarkBase):
-  """Distributed Resnet50 benchmarks that run across multiple workers."""
-
-  def __init__(self, output_dir=None, default_flags=None):
-    """Passes both arguments straight through to the parent constructor.
-
-    Args:
-      output_dir: forwarded as `output_dir`.
-      default_flags: forwarded as `default_flags`.
-    """
-    parent = super(Resnet50MultiWorkerKerasBenchmark, self)
-    parent.__init__(output_dir=output_dir, default_flags=default_flags)
-
-  def _benchmark_common(self, eager, num_workers, all_reduce_alg):
-    """Sets the flags shared by every benchmark here, then runs the benchmark.
-
-    Args:
-      eager: value assigned to `FLAGS.enable_eager`; also selects the 'eager'
-        or 'graph' part of the model directory name.
-      num_workers: worker count; scales the total batch size and is part of
-        the model directory name.
-      all_reduce_alg: value assigned to `FLAGS.all_reduce_alg`; also part of
-        the model directory name.
-    """
-    self._setup()
-
-    gpus_per_worker = 8
-    execution_mode = 'eager' if eager else 'graph'
-    run_name = 'benchmark_{}_8_gpu_{}_worker_fp16_{}_tweaked'.format(
-        execution_mode, num_workers, all_reduce_alg)
-
-    FLAGS.num_gpus = gpus_per_worker
-    FLAGS.dtype = 'fp16'
-    FLAGS.enable_eager = eager
-    FLAGS.enable_xla = False
-    FLAGS.distribution_strategy = 'multi_worker_mirrored'
-    FLAGS.tf_gpu_thread_mode = 'gpu_private'
-    FLAGS.datasets_num_private_threads = 32
-    FLAGS.model_dir = self._get_model_dir(run_name)
-    FLAGS.batch_size = 256 * gpus_per_worker * num_workers
-    FLAGS.all_reduce_alg = all_reduce_alg
-
-    self._run_and_report_benchmark()
-
-  def benchmark_eager_8_gpu_1_worker_fp16_ring_tweaked(self):
-    """Eager mode, 1 worker with 8 GPUs, fp16, ring all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=1, all_reduce_alg='ring')
-
-  def benchmark_eager_8_gpu_1_worker_fp16_nccl_tweaked(self):
-    """Eager mode, 1 worker with 8 GPUs, fp16, nccl all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=1, all_reduce_alg='nccl')
-
-  def benchmark_eager_8_gpu_2_workers_fp16_ring_tweaked(self):
-    """Eager mode, 2 workers with 8 GPUs each, fp16, ring all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=2, all_reduce_alg='ring')
-
-  def benchmark_eager_8_gpu_2_workers_fp16_nccl_tweaked(self):
-    """Eager mode, 2 workers with 8 GPUs each, fp16, nccl all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=2, all_reduce_alg='nccl')
-
-  def benchmark_eager_8_gpu_8_workers_fp16_ring_tweaked(self):
-    """Eager mode, 8 workers with 8 GPUs each, fp16, ring all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=8, all_reduce_alg='ring')
-
-  def benchmark_eager_8_gpu_8_workers_fp16_nccl_tweaked(self):
-    """Eager mode, 8 workers with 8 GPUs each, fp16, nccl all-reduce."""
-    self._benchmark_common(
-        eager=True, num_workers=8, all_reduce_alg='nccl')
-
-
-class Resnet50MultiWorkerKerasBenchmarkSynth(Resnet50MultiWorkerKerasBenchmark):
-  """Multi-worker Resnet50 benchmarks configured for synthetic data."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Builds the synthetic-data default flags and hands them to the parent.
-
-    Args:
-      output_dir: forwarded to the parent constructor.
-      root_data_dir: accepted for call compatibility; not used here.
-      **kwargs: accepted for call compatibility; not used here.
-    """
-    synthetic_defaults = {
-        'skip_eval': True,
-        'report_accuracy_metrics': False,
-        'use_synthetic_data': True,
-        'train_steps': 110,
-        'log_steps': 10,
-    }
-    parent = super(Resnet50MultiWorkerKerasBenchmarkSynth, self)
-    parent.__init__(output_dir=output_dir, default_flags=synthetic_defaults)
-
-
-class Resnet50MultiWorkerKerasBenchmarkReal(Resnet50MultiWorkerKerasBenchmark):
-  """Multi-worker Resnet50 benchmarks configured for real data."""
-
-  def __init__(self, output_dir=None, root_data_dir=None, **kwargs):
-    """Builds the real-data default flags and hands them to the parent.
-
-    Args:
-      output_dir: forwarded to the parent constructor.
-      root_data_dir: directory that contains the `imagenet` folder.
-      **kwargs: accepted for call compatibility; not used here.
-    """
-    real_data_defaults = {
-        'skip_eval': True,
-        'report_accuracy_metrics': False,
-        'data_dir': os.path.join(root_data_dir, 'imagenet'),
-        'train_steps': 110,
-        'log_steps': 10,
-    }
-    parent = super(Resnet50MultiWorkerKerasBenchmarkReal, self)
-    parent.__init__(output_dir=output_dir, default_flags=real_data_defaults)
-
-
-# TODO(kimjaehong): This should also cover other model optimization
-# techniques. At that point, this class will change to something
-# like 'KerasModelOptimizationAccuracyBase'.
-class KerasPruningAccuracyBase(keras_benchmark.KerasBenchmark):
-  """Benchmark accuracy tests for pruning method."""
-
-  def __init__(self,
-               output_dir=None,
-               root_data_dir=None,
-               default_flags=None,
-               **kwargs):
-    """An accuracy benchmark class for pruning method.
-
-    Args:
-      output_dir: directory where to output e.g. log files
-      root_data_dir: directory under which to look for dataset
-      default_flags: default flags. The mapping is copied before use, so the
-                     caller's object is left untouched; 'pruning_method' and
-                     'data_dir' are always overwritten in the copy.
-      **kwargs: arbitrary named arguments. This is needed to make the
-                constructor forward compatible in case PerfZero provides more
-                named arguments before updating the constructor.
-    """
-    # Work on a copy: writing the two entries below straight into the
-    # caller's dict would leak them into any other use of that dict.
-    default_flags = {} if default_flags is None else dict(default_flags)
-    default_flags['pruning_method'] = 'polynomial_decay'
-    default_flags['data_dir'] = os.path.join(root_data_dir, 'imagenet')
-
-    flag_methods = [resnet_imagenet_main.define_imagenet_keras_flags]
-
-    super(KerasPruningAccuracyBase, self).__init__(
-        output_dir=output_dir,
-        flag_methods=flag_methods,
-        default_flags=default_flags,
-        **kwargs)
-
-  def benchmark_8_gpu(self):
-    """Test Keras model with eager, dist_strat and 8 GPUs."""
-    self._setup()
-    FLAGS.num_gpus = 8
-    FLAGS.batch_size = 32 * 8
-    FLAGS.train_epochs = 90
-    FLAGS.epochs_between_evals = 10
-    FLAGS.model_dir = self._get_model_dir('benchmark_8_gpu')
-    FLAGS.dtype = 'fp32'
-    FLAGS.enable_eager = True
-    self._run_and_report_benchmark()
-
-  @benchmark_wrappers.enable_runtime_flags
-  def _run_and_report_benchmark(self,
-                                top_1_min=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
-                                    'RESNET50_FINETUNE_PRUNING'][0],
-                                top_1_max=MODEL_OPTIMIZATION_TOP_1_ACCURACY[
-                                    'RESNET50_FINETUNE_PRUNING'][1]):
-    """Times one `resnet_imagenet_main.run` call and reports the outcome.
-
-    Args:
-      top_1_min: lower top-1 bound forwarded to `_report_benchmark`.
-      top_1_max: upper top-1 bound forwarded to `_report_benchmark`.
-    """
-    start_time_sec = time.time()
-    stats = resnet_imagenet_main.run(flags.FLAGS)
-    wall_time_sec = time.time() - start_time_sec
-
-    super(KerasPruningAccuracyBase, self)._report_benchmark(
-        stats,
-        wall_time_sec,
-        top_1_min=top_1_min,
-        top_1_max=top_1_max,
-        total_batch_size=FLAGS.batch_size,
-        log_steps=100)
-
-
-class MobilenetV1KerasPruningAccuracy(KerasPruningAccuracyBase):
-  """Pruning accuracy benchmark configured for MobilenetV1."""
-
-  def __init__(self, root_data_dir=None, **kwargs):
-    """Builds the MobilenetV1 pruning flags and hands them to the parent.
-
-    Args:
-      root_data_dir: directory that contains the `mobilenet_v1` checkpoint
-        folder; also forwarded to the parent constructor.
-      **kwargs: forwarded unchanged to the parent constructor.
-    """
-    checkpoint_dir = os.path.join(root_data_dir, 'mobilenet_v1')
-    pretrained_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
-    mobilenet_flags = {
-        'model': 'mobilenet',
-        'optimizer': 'mobilenet_default',
-        'initial_learning_rate_per_sample': 0.00007,
-        'pretrained_filepath': pretrained_checkpoint,
-        'pruning_begin_step': 0,
-        'pruning_end_step': 100000,
-        'pruning_initial_sparsity': 0.0,
-        'pruning_final_sparsity': 0.5,
-        'pruning_frequency': 100,
-    }
-    parent = super(MobilenetV1KerasPruningAccuracy, self)
-    parent.__init__(
-        root_data_dir=root_data_dir,
-        default_flags=mobilenet_flags,
-        **kwargs)
-
-  def _run_and_report_benchmark(self):
-    """Runs the parent benchmark with the MobilenetV1 top-1 bounds."""
-    parent = super(MobilenetV1KerasPruningAccuracy, self)
-    bounds = MODEL_OPTIMIZATION_TOP_1_ACCURACY['MOBILENET_V1_FINETUNE_PRUNING']
-    parent._run_and_report_benchmark(top_1_min=bounds[0], top_1_max=bounds[1])
-
-
-class Resnet50KerasPruningAccuracy(KerasPruningAccuracyBase):
-  """Pruning accuracy benchmark configured for resnet50."""
-
-  def __init__(self, root_data_dir=None, **kwargs):
-    """Builds the resnet50 pruning flags and hands them to the parent.
-
-    Args:
-      root_data_dir: directory that contains the `resnet50` checkpoint
-        folder; also forwarded to the parent constructor.
-      **kwargs: forwarded unchanged to the parent constructor.
-    """
-    checkpoint_dir = os.path.join(root_data_dir, 'resnet50')
-    pretrained_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
-    resnet_flags = {
-        'model': 'resnet50_v1.5',
-        'optimizer': 'mobilenet_default',
-        'initial_learning_rate_per_sample': 0.0000039,
-        'pretrained_filepath': pretrained_checkpoint,
-        'pruning_begin_step': 0,
-        'pruning_end_step': 50000,
-        'pruning_initial_sparsity': 0.0,
-        'pruning_final_sparsity': 0.5,
-        'pruning_frequency': 100,
-    }
-    parent = super(Resnet50KerasPruningAccuracy, self)
-    parent.__init__(
-        root_data_dir=root_data_dir,
-        default_flags=resnet_flags,
-        **kwargs)
-
-  def _run_and_report_benchmark(self):
-    """Runs the parent benchmark with the resnet50 top-1 bounds."""
-    parent = super(Resnet50KerasPruningAccuracy, self)
-    bounds = MODEL_OPTIMIZATION_TOP_1_ACCURACY['RESNET50_FINETUNE_PRUNING']
-    parent._run_and_report_benchmark(top_1_min=bounds[0], top_1_max=bounds[1])
-
-
-class KerasPruningBenchmarkRealBase(Resnet50KerasBenchmarkBase):
-  """Pruning method benchmarks."""
-
-  def __init__(self, root_data_dir=None, default_flags=None, **kwargs):
-    """Merges the fixed pruning benchmark flags over `default_flags`.
-
-    Args:
-      root_data_dir: directory that contains the `imagenet` folder.
-      default_flags: optional mapping of flag defaults. It is copied before
-        use, so the caller's object is left untouched; any key that is also
-        set below is overwritten in the copy.
-      **kwargs: forwarded unchanged to the parent constructor.
-    """
-    # Copy first: calling update() on the caller's own dict would leak the
-    # entries below into any other use of that dict.
-    merged_flags = {} if default_flags is None else dict(default_flags)
-    merged_flags.update({
-        'skip_eval': True,
-        'report_accuracy_metrics': False,
-        'data_dir': os.path.join(root_data_dir, 'imagenet'),
-        'train_steps': 110,
-        'log_steps': 10,
-        'pruning_method': 'polynomial_decay',
-        'pruning_begin_step': 0,
-        'pruning_end_step': 50000,
-        'pruning_initial_sparsity': 0,
-        'pruning_final_sparsity': 0.5,
-        'pruning_frequency': 100,
-    })
-    super(KerasPruningBenchmarkRealBase, self).__init__(
-        default_flags=merged_flags, **kwargs)
-
-
-class MobilenetV1KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
-  """Pruning benchmark configured for MobilenetV1."""
-
-  def __init__(self, **kwargs):
-    """Selects the mobilenet model and optimizer, then defers to the parent.
-
-    Args:
-      **kwargs: forwarded unchanged to the parent constructor.
-    """
-    model_flags = dict(model='mobilenet', optimizer='mobilenet_default')
-    parent = super(MobilenetV1KerasPruningBenchmarkReal, self)
-    parent.__init__(default_flags=model_flags, **kwargs)
-
-
-class Resnet50KerasPruningBenchmarkReal(KerasPruningBenchmarkRealBase):
-  """Pruning benchmark configured for resnet50."""
-
-  def __init__(self, **kwargs):
-    """Selects the resnet50 model and optimizer, then defers to the parent.
-
-    Args:
-      **kwargs: forwarded unchanged to the parent constructor.
-    """
-    model_flags = dict(model='resnet50_v1.5', optimizer='mobilenet_default')
-    parent = super(Resnet50KerasPruningBenchmarkReal, self)
-    parent.__init__(default_flags=model_flags, **kwargs)
-
-
-# Calls `tf.test.main()` only when this file is run as a script, not when it
-# is imported.
-if __name__ == '__main__':
-  tf.test.main()