Commit d0d91e12 authored by huchen

Merge branch 'tf2' into 'main'

tf2 detection

See merge request dcutoolkit/deeplearing/dlexamples_new!2
parents 2795dc1f c320b6ef
# Default ignored files
/shelf/
/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/MaskRCNN.iml" filepath="$PROJECT_DIR$/.idea/MaskRCNN.iml" />
</modules>
</component>
</project>
\ No newline at end of file
import torch
import torchvision
from torchvision.ops import nms

# Print environment information to confirm the GPU build of PyTorch is usable.
print(torch.__version__)
print(torchvision.__version__)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

# Three boxes in (x1, y1, x2, y2) format; the last two overlap almost exactly.
boxes = torch.Tensor([[1, 1, 2, 2], [1, 1, 3.100001, 3], [1, 1, 3.1, 3]])
# Confidence score for each box.
scores = torch.Tensor([0.9, 0.98, 0.980005])
# Keep the indices of boxes that survive NMS with an IoU threshold of 0.4.
keep = nms(boxes, scores, 0.4)
print(keep)
print(boxes[keep])
\ No newline at end of file
#===============================================================================
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
FROM ${FROM_IMAGE_NAME}
ENV DEBIAN_FRONTEND=noninteractive
RUN rm -rf /workspace && mkdir -p /workspace
ADD . /workspace
WORKDIR /workspace
RUN apt-get update && \
apt-get install -y libsm6 libxext6 libxrender-dev python3-tk cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Make sure python and pip point to python3 and pip3
RUN python -m pip install --upgrade pip && \
    pip --no-cache-dir install \
Cython \
matplotlib \
opencv-python-headless \
mpi4py \
Pillow \
pytest \
pyyaml && \
git clone https://github.com/pybind/pybind11 /opt/pybind11 && \
cd /opt/pybind11 && cmake . && make install && pip install . && \
    pip --no-cache-dir install \
'git+https://github.com/NVIDIA/cocoapi#egg=pycocotools&subdirectory=PythonAPI' && \
    pip --no-cache-dir install \
'git+https://github.com/NVIDIA/dllogger'
# Update protobuf 3 to 3.3.0
RUN \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip && \
unzip -u protoc-3.3.0-linux-x86_64.zip -d protoc3 && \
mv protoc3/bin/* /usr/local/bin/ && \
mv protoc3/include/* /usr/local/include/
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
\ No newline at end of file
# Introduction
* Train the Mask R-CNN model with TensorFlow
<br>
# Environment Setup
## 1) Install packages
* Install TensorFlow 1.15 in the ROCm 3.3 environment
* Install pycocotools
```
pip3 install pycocotools -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
```
* Update pandas
```
pip3 install -U pandas -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
```
* Install dllogger
```
git clone --recursive https://github.com/NVIDIA/dllogger.git
cd dllogger
python3 setup.py install
```
<br>
## 2) Data preprocessing (train and val)
```
cd dataset/
git clone http://github.com/tensorflow/models tf-models
cd tf-models/research
wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip
unzip protobuf.zip
./bin/protoc object_detection/protos/*.proto --python_out=.
```
Return to the `dataset` directory, open `create_coco_tf_record.py` (for example with `vim create_coco_tf_record.py`), and comment out lines 310 and 316.
<br>
```
PYTHONPATH="tf-models:tf-models/research" python3 create_coco_tf_record.py \
--logtostderr \
--include_masks \
--train_image_dir=/path/to/COCO2017/images/train2017 \
--val_image_dir=/path/to/COCO2017/images/val2017 \
--train_object_annotations_file=/path/to/COCO2017/annotations/instances_train2017.json \
--val_object_annotations_file=/path/to/COCO2017/annotations/instances_val2017.json \
--train_caption_annotations_file=/path/to/COCO2017/annotations/captions_train2017.json \
--val_caption_annotations_file=/path/to/COCO2017/annotations/captions_val2017.json \
--output_dir=coco2017_tfrecord
```
This generates the `coco2017_tfrecord` folder.
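As an optional sanity check (a sketch, not part of the original instructions), you can count the records in one of the generated shards. The shard filename below assumes the script's default sharding (256 train shards), and iterating the dataset directly assumes eager execution (TensorFlow 2.x, or `tf.enable_eager_execution()` under TensorFlow 1.15).
```python
# Optional sanity check: count examples in one generated training shard.
# Filename assumes the default 256-shard naming used by create_coco_tf_record.py.
import tensorflow as tf

path = "coco2017_tfrecord/train-00000-of-00256.tfrecord"
num_examples = sum(1 for _ in tf.data.TFRecordDataset(path))
print(num_examples, "examples in", path)
```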
## 3) Download the pre-trained models
<br>
The downloaded model files should be organized as follows:
```
weights/
>mask-rcnn/1555659850/
https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850/saved_model.pb
>>variables/
https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850/variables/variables.data-00000-of-00001
https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850/variables/variables.index
>resnet/
>>extracted_from_maskrcnn/
>>resnet-nhwc-2018-02-07/
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/checkpoint
>>>model.ckpt-112603/
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/model.ckpt-112603.data-00000-of-00001
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/model.ckpt-112603.index
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/model.ckpt-112603.meta
>>resnet-nhwc-2018-10-14/
```
# Testing
## Single-card training
```
python3 scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4}
python3 scripts/benchmark_training.py --gpus 1 --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
```
## Multi-card training
```
python3 scripts/benchmark_training.py --gpus 2 --batch_size 4 --model_dir save_model_2dcu --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
```
## Inference
```
python3 scripts/benchmark_inference.py --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
```
# References
[https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN)
\ No newline at end of file
# Mask R-CNN For TensorFlow
This repository provides a script and recipe to train the Mask R-CNN model for TensorFlow to achieve state-of-the-art accuracy; it is tested and maintained by NVIDIA.
## Table of Contents
- [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Enabling TF32](#enabling-tf32)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results TensorFlow 1.1x](#training-accuracy-results-tensorflow-11x)
* [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16GB)
* [Training performance results Tensorflow 1.1x](#training-performance-results-tensorflow-11x)
* [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
* [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
* [Training accuracy results TensorFlow 2.x](#training-accuracy-results-tensorflow-2x)
* [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb-1)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb-1)
* [Training performance results Tensorflow 2.x](#training-performance-results-tensorflow-2x)
* [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb-1)
* [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb-1)
* [Inference performance results TensorFlow 1.1x](#inference-performance-results-tensorflow-11x)
* [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
* [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
* [Inference performance results TensorFlow 2.x](#inference-performance-results-tensorflow-2x)
* [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb-1)
* [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb-1)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
Mask R-CNN is a convolution-based neural network for the task of object instance segmentation. The paper describing the model can be found [here](https://arxiv.org/abs/1703.06870). NVIDIA’s Mask R-CNN 20.06 is an optimized version of [Google's TPU implementation](https://github.com/tensorflow/tpu/tree/master/models/official/mask_rcnn), leveraging mixed precision arithmetic using Tensor Cores on NVIDIA Volta, Turing, and Ampere GPUs while maintaining target accuracy.
This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 2.2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
This repository also contains scripts to interactively launch training,
benchmarking and inference routines in a Docker container.
The major differences between the official implementation of the paper and our version of Mask R-CNN are as follows:
- Mixed precision support with [TensorFlow AMP](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#tfamp).
- Gradient accumulation to simulate larger batches.
- Custom fused CUDA kernels for faster computations.
There are other publicly available implementations of Mask R-CNN:
- [NVIDIA PyTorch implementation](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/MaskRCNN)
- [Matterport](https://github.com/matterport/Mask_RCNN)
- [Tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN)
### Model architecture
Mask R-CNN builds on top of Faster R-CNN, adding an additional mask head for the task of image segmentation.
The architecture consists of the following:
- ResNet-50 backbone with Feature Pyramid Network (FPN)
- Region proposal network (RPN) head
- RoI Align
- Bounding box and classification head
- Mask head
### Default configuration
The Mask R-CNN configuration and the hyper-parameters for training and testing purposes are in separate files.
The default configuration of this model can be found at `mask-rcnn/hyperparameters/mask_rcnn_params.py`.
The default configuration is as follows:
- Feature extractor:
- Images resized with aspect ratio maintained and smaller side length between [832,1344]
- Ground Truth mask size 112
- Backbone network weights are frozen after second epoch
- RPN:
- Anchor stride set to 16
- Anchor sizes set to (32, 64, 128, 256, 512)
- Foreground IOU Threshold set to 0.7, Background IOU Threshold set to 0.3
- RPN target fraction of positive proposals set to 0.5
- Train Pre-NMS Top proposals set to 2000 per FPN layer
- Train Post-NMS Top proposals set to 1000
- Test Pre-NMS Top proposals set to 1000 per FPN layer
- Test Post-NMS Top proposals set to 1000
- RPN NMS Threshold set to 0.7
- RoI heads:
- Foreground threshold set to 0.5
- Batch size per image set to 512
- Positive fraction of batch set to 0.25
The default hyper-parameters can be found at `mask-rcnn/hyperparameters/cmdline_utils.py`.
These hyperparameters can be overridden through the command-line options in the launch scripts.
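To make the shape of such a configuration concrete, the fragment below is purely illustrative: the key names are hypothetical and do not reflect the actual schema of `mask_rcnn_params.py`; only the values mirror the defaults listed above.
```python
# Illustrative only: hypothetical key names, values taken from the defaults above.
default_config = {
    "anchor_stride": 16,
    "anchor_sizes": (32, 64, 128, 256, 512),
    "rpn_fg_iou_threshold": 0.7,
    "rpn_bg_iou_threshold": 0.3,
    "rpn_nms_threshold": 0.7,
    "roi_fg_threshold": 0.5,
    "roi_batch_size_per_image": 512,
    "roi_positive_fraction": 0.25,
    "gt_mask_size": 112,
}

# A command-line override would replace individual entries, for example:
overrides = {"roi_batch_size_per_image": 256}
config = {**default_config, **overrides}
```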
### Feature support matrix
The following features are supported by this model:
| **Feature** | **Mask R-CNN** |
|-------------|----------------|
| Automatic mixed precision (AMP) | Yes |
| Horovod Multi-GPU (NCCL) | Yes |
| Accelerated Linear Algebra (XLA) | Yes |
#### Features
**Automatic Mixed Precision (AMP)**
This implementation of Mask R-CNN uses AMP to implement mixed precision training. It allows us to use FP16 training with FP32 master weights by modifying just a few lines of code.
**Horovod**
Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod).
**Multi-GPU training with Horovod**
Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
**XLA support (experimental)**
XLA is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. The results are improvements in speed and memory usage: most internal benchmarks run ~1.1-1.5x faster after XLA is enabled.
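As a brief illustration (not taken from this repository, which exposes XLA through its own `--xla` flag), the sketch below shows how XLA JIT compilation can be requested globally in TensorFlow; the exact API surface depends on the TensorFlow version.
```python
# Hedged sketch: enable XLA JIT compilation globally. Version-dependent; this
# repository's scripts normally control XLA via the --xla flag instead.
import tensorflow as tf

# TF 2.x (and late TF 1.x releases): turn on the JIT compiler for supported ops.
tf.config.optimizer.set_jit(True)

# TF 1.x graph/session style: request auto-clustering through the session config.
config = tf.compat.v1.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = (
    tf.compat.v1.OptimizerOptions.ON_1)
session = tf.compat.v1.Session(config=config)
```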
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
#### Enabling mixed precision
Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
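As a hedged sketch, assuming TensorFlow 1.15 (as in the tf1 NGC container) and placeholder optimizer hyperparameters, the snippet below shows the two generic ways TF-AMP is typically switched on; in this repository the launch scripts expose the same thing through the `--amp` flag.
```python
# Hedged sketch (TF 1.15): two generic ways to enable TF-AMP; placeholder values.
import os
import tensorflow as tf

# Option 1: set the environment variable before any graph is built.
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"

# Option 2: wrap the optimizer explicitly; automatic loss scaling is applied.
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
    optimizer, loss_scale="dynamic")
```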
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
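For completeness, the snippet below is a small, hedged sketch of how TF32 can be inspected or turned off programmatically; the API assumes a recent TensorFlow 2.x release (2.4 or later) and is not something the scripts in this repository call.
```python
# Hedged sketch (TF 2.4+ API): TF32 is on by default on Ampere GPUs; disable it
# to force full-FP32 math, e.g. when debugging numerical differences.
import tensorflow as tf

tf.config.experimental.enable_tensor_float_32_execution(False)
print(tf.config.experimental.tensor_float_32_execution_enabled())  # -> False
```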
## Setup
The following section lists the requirements that you need to meet in order to start training the Mask R-CNN model.
### Requirements
This repository contains a Dockerfile that extends the TensorFlow NGC container and encapsulates some dependencies.
Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- TensorFlow 20.06-tf1-py3 [NGC container](https://ngc.nvidia.com/registry/nvidia-tensorflow)
- GPU-based architecture:
- [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
- [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/)
- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the
NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- Running [TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
For those unable to use the TensorFlow NGC container, to set up the required environment or create your own
container, see the versioned
[NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using 32-bit, perform the following steps using
the default parameters of the Mask R-CNN model on the COCO 2017 dataset.
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/TensorFlow/Segmentation/MaskRCNN
```
2. Build the Mask R-CNN TensorFlow NGC container.
**For TensorFlow 1.1x:** `bash ./scripts/docker/build_tf1.sh`
**For TensorFlow 2.x:** `bash ./scripts/docker/build_tf2.sh`
3. Start an interactive session in the NGC container to run training/inference.
Run the following command to launch the Docker container; the only argument is the *absolute path* to the
`data directory`, which holds or will hold the `tfrecords` data. If the data has not already been downloaded to the `data directory`, download it in step 4; otherwise, step 4 can be skipped.
**For TensorFlow 1.1x:** `bash ./scripts/docker/launch_tf1.sh [data directory]`
**For TensorFlow 2.x:** `bash ./scripts/docker/launch_tf2.sh [data directory]`
4. Download and preprocess the dataset.
This repository provides scripts to download and extract the [COCO 2017 dataset](http://cocodataset.org/#download).
If you already have the data, you do not need to run the following script; proceed to downloading the pre-trained weights.
Data will be downloaded to the `data directory` provided in step 3.
```bash
cd dataset
bash download_and_preprocess_coco.sh /data
```
By default, the data is organized into the following structure:
```bash
<data/dir>
annotations/
instances_train2017.json
instances_val2017.json
train2017/
COCO_train2017_*.jpg
val2017/
COCO_val2017_*.jpg
```
This repository also provides scripts to download the pre-trained weights of the ResNet-50 backbone.
The script will create a new directory named `weights` in the current directory and
download the pre-trained weights into it.
```bash
./download_and_process_pretrained_weights.sh
```
Ensure that the `weights` folder created has a `resnet` folder in it. Inside the `resnet` folder there
should be 3 folders for checkpoints and weights: `extracted_from_maskrcnn`, `resnet-nhwc-2018-02-07` and
`resnet-nhwc-2018-10-14`. Before moving to the next step, ensure the above folders are not empty.
5. Start training.
To run training for a default configuration (on 1/4/8 GPUs, AMP/32-bit), run one of the scripts in the
`./scripts` directory called `./scripts/train{_AMP}_{1,4,8}GPU.sh`. For example:
`bash ./scripts/train_AMP_8GPU.sh`
The above script trains a model and performs an evaluation on the COCO 2017 dataset. By default, this training script:
- Uses 8 GPUs.
- Saves a checkpoint every 3696 iterations and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container, which can be mounted to a local directory).
- Uses mixed precision training with Tensor Cores.
6. Start validation/evaluation.
- For evaluation with AMP precision: `bash ./scripts/evaluation_AMP.sh`
- For evaluation with 32-bit precision: `bash ./scripts/evaluation.sh`
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
Descriptions of the key scripts and folders are provided below.
- `mask_rcnn` - Contains code to build the individual components of the model, such as the backbone, FPN, RPN, and the mask and bbox heads.
- `download_and_process_pretrained_weights.sh` - Can be used to download backbone pre-trained weights.
- `scripts/` - A folder that contains shell scripts to train the model and perform inferences.
- `train{_AMP}_{1,4,8}GPU.sh` - Training script on 1, 4, 8 GPUs with AMP or 32-bit precision.
- `evaluation_{AMP}.sh` - Evaluation script with either AMP or 32-bit precision.
- `benchmark_training.py` - Script for running train performance benchmarks.
- `benchmark_inference.py` - Script for running inference performance benchmarks.
- `dataset/` - A folder that contains shell scripts and Python files to download the dataset.
- `mask_rcnn_main.py` - The main script and entry point for the training and evaluation process.
- `docker/` - A folder that contains scripts to build a Docker image and start an interactive session.
### Parameters
#### `mask_rcnn_main.py` script parameters
You can modify the training behavior through the various flags in the `mask_rcnn_main.py` script and by overriding specific parameters in the config files. The flags in the `mask_rcnn_main.py` script are as follows (an illustrative invocation follows the list):
- `--mode` - Specifies the action to take like `train`, `train_and_eval` or `eval`.
- `--checkpoint` - The checkpoint of the backbone.
- `--eval_samples` - Number of samples to evaluate.
- `--init_learning_rate` - Initial learning rate.
- `--learning_rate_steps` - Specifies at which steps to reduce the learning rate.
- `--num_steps_per_eval` - Specifies after how many steps of training evaluation should be performed.
- `--total_steps` - Specifies the total number of steps for which training should be run.
- `--train_batch_size` - Training batch size per GPU.
- `--eval_batch_size` - Evaluation batch size per GPU.
- `--amp` - Whether to use AMP (mixed) precision or 32-bit precision.
- `--xla` - Whether to enable TensorFlow XLA (Accelerated Linear Algebra).
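Putting these flags together, a training invocation might look like the following sketch (launched here through Python's `subprocess` purely for illustration; every value is a placeholder rather than a tuned or recommended setting).
```python
# Illustrative launch of mask_rcnn_main.py with the flags documented above.
# All values below are placeholders, not recommended settings.
import subprocess

subprocess.run(
    [
        "python", "mask_rcnn_main.py",
        "--mode=train_and_eval",
        "--checkpoint=weights/resnet/resnet-nhwc-2018-02-07/model.ckpt-112603",
        "--train_batch_size=4",
        "--eval_batch_size=8",
        "--init_learning_rate=0.01",
        "--total_steps=90000",
        "--num_steps_per_eval=10000",
        "--amp",
        "--xla",
    ],
    check=True,
)
```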
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python mask_rcnn_main.py --helpfull`
### Getting the data
The Mask R-CNN model was trained on the COCO 2017 dataset. This dataset comes with a training and validation set.
This repository contains the `./dataset/download_and_preprocess_coco.sh` script which automatically downloads and preprocesses the training and validation sets. The helper scripts are also present in the `dataset/` folder.
#### Dataset guidelines
The data should be organized into the following structure:
```bash
<data/dir>
annotations/
instances_train2017.json
instances_val2017.json
train2017/
COCO_train2017_*.jpg
val2017/
COCO_val2017_*.jpg
```
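After preprocessing converts this layout to TFRecords, the records can be read back with standard `tf.data` tooling. The snippet below is a minimal sketch that parses a subset of the feature keys written by `dataset/create_coco_tf_record.py`; the shard filename pattern is an assumption based on that script's output naming, and eager execution (TensorFlow 2.x) is assumed.
```python
# Minimal sketch: parse a few fields from the generated COCO TFRecords.
# Feature keys follow dataset/create_coco_tf_record.py; the glob pattern assumes
# shards named like train-00000-of-00256.tfrecord under <data/dir>.
import tensorflow as tf

features = {
    "image/encoded": tf.io.FixedLenFeature([], tf.string),
    "image/height": tf.io.FixedLenFeature([], tf.int64),
    "image/width": tf.io.FixedLenFeature([], tf.int64),
    "image/object/bbox/xmin": tf.io.VarLenFeature(tf.float32),
    "image/object/bbox/ymin": tf.io.VarLenFeature(tf.float32),
    "image/object/bbox/xmax": tf.io.VarLenFeature(tf.float32),
    "image/object/bbox/ymax": tf.io.VarLenFeature(tf.float32),
    "image/object/class/label": tf.io.VarLenFeature(tf.int64),
}

files = tf.io.gfile.glob("<data/dir>/train-*.tfrecord")
for raw in tf.data.TFRecordDataset(files).take(1):
    example = tf.io.parse_single_example(raw, features)
    image = tf.io.decode_jpeg(example["image/encoded"])
    xmin = tf.sparse.to_dense(example["image/object/bbox/xmin"])
    print(image.shape, example["image/height"].numpy(), xmin.numpy())
```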
### Training process
Training is performed using the `mask_rcnn_main.py` script along with parameters defined in the config files.
The default config files can be found in
`mask_rcnn_tf/mask_rcnn/mask_rcnn_params.py` and `mask_rcnn_tf/mask_rcnn/cmd_utils.py`. To specify which GPUs to train on, the `CUDA_VISIBLE_DEVICES` variable can be changed in the training scripts
provided in the `scripts` folder.
This script outputs results to the `/results` directory by default. The training log will contain information about:
- Loss, time per iteration, learning rate and memory metrics
- Performance values such as throughput per step
- Test accuracy and test performance values after evaluation
### Inference process
To run inference, run `mask_rcnn_main.py` with the command-line parameter
`--mode=eval`. To run inference with a checkpoint, set the command-line
parameter `--model_dir` to `[absolute path of checkpoint folder]`.
The inference log will contain information about:
- Inference time per step
- Inference throughput per step
- Evaluation accuracy and performance values
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To run training benchmarking on a selected number of GPUs with either AMP or 32-bit precision, run the following script:
```bash
python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]
```
#### Inference performance benchmark
To run inference benchmarking on a single GPU with either AMP or 32-bit precision, run the following script:
```bash
python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]
```
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results Tensorflow 1.1x
##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by building and launching the TensorFlow 1.1x Docker container (`bash ./scripts/docker/build_tf1.sh` and `bash ./scripts/docker/launch_tf1.sh [data directory]`, respectively) and running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script on NVIDIA DGX A100 (8x A100 40GB) GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | TF32 | 0.3777 | 0.3435 | 5 h | - |
| 8 | 4 | AMP | 0.3782 | 0.3432 | 4 h | 1.25 |
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by building and launching the TensorFlow 1.1x Docker container (`bash ./scripts/docker/build_tf1.sh` and `bash ./scripts/docker/launch_tf1.sh [data directory]`, respectively) and running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script on NVIDIA DGX-1 with 8x V100 16GB GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | FP32 | 0.3767 | 0.3420 | 14 h | - |
| 8 | 4 | AMP | 0.3770 | 0.3423 | 9 h | 1.50 |
**Learning curves**
The following image shows the training loss as a function of iteration for training using DGX A100 (TF32 and TF-AMP) and DGX-1 V100 (FP32 and TF-AMP).
![LearningCurvesTF1](images/MaskRCNN_TF1_conv.png)
#### Training performance results Tensorflow 1.1x
##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - TF32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|---|---|-------|--------|------|------|------|
| 1 | 2 | 11.38 | 18.51 | 1.63 | - | - |
| 1 | 4 | 12.49 | 21.20 | 1.70 | - | - |
| 4 | 2 | 43.95 | 65.74 | 1.50 | 3.86 | 3.55 |
| 4 | 4 | 48.26 | 72.96 | 1.51 | 3.86 | 3.44 |
| 8 | 2 | 81.69 | 114.59 | 1.40 | 7.18 | 6.19 |
| 8 | 4 | 89.02 | 132.31 | 1.49 | 7.13 | 6.24 |
##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (8x V100 16GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - FP32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|---|---|-------|-------|------|------|------|
| 1 | 2 | 6.37 | 12.19 | 1.91 | - | - |
| 1 | 4 | 6.79 | 12.79 | 1.88 | - | - |
| 4 | 2 | 23.32 | 30.82 | 1.32 | 3.66 | 2.53 |
| 4 | 4 | 22.96 | 36.45 | 1.59 | 3.38 | 2.85 |
| 8 | 2 | 40.18 | 58.41 | 1.45 | 6.31 | 4.79 |
| 8 | 4 | 42.65 | 62.80 | 1.47 | 6.28 | 4.91 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training accuracy results Tensorflow 2.x
##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script in the
TensorFlow 20.06-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | TF32 | 0.3783 | 0.3400 | 5 h | - |
| 8 | 4 | AMP | 0.3796 | 0.3415 | 4 h | 1.25 |
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script in the
TensorFlow 20.06-py3 NGC container on NVIDIA DGX-1 V100 (8x V100 16GB) GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | FP32 | 0.3784 | 0.3400 | 14 h | - |
| 8 | 4 | AMP | 0.3786 | 0.3410 | 9 h | 1.50 |
**Learning curves**
The following image shows the training loss as a function of iteration for training using DGX A100 (TF32 and TF-AMP) and DGX-1 V100 (FP32 and TF-AMP).
![LearningCurvesTF2](images/MaskRCNN_TF2_conv.png)
#### Training performance results Tensorflow 2.x
##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - TF32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|---|---|-------------|-------------|------|------|------|
| 1 | 2 | 11.84 | 18.51 | 1.56 | - | - |
| 1 | 4 | 12.68 | 19.94 | 1.57 | - | - |
| 4 | 2 | 44.51 | 58.11 | 1.31 | 3.76 | 3.14 |
| 4 | 4 | 47.39 | 64.67 | 1.36 | 3.74 | 3.24 |
| 8 | 2 | 80.21 | 110.97 | 1.38 | 6.78 | 5.99 |
| 8 | 4 | 89.93 | 150.02 | 1.67 | 7.09 | 7.52 |
##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (8x V100 16GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - FP32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|---|---|-------|-------|------|------|------|
| 1 | 2 | 5.70 | 11.63 | 2.04 | - | - |
| 1 | 4 | 6.20 | 12.63 | 2.04 | - | - |
| 4 | 2 | 21.22 | 25.18 | 1.19 | 3.72 | 2.16 |
| 4 | 4 | 21.79 | 30.63 | 1.41 | 3.51 | 2.42 |
| 8 | 2 | 38.64 | 52.13 | 1.35 | 6.78 | 4.48 |
| 8 | 4 | 40.76 | 59.62 | 1.46 | 6.57 | 4.72 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results TensorFlow 1.1x
##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX A100 (1x A100 40GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 28.37 |
| 4 | 31.35 |
| 8 | 33.79 |
TF32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 21.81 |
| 4 | 23.77 |
| 8 | 24.59 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (1x V100 16GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 23.52 |
| 4 | 24.64 |
| 8 | 26.83 |
FP32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 14.85 |
| 4 | 15.45 |
| 8 | 16.00 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results TensorFlow 2.x
##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX A100 (1x A100 40GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 26.28 |
| 4 | 36.23 |
| 8 | 40.84 |
TF32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 20.20 |
| 4 | 24.94 |
| 8 | 31.38 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (1x V100 16GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 23.63 |
| 4 | 27.64 |
| 8 | 33.60 |
FP32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 15.45 |
| 4 | 16.71 |
| 8 | 18.78 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Release notes
### Changelog
June 2020
- Updated accuracy tables with A100 results
- Updated training and inference performance tables with A100 results
March 2020
- Initial release
### Known issues
There are no known issues with this model.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Convert raw COCO dataset to TFRecord for object_detection.
Example usage:
python create_coco_tf_record.py --logtostderr \
--train_image_dir="${TRAIN_IMAGE_DIR}" \
--val_image_dir="${VAL_IMAGE_DIR}" \
--test_image_dir="${TEST_IMAGE_DIR}" \
--train_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--val_annotations_file="${VAL_ANNOTATIONS_FILE}" \
--testdev_annotations_file="${TESTDEV_ANNOTATIONS_FILE}" \
--output_dir="${OUTPUT_DIR}"
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import hashlib
import io
import json
import multiprocessing
import os
from absl import app
from absl import flags
import numpy as np
import PIL.Image
from pycocotools import mask
from research.object_detection.utils import dataset_util
from research.object_detection.utils import label_map_util
import tensorflow as tf
flags.DEFINE_boolean('include_masks', False,
'Whether to include instance segmentations masks '
'(PNG encoded) in the result. default: False.')
flags.DEFINE_string('train_image_dir', '', 'Training image directory.')
flags.DEFINE_string('val_image_dir', '', 'Validation image directory.')
flags.DEFINE_string('test_image_dir', '', 'Test image directory.')
flags.DEFINE_string('train_object_annotations_file', '', '')
flags.DEFINE_string('val_object_annotations_file', '', '')
flags.DEFINE_string('train_caption_annotations_file', '', '')
flags.DEFINE_string('val_caption_annotations_file', '', '')
flags.DEFINE_string('testdev_annotations_file', '',
'Test-dev annotations JSON file.')
flags.DEFINE_string('output_dir', '/tmp/', 'Output data directory.')
FLAGS = flags.FLAGS
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
def create_tf_example(image,
bbox_annotations,
caption_annotations,
image_dir,
category_index,
include_masks=False):
"""Converts image and annotations to a tf.Example proto.
Args:
image: dict with keys:
[u'license', u'file_name', u'coco_url', u'height', u'width',
u'date_captured', u'flickr_url', u'id']
bbox_annotations:
list of dicts with keys:
[u'segmentation', u'area', u'iscrowd', u'image_id',
u'bbox', u'category_id', u'id']
Notice that bounding box coordinates in the official COCO dataset are
given as [x, y, width, height] tuples using absolute coordinates where
x, y represent the top-left (0-indexed) corner. This function converts
to the format expected by the Tensorflow Object Detection API (which is
[ymin, xmin, ymax, xmax] with coordinates normalized relative
to image size).
image_dir: directory containing the image files.
category_index: a dict containing COCO category information keyed
by the 'id' field of each category. See the
label_map_util.create_category_index function.
include_masks: Whether to include instance segmentations masks
(PNG encoded) in the result. default: False.
Returns:
example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG
"""
image_height = image['height']
image_width = image['width']
filename = image['file_name']
image_id = image['id']
full_path = os.path.join(image_dir, filename)
with tf.io.gfile.GFile(full_path, 'rb') as fid:
encoded_jpg = fid.read()
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = PIL.Image.open(encoded_jpg_io)
key = hashlib.sha256(encoded_jpg).hexdigest()
xmin = []
xmax = []
ymin = []
ymax = []
is_crowd = []
category_names = []
category_ids = []
area = []
encoded_mask_png = []
num_annotations_skipped = 0
for object_annotations in bbox_annotations:
(x, y, width, height) = tuple(object_annotations['bbox'])
if width <= 0 or height <= 0:
num_annotations_skipped += 1
continue
if x + width > image_width or y + height > image_height:
num_annotations_skipped += 1
continue
xmin.append(float(x) / image_width)
xmax.append(float(x + width) / image_width)
ymin.append(float(y) / image_height)
ymax.append(float(y + height) / image_height)
is_crowd.append(object_annotations['iscrowd'])
category_id = int(object_annotations['category_id'])
category_ids.append(category_id)
category_names.append(category_index[category_id]['name'].encode('utf8'))
area.append(object_annotations['area'])
if include_masks:
run_len_encoding = mask.frPyObjects(object_annotations['segmentation'],
image_height, image_width)
binary_mask = mask.decode(run_len_encoding)
if not object_annotations['iscrowd']:
binary_mask = np.amax(binary_mask, axis=2)
pil_image = PIL.Image.fromarray(binary_mask)
output_io = io.BytesIO()
pil_image.save(output_io, format='PNG')
encoded_mask_png.append(output_io.getvalue())
captions = []
for caption_annotation in caption_annotations:
captions.append(caption_annotation['caption'].encode('utf8'))
feature_dict = {
'image/height':
dataset_util.int64_feature(image_height),
'image/width':
dataset_util.int64_feature(image_width),
'image/filename':
dataset_util.bytes_feature(filename.encode('utf8')),
'image/source_id':
dataset_util.bytes_feature(str(image_id).encode('utf8')),
'image/key/sha256':
dataset_util.bytes_feature(key.encode('utf8')),
'image/encoded':
dataset_util.bytes_feature(encoded_jpg),
'image/caption':
dataset_util.bytes_list_feature(captions),
'image/format':
dataset_util.bytes_feature('jpeg'.encode('utf8')),
'image/object/bbox/xmin':
dataset_util.float_list_feature(xmin),
'image/object/bbox/xmax':
dataset_util.float_list_feature(xmax),
'image/object/bbox/ymin':
dataset_util.float_list_feature(ymin),
'image/object/bbox/ymax':
dataset_util.float_list_feature(ymax),
'image/object/class/text':
dataset_util.bytes_list_feature(category_names),
'image/object/class/label':
dataset_util.int64_list_feature(category_ids),
'image/object/is_crowd':
dataset_util.int64_list_feature(is_crowd),
'image/object/area':
dataset_util.float_list_feature(area),
}
if include_masks:
feature_dict['image/object/mask'] = (
dataset_util.bytes_list_feature(encoded_mask_png))
example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
return key, example, num_annotations_skipped
def _pool_create_tf_example(args):
return create_tf_example(*args)
def _load_object_annotations(object_annotations_file):
with tf.io.gfile.GFile(object_annotations_file, 'r') as fid:
obj_annotations = json.load(fid)
images = obj_annotations['images']
category_index = label_map_util.create_category_index(
obj_annotations['categories'])
img_to_obj_annotation = collections.defaultdict(list)
tf.compat.v1.logging.info('Building bounding box index.')
for annotation in obj_annotations['annotations']:
image_id = annotation['image_id']
img_to_obj_annotation[image_id].append(annotation)
missing_annotation_count = 0
for image in images:
image_id = image['id']
if image_id not in img_to_obj_annotation:
missing_annotation_count += 1
tf.compat.v1.logging.info('%d images are missing bboxes.', missing_annotation_count)
return images, img_to_obj_annotation, category_index
def _load_caption_annotations(caption_annotations_file):
with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
caption_annotations = json.load(fid)
img_to_caption_annotation = collections.defaultdict(list)
tf.compat.v1.logging.info('Building caption index.')
for annotation in caption_annotations['annotations']:
image_id = annotation['image_id']
img_to_caption_annotation[image_id].append(annotation)
missing_annotation_count = 0
images = caption_annotations['images']
for image in images:
image_id = image['id']
if image_id not in img_to_caption_annotation:
missing_annotation_count += 1
tf.compat.v1.logging.info('%d images are missing captions.', missing_annotation_count)
return img_to_caption_annotation
def _create_tf_record_from_coco_annotations(
object_annotations_file,
caption_annotations_file,
image_dir, output_path, include_masks, num_shards):
"""Loads COCO annotation json files and converts to tf.Record format.
Args:
object_annotations_file: JSON file containing bounding box annotations.
caption_annotations_file: JSON file containing caption annotations.
image_dir: Directory containing the image files.
output_path: Path to output tf.Record file.
include_masks: Whether to include instance segmentation masks
(PNG encoded) in the result. Default: False.
num_shards: Number of output files to create.
"""
tf.compat.v1.logging.info('writing to output path: %s', output_path)
writers = [
tf.io.TFRecordWriter(output_path + '-%05d-of-%05d.tfrecord' %
(i, num_shards)) for i in range(num_shards)
]
images, img_to_obj_annotation, category_index = (
_load_object_annotations(object_annotations_file))
img_to_caption_annotation = (
_load_caption_annotations(caption_annotations_file))
pool = multiprocessing.Pool()
total_num_annotations_skipped = 0
for idx, (_, tf_example, num_annotations_skipped) in enumerate(
pool.imap(_pool_create_tf_example,
[(image,
img_to_obj_annotation[image['id']],
img_to_caption_annotation[image['id']],
image_dir,
category_index,
include_masks)
for image in images])):
if idx % 100 == 0:
tf.compat.v1.logging.info('On image %d of %d', idx, len(images))
total_num_annotations_skipped += num_annotations_skipped
writers[idx % num_shards].write(tf_example.SerializeToString())
pool.close()
pool.join()
for writer in writers:
writer.close()
tf.compat.v1.logging.info('Finished writing, skipped %d annotations.',
total_num_annotations_skipped)
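# Illustration only (hypothetical helper, not used by this script): the shards
# written above follow the pattern '<output_path>-%05d-of-%05d.tfrecord', so a
# reader could recover them with a glob along these lines.
def _read_output_shards_sketch(output_path):
    filenames = tf.io.gfile.glob(output_path + '-*-of-*.tfrecord')
    return tf.data.TFRecordDataset(filenames)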
def main(_):
assert FLAGS.train_image_dir, '`train_image_dir` missing.'
assert FLAGS.val_image_dir, '`val_image_dir` missing.'
assert FLAGS.test_image_dir, '`test_image_dir` missing.'
if not tf.io.gfile.isdir(FLAGS.output_dir):
tf.io.gfile.makedirs(FLAGS.output_dir)
train_output_path = os.path.join(FLAGS.output_dir, 'train')
val_output_path = os.path.join(FLAGS.output_dir, 'val')
testdev_output_path = os.path.join(FLAGS.output_dir, 'test-dev')
_create_tf_record_from_coco_annotations(
FLAGS.train_object_annotations_file,
FLAGS.train_caption_annotations_file,
FLAGS.train_image_dir,
train_output_path,
FLAGS.include_masks,
num_shards=256)
_create_tf_record_from_coco_annotations(
FLAGS.val_object_annotations_file,
FLAGS.val_caption_annotations_file,
FLAGS.val_image_dir,
val_output_path,
FLAGS.include_masks,
num_shards=32)
if __name__ == '__main__':
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
app.run(main)
#!/bin/bash
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Script to download and preprocess the COCO data set for detection.
#
# The outputs of this script are TFRecord files containing serialized
# tf.Example protocol buffers. See create_coco_tf_record.py for details of how
# the tf.Example protocol buffers are constructed and see
# http://cocodataset.org/#overview for an overview of the dataset.
#
# usage:
# bash download_and_preprocess_coco.sh /data-dir/coco
set -e
set -x
if [ -z "$1" ]; then
echo "usage download_and_preprocess_coco.sh [data dir]"
exit
fi
#sudo apt install -y protobuf-compiler python-pil python-lxml\
# python-pip python-dev git unzip
#pip install Cython git+https://github.com/cocodataset/cocoapi#subdirectory=PythonAPI
echo "Cloning Tensorflow models directory (for conversion utilities)"
if [ ! -e tf-models ]; then
git clone http://github.com/tensorflow/models tf-models
fi
(cd tf-models/research && protoc object_detection/protos/*.proto --python_out=.)
UNZIP="unzip -nq"
# Create the output directories.
OUTPUT_DIR="${1%/}"
SCRATCH_DIR="${OUTPUT_DIR}/raw-data"
mkdir -p "${OUTPUT_DIR}"
mkdir -p "${SCRATCH_DIR}"
CURRENT_DIR=$(pwd)
# Helper function to download and unpack a .zip file.
function download_and_unzip() {
local BASE_URL=${1}
local FILENAME=${2}
if [ ! -f ${FILENAME} ]; then
echo "Downloading ${FILENAME} to $(pwd)"
wget -nd -c "${BASE_URL}/${FILENAME}"
else
echo "Skipping download of ${FILENAME}"
fi
echo "Unzipping ${FILENAME}"
${UNZIP} ${FILENAME}
}
cd ${SCRATCH_DIR}
# Download the images.
BASE_IMAGE_URL="http://images.cocodataset.org/zips"
TRAIN_IMAGE_FILE="train2017.zip"
download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE}
TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2017"
VAL_IMAGE_FILE="val2017.zip"
download_and_unzip ${BASE_IMAGE_URL} ${VAL_IMAGE_FILE}
VAL_IMAGE_DIR="${SCRATCH_DIR}/val2017"
TEST_IMAGE_FILE="test2017.zip"
download_and_unzip ${BASE_IMAGE_URL} ${TEST_IMAGE_FILE}
TEST_IMAGE_DIR="${SCRATCH_DIR}/test2017"
# Download the annotations.
BASE_INSTANCES_URL="http://images.cocodataset.org/annotations"
INSTANCES_FILE="annotations_trainval2017.zip"
download_and_unzip ${BASE_INSTANCES_URL} ${INSTANCES_FILE}
TRAIN_OBJ_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/instances_train2017.json"
VAL_OBJ_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/instances_val2017.json"
TRAIN_CAPTION_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/captions_train2017.json"
VAL_CAPTION_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/captions_val2017.json"
# Download the test image info.
BASE_IMAGE_INFO_URL="http://images.cocodataset.org/annotations"
IMAGE_INFO_FILE="image_info_test2017.zip"
download_and_unzip ${BASE_IMAGE_INFO_URL} ${IMAGE_INFO_FILE}
TESTDEV_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/image_info_test-dev2017.json"
# Build TFRecords of the image data.
cd "${CURRENT_DIR}"
# Setup packages
touch tf-models/__init__.py
touch tf-models/research/__init__.py
# Run our conversion
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
PYTHONPATH="tf-models:tf-models/research" python $SCRIPT_DIR/create_coco_tf_record.py \
--logtostderr \
--include_masks \
--train_image_dir="${TRAIN_IMAGE_DIR}" \
--val_image_dir="${VAL_IMAGE_DIR}" \
--test_image_dir="${TEST_IMAGE_DIR}" \
--train_object_annotations_file="${TRAIN_OBJ_ANNOTATIONS_FILE}" \
--val_object_annotations_file="${VAL_OBJ_ANNOTATIONS_FILE}" \
--train_caption_annotations_file="${TRAIN_CAPTION_ANNOTATIONS_FILE}" \
--val_caption_annotations_file="${VAL_CAPTION_ANNOTATIONS_FILE}" \
--testdev_annotations_file="${TESTDEV_ANNOTATIONS_FILE}" \
--output_dir="${OUTPUT_DIR}"
mv ${SCRATCH_DIR}/annotations/ ${OUTPUT_DIR}
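# Sketch of the expected layout under ${OUTPUT_DIR} after a successful run
# (shard names follow create_coco_tf_record.py with 256 train / 32 val shards):
#   train-00000-of-00256.tfrecord ... train-00255-of-00256.tfrecord
#   val-00000-of-00032.tfrecord   ... val-00031-of-00032.tfrecord
#   annotations/   (moved from raw-data by the mv above)
#   raw-data/      (downloaded zips and extracted images)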
#!/usr/bin/env bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
mkdir -p /model
cd /model
# DOWNLOAD CHECKPOINTS
## Mask RCNN
## ====================== Mask RCNN ====================== ##
BASE_URL="https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850"
DEST_DIR="mask-rcnn/1555659850"
wget -N ${BASE_URL}/saved_model.pb -P ${DEST_DIR}
wget -N ${BASE_URL}/variables/variables.data-00000-of-00001 -P ${DEST_DIR}/variables
wget -N ${BASE_URL}/variables/variables.index -P ${DEST_DIR}/variables
## ====================== resnet-nhwc-2018-02-07 ====================== ##
BASE_URL="https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07"
DEST_DIR="resnet/resnet-nhwc-2018-02-07"
wget -N ${BASE_URL}/checkpoint -P ${DEST_DIR}
wget -N ${BASE_URL}/model.ckpt-112603.data-00000-of-00001 -P ${DEST_DIR}
wget -N ${BASE_URL}/model.ckpt-112603.index -P ${DEST_DIR}
wget -N ${BASE_URL}/model.ckpt-112603.meta -P ${DEST_DIR}
## ====================== resnet-nhwc-2018-10-14 ====================== ##
#BASE_URL="https://storage.googleapis.com/cloud-tpu-artifacts/resnet/resnet-nhwc-2018-10-14"
#DEST_DIR="resnet/resnet-nhwc-2018-10-14"
#
#wget -N ${BASE_URL}/model.ckpt-112602.data-00000-of-00001 -P ${DEST_DIR}
#wget -N ${BASE_URL}/model.ckpt-112602.index -P ${DEST_DIR}
#wget -N ${BASE_URL}/model.ckpt-112602.meta -P ${DEST_DIR}
# VERIFY CHECKPOINTS
echo "Verifying and Processing Checkpoints..."
python pb_to_ckpt.py \
--frozen_model_filename=mask-rcnn/1555659850/ \
--output_filename=mask-rcnn/1555659850/ckpt/model.ckpt
python extract_RN50_weights.py \
--checkpoint_dir=mask-rcnn/1555659850/ckpt/model.ckpt \
--save_to=resnet/extracted_from_maskrcnn
echo "Generating list of tensors and their shape..."
python inspect_checkpoint.py --file_name=mask-rcnn/1555659850/ckpt/model.ckpt \
> mask-rcnn/1555659850/tensors_and_shape.txt
python inspect_checkpoint.py --file_name=resnet/resnet-nhwc-2018-02-07/model.ckpt-112603 \
> resnet/resnet-nhwc-2018-02-07/tensors_and_shape.txt
#python inspect_checkpoint.py --file_name=resnet/resnet-nhwc-2018-10-14/model.ckpt-112602 \
# > resnet/resnet-nhwc-2018-10-14/tensors_and_shape.txt
python inspect_checkpoint.py --file_name=resnet/extracted_from_maskrcnn/resnet50.ckpt \
> resnet/extracted_from_maskrcnn/tensors_and_shape.txt
echo "Script Finished with Success"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Mask-RCNN anchor definition."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
import numpy as np
import tensorflow as tf
from mask_rcnn.object_detection import argmax_matcher
from mask_rcnn.object_detection import balanced_positive_negative_sampler
from mask_rcnn.object_detection import box_list
from mask_rcnn.object_detection import faster_rcnn_box_coder
from mask_rcnn.object_detection import region_similarity_calculator
from mask_rcnn.object_detection import target_assigner
def _generate_anchor_configs(min_level, max_level, num_scales, aspect_ratios):
"""Generates mapping from output level to a list of anchor configurations.
A configuration is a tuple of (stride, octave_scale, aspect_ratio).
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added
on each level. For instance, num_scales=2 adds two additional
anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: list of tuples representing the aspect ratio anchors added
on each level. For instance, aspect_ratios =
[(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
Returns:
anchor_configs: a dictionary with keys as the levels of anchors and
values as a list of anchor configuration.
"""
anchor_configs = {}
for level in range(min_level, max_level + 1):
anchor_configs[level] = []
for scale_octave in range(num_scales):
for aspect in aspect_ratios:
anchor_configs[level].append(
(2**level, scale_octave / float(num_scales), aspect))
return anchor_configs
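# Worked example (values follow directly from the loops above): with
# min_level=2, max_level=3, num_scales=2 and aspect_ratios=[(1, 1), (1.4, 0.7)],
# the returned dictionary is
#   {2: [(4, 0.0, (1, 1)), (4, 0.0, (1.4, 0.7)),
#        (4, 0.5, (1, 1)), (4, 0.5, (1.4, 0.7))],
#    3: [(8, 0.0, (1, 1)), (8, 0.0, (1.4, 0.7)),
#        (8, 0.5, (1, 1)), (8, 0.5, (1.4, 0.7))]}
# i.e. (stride 2^level, octave offset scale_octave/num_scales, aspect ratio).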
def _generate_anchor_boxes(image_size, anchor_scale, anchor_configs):
"""Generates multiscale anchor boxes.
Args:
image_size: a (height, width) pair giving the input image size. Each
dimension of image_size should be divisible by the largest feature
stride 2^max_level.
anchor_scale: float number representing the scale of size of the base
anchor to the feature stride 2^level.
anchor_configs: a dictionary with keys as the levels of anchors and
values as a list of anchor configuration.
Returns:
anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on all
feature levels.
Raises:
ValueError: input size must be a multiple of the largest feature stride.
"""
boxes_all = []
for _, configs in anchor_configs.items():
boxes_level = []
for config in configs:
stride, octave_scale, aspect = config
if image_size[0] % stride != 0 or image_size[1] % stride != 0:
raise ValueError('input size must be divisible by the stride.')
base_anchor_size = anchor_scale * stride * 2**octave_scale
anchor_size_x_2 = base_anchor_size * aspect[0] / 2.0
anchor_size_y_2 = base_anchor_size * aspect[1] / 2.0
x = np.arange(stride / 2, image_size[1], stride)
y = np.arange(stride / 2, image_size[0], stride)
xv, yv = np.meshgrid(x, y)
xv = xv.reshape(-1)
yv = yv.reshape(-1)
boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
yv + anchor_size_y_2, xv + anchor_size_x_2))
boxes = np.swapaxes(boxes, 0, 1)
boxes_level.append(np.expand_dims(boxes, axis=1))
# concat anchors on the same level into shape [N, A, 4]
boxes_level = np.concatenate(boxes_level, axis=1)
boxes_all.append(boxes_level.reshape([-1, 4]))
anchor_boxes = np.vstack(boxes_all)
return anchor_boxes
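# Worked example (counts implied by the code above): for image_size=(1024, 1024),
# levels 2..6, num_scales=1 and 3 aspect ratios, each level contributes
# (1024 / 2^level)^2 * 3 anchors:
#   level 2: 256*256*3 = 196608    level 3: 128*128*3 = 49152
#   level 4:  64*64*3  =  12288    level 5:  32*32*3  =  3072
#   level 6:  16*16*3  =    768
# so anchor_boxes has shape [261888, 4] under these settings.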
class Anchors(object):
"""Mask-RCNN Anchors class."""
def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size):
"""Constructs multiscale Mask-RCNN anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added
on each level. For instance, num_scales=2 adds two additional
anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: list of tuples representing the aspect ratio anchors added
on each level. For instance, aspect_ratios =
[(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
anchor_scale: float number representing the scale of size of the base
anchor to the feature stride 2^level.
image_size: a (height, width) pair giving the input image size. Each
dimension of image_size should be divisible by the largest feature
stride 2^max_level.
"""
self.min_level = min_level
self.max_level = max_level
self.num_scales = num_scales
self.aspect_ratios = aspect_ratios
self.anchor_scale = anchor_scale
self.image_size = image_size
self.config = self._generate_configs()
self.boxes = self._generate_boxes()
def _generate_configs(self):
"""Generate configurations of anchor boxes."""
return _generate_anchor_configs(self.min_level, self.max_level,
self.num_scales, self.aspect_ratios)
def _generate_boxes(self):
"""Generates multiscale anchor boxes."""
boxes = _generate_anchor_boxes(self.image_size, self.anchor_scale,
self.config)
boxes = tf.convert_to_tensor(value=boxes, dtype=tf.float32)
return boxes
def get_anchors_per_location(self):
return self.num_scales * len(self.aspect_ratios)
def get_unpacked_boxes(self):
return self.unpack_labels(self.boxes)
def unpack_labels(self, labels):
"""Unpacks an array of labels into multiscales labels."""
labels_unpacked = OrderedDict()
count = 0
for level in range(self.min_level, self.max_level + 1):
feat_size0 = int(self.image_size[0] / 2**level)
feat_size1 = int(self.image_size[1] / 2**level)
steps = feat_size0 * feat_size1 * self.get_anchors_per_location()
indices = tf.range(count, count + steps)
count += steps
labels_unpacked[level] = tf.reshape(
tf.gather(labels, indices), [feat_size0, feat_size1, -1])
return labels_unpacked
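# Worked example (same settings as above: 1024x1024 input, levels 2..6,
# 3 anchors per location): level 2 takes the first 256*256*3 = 196608 entries
# and reshapes them to [256, 256, 3] for score targets (or [256, 256, 12] for
# box targets), level 3 takes the next 128*128*3 entries, and so on down to
# [16, 16, ...] at level 6.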
class AnchorLabeler(object):
"""Labeler for multiscale anchor boxes."""
def __init__(self, anchors, num_classes, match_threshold=0.7,
unmatched_threshold=0.3, rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5):
"""Constructs anchor labeler to assign labels to anchors.
Args:
anchors: an instance of class Anchors.
num_classes: integer number representing number of classes in the dataset.
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
rpn_batch_size_per_im: an integer number that represents the number of
sampled anchors per image in the first stage (region proposal network).
rpn_fg_fraction: a float number between 0 and 1 representing the fraction
of positive anchors (foreground) in the first stage.
"""
similarity_calc = region_similarity_calculator.IouSimilarity()
matcher = argmax_matcher.ArgMaxMatcher(
match_threshold,
unmatched_threshold=unmatched_threshold,
negatives_lower_than_unmatched=True,
force_match_for_each_row=True)
box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
self._target_assigner = target_assigner.TargetAssigner(
similarity_calc, matcher, box_coder)
self._anchors = anchors
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._rpn_fg_fraction = rpn_fg_fraction
self._num_classes = num_classes
def _get_rpn_samples(self, match_results):
"""Computes anchor labels.
This function performs subsampling for foreground (fg) and background (bg)
anchors.
Args:
match_results: An integer tensor with shape [N] representing the
matching results of anchors. (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
Returns:
score_targets: an integer tensor with shape [N].
(1) score_targets[i]=1, the anchor is a positive sample.
(2) score_targets[i]=0, the anchor is a negative sample.
(3) score_targets[i]=-1, the anchor is ignored (don't care).
"""
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=self._rpn_fg_fraction, is_static=False))
# indicator includes both positive and negative labels.
# labels includes only positives labels.
# positives = indicator & labels.
# negatives = indicator & !labels.
# ignore = !indicator.
indicator = tf.greater(match_results, -2)
labels = tf.greater(match_results, -1)
samples = sampler.subsample(
indicator, self._rpn_batch_size_per_im, labels)
positive_labels = tf.where(
tf.logical_and(samples, labels),
tf.constant(2, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
negative_labels = tf.where(
tf.logical_and(samples, tf.logical_not(labels)),
tf.constant(1, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
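# ignore_labels (defined next) is -1 everywhere; summing it with
# positive_labels (+2) and negative_labels (+1) yields 1 for sampled
# foreground anchors, 0 for sampled background anchors, and -1 for anchors
# that were not sampled (ignored).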
ignore_labels = tf.fill(match_results.shape, -1)
return (ignore_labels + positive_labels + negative_labels,
positive_labels, negative_labels)
def label_anchors(self, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
"""
gt_box_list = box_list.BoxList(gt_boxes)
anchor_box_list = box_list.BoxList(self._anchors.boxes)
# cls_targets, cls_weights, box_weights are not used
_, _, box_targets, _, matches = self._target_assigner.assign(
anchor_box_list, gt_box_list, gt_labels)
# score_targets contains the subsampled positive and negative anchors.
score_targets, _, _ = self._get_rpn_samples(matches.match_results)
# Unpack labels.
score_targets_dict = self._anchors.unpack_labels(score_targets)
box_targets_dict = self._anchors.unpack_labels(box_targets)
return score_targets_dict, box_targets_dict
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""COCO-style evaluation metrics.
Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
COCO API: github.com/cocodataset/cocoapi/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import copy
import tempfile
import numpy as np
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import pycocotools.mask as maskUtils
import cv2
class MaskCOCO(COCO):
"""COCO object for mask evaluation.
"""
def reset(self, dataset):
"""Reset the dataset and groundtruth data index in this object.
Args:
dataset: dict of groundtruth data. It should have a similar structure to the
COCO groundtruth JSON file and must contain three keys: {'images',
'annotations', 'categories'}.
'images': list of image information dictionary. Required keys: 'id',
'width' and 'height'.
'annotations': list of dict. Bounding boxes and segmentations related
information. Required keys: {'id', 'image_id', 'category_id', 'bbox',
'iscrowd', 'area', 'segmentation'}.
'categories': list of dict of the category information.
Required key: 'id'.
Refer to http://cocodataset.org/#format-data for more details.
Raises:
AttributeError: If the dataset is empty or not a dict.
"""
assert dataset, 'Groundtruth should not be empty.'
assert isinstance(dataset,
dict), 'annotation file format {} not supported'.format(
type(dataset))
self.anns, self.cats, self.imgs = dict(), dict(), dict()
self.dataset = copy.deepcopy(dataset)
self.createIndex()
def loadRes(self, detection_results, include_mask, is_image_mask=False):
"""Load result file and return a result api object.
Args:
detection_results: a dictionary containing predictions results.
include_mask: a boolean, whether to include mask in detection results.
is_image_mask: a boolean, whether the predicted mask is a whole-image mask.
Returns:
res: result MaskCOCO api object
"""
res = MaskCOCO()
res.dataset['images'] = [img for img in self.dataset['images']]
logging.info('Loading and preparing results...')
predictions = self.load_predictions(
detection_results,
include_mask=include_mask,
is_image_mask=is_image_mask)
assert isinstance(predictions, list), 'results is not an array of objects'
if predictions:
image_ids = [pred['image_id'] for pred in predictions]
assert set(image_ids) == (set(image_ids) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if (predictions and 'bbox' in predictions[0] and predictions[0]['bbox']):
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for idx, pred in enumerate(predictions):
bb = pred['bbox']
x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
if 'segmentation' not in pred:
pred['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
pred['area'] = bb[2] * bb[3]
pred['id'] = idx + 1
pred['iscrowd'] = 0
elif 'segmentation' in predictions[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for idx, pred in enumerate(predictions):
# now only support compressed RLE format as segmentation results
pred['area'] = maskUtils.area(pred['segmentation'])
if 'bbox' not in pred:
pred['bbox'] = maskUtils.toBbox(pred['segmentation'])
pred['id'] = idx + 1
pred['iscrowd'] = 0
res.dataset['annotations'] = predictions
res.createIndex()
return res
def load_predictions(self,
detection_results,
include_mask,
is_image_mask=False):
"""Create prediction dictionary list from detection and mask results.
Args:
detection_results: a dictionary containing numpy arrays which correspond
to prediction results.
include_mask: a boolean, whether to include mask in detection results.
is_image_mask: a boolean, whether the predicted mask is a whole-image mask.
Returns:
a list of dictionary including different prediction results from the model
in numpy form.
"""
predictions = []
num_detections = detection_results['detection_scores'].size
current_index = 0
for i, image_id in enumerate(detection_results['source_id']):
if include_mask:
box_coordinates_in_image = detection_results['detection_boxes'][i]
segments = generate_segmentation_from_masks(
detection_results['detection_masks'][i],
box_coordinates_in_image,
int(detection_results['image_info'][i][3]),
int(detection_results['image_info'][i][4]),
is_image_mask=is_image_mask
)
# Convert the mask to uint8 and then to fortranarray for RLE encoder.
encoded_masks = [
maskUtils.encode(np.asfortranarray(instance_mask.astype(np.uint8)))
for instance_mask in segments
]
for box_index in range(int(detection_results['num_detections'][i])):
if current_index % 1000 == 0:
logging.info('{}/{}'.format(current_index, num_detections))
current_index += 1
prediction = {
'image_id': int(image_id),
'bbox': detection_results['detection_boxes'][i][box_index].tolist(),
'score': detection_results['detection_scores'][i][box_index],
'category_id': int(
detection_results['detection_classes'][i][box_index]),
}
if include_mask:
prediction['segmentation'] = encoded_masks[box_index]
predictions.append(prediction)
return predictions
def generate_segmentation_from_masks(masks,
detected_boxes,
image_height,
image_width,
is_image_mask=False):
"""Generates segmentation result from instance masks.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
is_image_mask: bool. True: input masks are whole-image masks. False: input
masks are bounding-box level masks.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
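# Worked example (values follow directly from the formulas above): the box
# [10, 20, 4, 6] in [x1, y1, w, h] form with scale=1.25 has center (12, 23)
# and half sizes (2, 3); scaling gives (2.5, 3.75), so the expanded box is
# [9.5, 19.25, 14.5, 26.75] in [x1, y1, x2, y2] form.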
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
if is_image_mask:
# Process whole-image masks.
im_mask[:, :] = mask[:, :]
else:
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = max(ref_box[0], 0)
x_1 = min(ref_box[2] + 1, image_width)
y_0 = max(ref_box[1], 0)
y_1 = min(ref_box[3] + 1, image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]), (
x_0 - ref_box[0]):(x_1 - ref_box[0])]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
class EvaluationMetric(object):
"""COCO evaluation metric class."""
def __init__(self, filename, include_mask):
"""Constructs COCO evaluation class.
The class provides the interface to metrics_fn in TPUEstimator. The
_evaluate() loads a JSON file in COCO annotation format as the
groundtruths and runs COCO evaluation.
Args:
filename: Ground truth JSON file name. If filename is None, use
groundtruth data passed from the dataloader for evaluation.
include_mask: boolean to indicate whether or not to include mask eval.
"""
if filename:
if filename.startswith('gs://'):
_, local_val_json = tempfile.mkstemp(suffix='.json')
tf.io.gfile.remove(local_val_json)
tf.io.gfile.copy(filename, local_val_json)
atexit.register(tf.io.gfile.remove, local_val_json)
else:
local_val_json = filename
self.coco_gt = MaskCOCO(local_val_json)
self.filename = filename
self.metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
self._include_mask = include_mask
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self.metric_names]
self.metric_names.extend(mask_metric_names)
self._reset()
def _reset(self):
"""Reset COCO API object."""
if self.filename is None and not hasattr(self, 'coco_gt'):
self.coco_gt = MaskCOCO()
def predict_metric_fn(self,
predictions,
is_predict_image_mask=False,
groundtruth_data=None):
"""Generates COCO metrics."""
image_ids = list(set(predictions['source_id']))
if groundtruth_data is not None:
self.coco_gt.reset(groundtruth_data)
coco_dt = self.coco_gt.loadRes(
predictions, self._include_mask, is_image_mask=is_predict_image_mask)
coco_eval = COCOeval(self.coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
# Create another object for instance segmentation metric evaluation.
mcoco_eval = COCOeval(self.coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
# clean up after evaluation is done.
self._reset()
metrics = metrics.astype(np.float32)
metrics_dict = {}
for i, name in enumerate(self.metric_names):
metrics_dict[name] = metrics[i]
return metrics_dict
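# Usage note (derived from load_predictions/predict_metric_fn above): the
# `predictions` argument is a dictionary of numpy arrays keyed by 'source_id',
# 'num_detections', 'detection_boxes', 'detection_classes', 'detection_scores',
# and, when include_mask is True, 'image_info' and 'detection_masks'. A hedged
# sketch of a call (the file name is an example, not a requirement):
#   metric = EvaluationMetric(filename='instances_val2017.json', include_mask=True)
#   metrics_dict = metric.predict_metric_fn(predictions)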
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data loader and processing.
Defines input_fn of Mask-RCNN for TF Estimator. The input_fn includes training
data for category classification, bounding box regression, and number of
positive examples to normalize the loss during training.
"""
import functools
import math
import multiprocessing
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn.utils.distributed_utils import MPI_is_distributed
from mask_rcnn.utils.distributed_utils import MPI_rank_and_size
from mask_rcnn.utils.distributed_utils import MPI_rank
from mask_rcnn.utils.distributed_utils import MPI_size
# common functions
from mask_rcnn.dataloader_utils import dataset_parser
from distutils.version import LooseVersion
class InputReader(object):
"""Input reader for dataset."""
def __init__(
self,
file_pattern,
mode=tf.estimator.ModeKeys.TRAIN,
num_examples=0,
use_fake_data=False,
use_instance_mask=False,
seed=None
):
self._mode = mode
self._file_pattern = file_pattern
self._num_examples = num_examples
self._use_fake_data = use_fake_data
self._use_instance_mask = use_instance_mask
self._seed = seed
def _create_dataset_parser_fn(self, params):
"""Create parser for parsing input data (dictionary)."""
return functools.partial(
dataset_parser,
mode=self._mode,
params=params,
use_instance_mask=self._use_instance_mask,
seed=self._seed
)
def __call__(self, params, input_context=None):
batch_size = params['batch_size'] if 'batch_size' in params else 1
try:
seed = params['seed'] if not MPI_is_distributed() else params['seed'] * MPI_rank()
except (KeyError, TypeError):
seed = None
if MPI_is_distributed():
n_gpus = MPI_size()
elif input_context is not None:
n_gpus = input_context.num_input_pipelines
else:
n_gpus = 1
##################################################
dataset = tf.data.Dataset.list_files(
self._file_pattern,
shuffle=False
)
if self._mode == tf.estimator.ModeKeys.TRAIN:
if input_context is not None:
logging.info("Using Dataset Sharding with TF Distributed")
_num_shards = input_context.num_input_pipelines
_shard_idx = input_context.input_pipeline_id
elif MPI_is_distributed():
logging.info("Using Dataset Sharding with Horovod")
_shard_idx, _num_shards = MPI_rank_and_size()
try:
dataset = dataset.shard(
num_shards=_num_shards,
index=_shard_idx
)
dataset = dataset.shuffle(math.ceil(256 / _num_shards))
except NameError: # Not a distributed training setup
pass
def _prefetch_dataset(filename):
return tf.data.TFRecordDataset(filename).prefetch(1)
dataset = dataset.interleave(
map_func=_prefetch_dataset,
cycle_length=32,
block_length=64,
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
if self._num_examples is not None and self._num_examples > 0:
logging.info("[*] Limiting the amount of sample to: %d" % self._num_examples)
dataset = dataset.take(self._num_examples)
dataset = dataset.cache()
if self._mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(
buffer_size=4096,
reshuffle_each_iteration=True,
seed=seed
)
dataset = dataset.repeat()
# Parse the fetched records to input tensors for model function.
dataset = dataset.map(
map_func=self._create_dataset_parser_fn(params),
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
dataset = dataset.batch(
batch_size=batch_size,
drop_remainder=True
)
if self._use_fake_data:
# Turn this dataset into a semi-fake dataset which always loops over the
# first batch. This reduces variance in performance and is useful for
# testing.
logging.info("Using Fake Dataset Loop...")
dataset = dataset.take(1).cache().repeat()
if self._mode != tf.estimator.ModeKeys.TRAIN:
dataset = dataset.take(int(5000 / batch_size))
dataset = dataset.prefetch(
buffer_size=tf.data.experimental.AUTOTUNE,
)
if self._mode == tf.estimator.ModeKeys.PREDICT or n_gpus > 1:
if not tf.distribute.has_strategy():
dataset = dataset.apply(
tf.data.experimental.prefetch_to_device(
'/gpu:0', # With Horovod the local GPU is always 0
buffer_size=1,
)
)
data_options = tf.data.Options()
data_options.experimental_deterministic = seed is not None
if LooseVersion(tf.__version__) <= LooseVersion("2.0.0"):
data_options.experimental_distribute.auto_shard = False
else:
data_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
# data_options.experimental_distribute.auto_shard = False
data_options.experimental_slack = True
data_options.experimental_threading.max_intra_op_parallelism = 1
# data_options.experimental_threading.private_threadpool_size = int(multiprocessing.cpu_count() / n_gpus) * 2
# ================= experimental_optimization ================= #
data_options.experimental_optimization.apply_default_optimizations = False
# data_options.experimental_optimization.autotune = True
data_options.experimental_optimization.filter_fusion = True
data_options.experimental_optimization.map_and_batch_fusion = True
data_options.experimental_optimization.map_and_filter_fusion = True
data_options.experimental_optimization.map_fusion = True
data_options.experimental_optimization.map_parallelization = True
map_vectorization_options = tf.data.experimental.MapVectorizationOptions()
map_vectorization_options.enabled = True
map_vectorization_options.use_choose_fastest = True
data_options.experimental_optimization.map_vectorization = map_vectorization_options
data_options.experimental_optimization.noop_elimination = True
data_options.experimental_optimization.parallel_batch = True
data_options.experimental_optimization.shuffle_and_repeat_fusion = True
# ========== Stats on TF Data =============
# aggregator = tf.data.experimental.StatsAggregator()
# data_options.experimental_stats.aggregator = aggregator
# data_options.experimental_stats.latency_all_edges = True
dataset = dataset.with_options(data_options)
return dataset
if __name__ == "__main__":
'''
Data Loading Benchmark Usage:
# Real Data - Training
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=2 \
--warmup_steps=200 \
--benchmark_steps=2000 \
--training
# Real Data - Inference
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=8 \
--warmup_steps=200 \
--benchmark_steps=2000
# --------------- #
# Synthetic Data - Training
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=2 \
--warmup_steps=200 \
--benchmark_steps=2000 \
--training \
--use_synthetic_data
# Synthetic Data - Inference
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=8 \
--warmup_steps=200 \
--benchmark_steps=2000 \
--use_synthetic_data
# --------------- #
'''
import os
import time
import argparse
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.disable_eager_execution()
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
logging.set_verbosity(logging.INFO)
parser = argparse.ArgumentParser(description="MaskRCNN Dataloader Benchmark")
parser.add_argument(
'--data_dir', required=True, type=str, help="Directory path which contains the preprocessed COCO dataset in TFRecord format"
)
parser.add_argument(
'--batch_size', default=64, type=int, required=True, help="""Batch size used to measure performance."""
)
parser.add_argument(
'--warmup_steps',
default=200,
type=int,
required=True,
help="""Number of steps considered as warmup and not taken into account for performance measurements."""
)
parser.add_argument(
'--benchmark_steps',
default=200,
type=int,
required=True,
help="Number of steps used to benchmark dataloading performance. Only used in training"
)
parser.add_argument(
'--seed',
default=666,
type=int,
required=False,
help="""Reproducibility Seed."""
)
parser.add_argument("--training", default=False, action="store_true", help="Benchmark in training mode")
parser.add_argument("--use_synthetic_data", default=False, action="store_true", help="Use synthetic dataset")
FLAGS, unknown_args = parser.parse_known_args()
if len(unknown_args) > 0:
for bad_arg in unknown_args:
print("ERROR: Unknown command line arg: %s" % bad_arg)
raise ValueError("Invalid command line arg(s)")
BURNIN_STEPS = FLAGS.warmup_steps
if FLAGS.training:
TOTAL_STEPS = FLAGS.warmup_steps + FLAGS.benchmark_steps
else:
TOTAL_STEPS = int(1e6) # Wait for end of dataset
if FLAGS.training:
input_dataset = InputReader(
file_pattern=os.path.join(FLAGS.data_dir, "train*.tfrecord"),
mode=tf.estimator.ModeKeys.TRAIN,
use_fake_data=FLAGS.use_synthetic_data,
use_instance_mask=True,
seed=FLAGS.seed
)
else:
input_dataset = InputReader(
file_pattern=os.path.join(FLAGS.data_dir, "val*.tfrecord"),
mode=tf.estimator.ModeKeys.PREDICT,
num_examples=5000,
use_fake_data=FLAGS.use_synthetic_data,
use_instance_mask=True,
seed=FLAGS.seed
)
logging.info("[*] Executing Benchmark in %s mode" % ("training" if FLAGS.training else "inference"))
logging.info("[*] Benchmark using %s data" % ("synthetic" if FLAGS.use_synthetic_data else "real"))
time.sleep(1)
# Build the data input
dataset = input_dataset(
params={
"anchor_scale": 8.0,
"aspect_ratios": [[1.0, 1.0], [1.4, 0.7], [0.7, 1.4]],
"batch_size": FLAGS.batch_size,
"gt_mask_size": 112,
"image_size": [1024, 1024],
"include_groundtruth_in_features": False,
"augment_input_data": True,
"max_level": 6,
"min_level": 2,
"num_classes": 91,
"num_scales": 1,
"rpn_batch_size_per_im": 256,
"rpn_fg_fraction": 0.5,
"rpn_min_size": 0.,
"rpn_nms_threshold": 0.7,
"rpn_negative_overlap": 0.3,
"rpn_positive_overlap": 0.7,
"rpn_post_nms_topn": 1000,
"rpn_pre_nms_topn": 2000,
"skip_crowd_during_training": True,
"use_category": True,
"visualize_images_summary": False,
}
)
dataset_iterator = dataset.make_initializable_iterator()
if FLAGS.training:
X, Y = dataset_iterator.get_next()
else:
X = dataset_iterator.get_next()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = False
with tf.device("gpu:0"):
X_gpu_ops = list()
Y_gpu_ops = list()
if FLAGS.training:
for _, _x in X.items():
X_gpu_ops.append(tf.identity(_x))
for _, _y in Y.items():
Y_gpu_ops.append(tf.identity(_y))
else:
for _, _x in X["features"].items():
X_gpu_ops.append(tf.identity(_x))
with tf.control_dependencies(X_gpu_ops + Y_gpu_ops):
input_op = tf.constant(1.0)
with tf.compat.v1.Session(config=config) as sess:
sess.run(dataset_iterator.initializer)
sess.run(tf.compat.v1.global_variables_initializer())
total_files_processed = 0
img_per_sec_arr = []
processing_time_arr = []
processing_start_time = time.time()
for step in range(TOTAL_STEPS):
try:
start_time = time.time()
sess.run(input_op)
elapsed_time = (time.time() - start_time) * 1000
imgs_per_sec = (FLAGS.batch_size / elapsed_time) * 1000
total_files_processed += FLAGS.batch_size
if (step + 1) > BURNIN_STEPS:
processing_time_arr.append(elapsed_time)
img_per_sec_arr.append(imgs_per_sec)
if (step + 1) % 20 == 0 or (step + 1) == TOTAL_STEPS:
print(
"[STEP %04d] # Batch Size: %03d - Time: %03d msecs - Speed: %6d img/s" %
(step + 1, FLAGS.batch_size, elapsed_time, imgs_per_sec)
)
except tf.errors.OutOfRangeError:
break
processing_time = time.time() - processing_start_time
avg_processing_speed = np.mean(img_per_sec_arr)
print("\n###################################################################")
print("*** Data Loading Performance Metrics ***\n")
print("\t=> Number of Steps: %d" % (step + 1))
print("\t=> Batch Size: %d" % FLAGS.batch_size)
print("\t=> Files Processed: %d" % total_files_processed)
print("\t=> Total Execution Time: %d secs" % processing_time)
print("\t=> Median Time per step: %3d msecs" % np.median(processing_time_arr))
print("\t=> Median Processing Speed: %d images/secs" % np.median(img_per_sec_arr))
print("\t=> Median Processing Time: %.2f msecs/image" % (1 / float(np.median(img_per_sec_arr)) * 1000))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader and processing.
Defines input_fn of Mask-RCNN for TF Estimator. The input_fn includes training
data for category classification, bounding box regression, and number of
positive examples to normalize the loss during training.
"""
import tensorflow as tf
from mask_rcnn import anchors
from mask_rcnn.utils import coco_utils
from mask_rcnn.ops import preprocess_ops
from mask_rcnn.object_detection import tf_example_decoder
MAX_NUM_INSTANCES = 100
MAX_NUM_VERTICES_PER_INSTANCE = 1500
MAX_NUM_POLYGON_LIST_LEN = 2 * MAX_NUM_VERTICES_PER_INSTANCE * MAX_NUM_INSTANCES
POLYGON_PAD_VALUE = coco_utils.POLYGON_PAD_VALUE
__all__ = [
# dataset parser
"dataset_parser",
# common functions
"preprocess_image",
"process_groundtruth_is_crowd",
"process_source_id",
# eval
"prepare_labels_for_eval",
# training
"augment_image",
"process_boxes_classes_indices_for_training",
"process_gt_masks_for_training",
"process_labels_for_training",
"process_targets_for_training"
]
###############################################################################################################
def dataset_parser(value, mode, params, use_instance_mask, seed=None, regenerate_source_id=False):
"""Parse data to a fixed dimension input image and learning targets.
Args:
value: a serialized tf.Example record containing an image and groundtruth annotations.
Returns:
features: a dictionary that contains the image and auxiliary
information. The following describes {key: value} pairs in the
dictionary.
image: Image tensor that is preprocessed to have normalized value and
fixed dimension [image_size, image_size, 3]
image_info: image information that includes the original height and
width, the scale of the processed image to the original image, and
the scaled height and width.
source_ids: Source image id. Default value -1 if the source id is
empty in the groundtruth annotation.
labels: a dictionary that contains auxiliary information plus (optional)
labels. The following describes {key: value} pairs in the dictionary.
`labels` is only for training.
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of objectiveness score at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
gt_boxes: Groundtruth bounding box annotations. The box is represented
in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
fixed dimension [MAX_NUM_INSTANCES, 4].
gt_classes: Groundtruth classes annotations. The tensor is padded
with -1 to the fixed dimension [MAX_NUM_INSTANCES].
cropped_gt_masks: groundtruth masks cropped by the bounding box and
resized to a fixed size determined by params['gt_mask_size']
regenerate_source_id: `bool`, if True TFExampleParser will use hashed
value of `image/encoded` for `image/source_id`.
"""
if mode not in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.PREDICT, tf.estimator.ModeKeys.EVAL]:
raise ValueError("Unknown execution mode received: %s" % mode)
def create_example_decoder():
return tf_example_decoder.TfExampleDecoder(
use_instance_mask=use_instance_mask,
regenerate_source_id=regenerate_source_id
)
example_decoder = create_example_decoder()
with tf.xla.experimental.jit_scope(compile_ops=True):
with tf.name_scope('parser'):
data = example_decoder.decode(value)
data['groundtruth_is_crowd'] = process_groundtruth_is_crowd(data)
image = tf.image.convert_image_dtype(data['image'], dtype=tf.float32)
source_id = process_source_id(data['source_id'])
if mode == tf.estimator.ModeKeys.PREDICT:
features = {
'source_ids': source_id,
}
if params['visualize_images_summary']:
features['orig_images'] = tf.image.resize(image, params['image_size'])
features["images"], features["image_info"], _, _ = preprocess_image(
image,
boxes=None,
instance_masks=None,
image_size=params['image_size'],
max_level=params['max_level'],
augment_input_data=False,
seed=seed
)
if params['include_groundtruth_in_features']:
labels = prepare_labels_for_eval(
data,
target_num_instances=MAX_NUM_INSTANCES,
target_polygon_list_len=MAX_NUM_POLYGON_LIST_LEN,
use_instance_mask=params['include_mask']
)
return {'features': features, 'labels': labels}
else:
return {'features': features}
elif mode == tf.estimator.ModeKeys.TRAIN:
labels = {}
features = {
'source_ids': source_id
}
boxes, classes, indices, instance_masks = process_boxes_classes_indices_for_training(
data,
skip_crowd_during_training=params['skip_crowd_during_training'],
use_category=params['use_category'],
use_instance_mask=use_instance_mask
)
image, image_info, boxes, instance_masks = preprocess_image(
image,
boxes=boxes,
instance_masks=instance_masks,
image_size=params['image_size'],
max_level=params['max_level'],
augment_input_data=params['augment_input_data'],
seed=seed
)
features.update({
'images': image,
'image_info': image_info,
})
padded_image_size = image.get_shape().as_list()[:2]
# Pads cropped_gt_masks.
if use_instance_mask:
labels['cropped_gt_masks'] = process_gt_masks_for_training(
instance_masks,
boxes,
gt_mask_size=params['gt_mask_size'],
padded_image_size=padded_image_size,
max_num_instances=MAX_NUM_INSTANCES
)
with tf.xla.experimental.jit_scope(compile_ops=False):
# Assign anchors.
(score_targets, box_targets), input_anchor = process_targets_for_training(
padded_image_size=padded_image_size,
boxes=boxes,
classes=classes,
params=params
)
additional_labels = process_labels_for_training(
image_info, boxes, classes, score_targets, box_targets,
max_num_instances=MAX_NUM_INSTANCES,
min_level=params["min_level"],
max_level=params["max_level"]
)
labels.update(additional_labels)
# labels["input_anchor"] = input_anchor
# Features
# {
# 'source_ids': <tf.Tensor 'parser/StringToNumber:0' shape=() dtype=float32>,
# 'images': <tf.Tensor 'parser/pad_to_bounding_box/Squeeze:0' shape=(1024, 1024, 3) dtype=float32>,
# 'image_info': <tf.Tensor 'parser/stack_1:0' shape=(5,) dtype=float32>
# }
FAKE_FEATURES = False
if FAKE_FEATURES:
labels["source_ids"] = tf.ones(shape=(), dtype=tf.float32)
labels["images"] = tf.ones(shape=(1024, 1024, 3), dtype=tf.float32)
labels["image_info"] = tf.ones(shape=(5,), dtype=tf.float32)
# Labels
# {
# 'cropped_gt_masks': <tf.Tensor 'parser/Reshape_4:0' shape=(100, 116, 116) dtype=float32>,
# 'score_targets_2': <tf.Tensor 'parser/Reshape_9:0' shape=(256, 256, 3) dtype=int32>,
# 'box_targets_2': <tf.Tensor 'parser/Reshape_14:0' shape=(256, 256, 12) dtype=float32>,
# 'score_targets_3': <tf.Tensor 'parser/Reshape_10:0' shape=(128, 128, 3) dtype=int32>,
# 'box_targets_3': <tf.Tensor 'parser/Reshape_15:0' shape=(128, 128, 12) dtype=float32>,
# 'score_targets_4': <tf.Tensor 'parser/Reshape_11:0' shape=(64, 64, 3) dtype=int32>,
# 'box_targets_4': <tf.Tensor 'parser/Reshape_16:0' shape=(64, 64, 12) dtype=float32>,
# 'score_targets_5': <tf.Tensor 'parser/Reshape_12:0' shape=(32, 32, 3) dtype=int32>,
# 'box_targets_5': <tf.Tensor 'parser/Reshape_17:0' shape=(32, 32, 12) dtype=float32>,
# 'score_targets_6': <tf.Tensor 'parser/Reshape_13:0' shape=(16, 16, 3) dtype=int32>,
# 'box_targets_6': <tf.Tensor 'parser/Reshape_18:0' shape=(16, 16, 12) dtype=float32>,
# 'gt_boxes': <tf.Tensor 'parser/Reshape_20:0' shape=(100, 4) dtype=float32>,
# 'gt_classes': <tf.Tensor 'parser/Reshape_22:0' shape=(100, 1) dtype=float32>
# }
FAKE_LABELS = False
if FAKE_LABELS:
labels["cropped_gt_masks"] = tf.ones(shape=(100, 116, 116), dtype=tf.float32)
labels["gt_boxes"] = tf.ones(shape=(100, 4), dtype=tf.float32)
labels["gt_classes"] = tf.ones(shape=(100, 1), dtype=tf.float32)
idx = 1
for dim in [256, 128, 64, 32, 16]:
idx += 1 # Starts at 2
labels["score_targets_%d" % idx] = tf.ones(shape=(dim, dim, 3), dtype=tf.float32)
labels["box_targets_%d" % idx] = tf.ones(shape=(dim, dim, 12), dtype=tf.float32)
return features, labels
###############################################################################################################
# common functions
def preprocess_image(image, boxes, instance_masks, image_size, max_level, augment_input_data=False, seed=None):
image = preprocess_ops.normalize_image(image)
if augment_input_data:
image, boxes, instance_masks = augment_image(image=image, boxes=boxes, instance_masks=instance_masks, seed=seed)
# Scaling and padding.
image, image_info, boxes, instance_masks = preprocess_ops.resize_and_pad(
image=image,
target_size=image_size,
stride=2 ** max_level,
boxes=boxes,
masks=instance_masks
)
return image, image_info, boxes, instance_masks
def process_groundtruth_is_crowd(data):
return tf.cond(
pred=tf.greater(tf.size(input=data['groundtruth_is_crowd']), 0),
true_fn=lambda: data['groundtruth_is_crowd'],
false_fn=lambda: tf.zeros_like(data['groundtruth_classes'], dtype=tf.bool)
)
# def process_source_id(data):
# source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id)
# source_id = tf.strings.to_number(source_id)
# return source_id
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
tf.equal(tf.size(source_id), 0),
lambda: tf.cast(tf.constant(-1), tf.int64),
lambda: tf.identity(source_id)
)
return source_id
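# Example (follows from the branches above): a string id such as '397133' is
# converted to the int64 scalar 397133, while an empty source_id (size 0) is
# mapped to the sentinel value -1.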
# eval
def prepare_labels_for_eval(
data,
target_num_instances=MAX_NUM_INSTANCES,
target_polygon_list_len=MAX_NUM_POLYGON_LIST_LEN,
use_instance_mask=False
):
"""Create labels dict for infeed from data of tf.Example."""
image = data['image']
height = tf.shape(input=image)[0]
width = tf.shape(input=image)[1]
boxes = data['groundtruth_boxes']
classes = tf.cast(data['groundtruth_classes'], dtype=tf.float32)
num_labels = tf.shape(input=classes)[0]
boxes = preprocess_ops.pad_to_fixed_size(boxes, -1, [target_num_instances, 4])
classes = preprocess_ops.pad_to_fixed_size(classes, -1, [target_num_instances, 1])
is_crowd = tf.cast(data['groundtruth_is_crowd'], dtype=tf.float32)
is_crowd = preprocess_ops.pad_to_fixed_size(is_crowd, 0, [target_num_instances, 1])
labels = dict()
labels['width'] = width
labels['height'] = height
labels['groundtruth_boxes'] = boxes
labels['groundtruth_classes'] = classes
labels['num_groundtruth_labels'] = num_labels
labels['groundtruth_is_crowd'] = is_crowd
if use_instance_mask:
data['groundtruth_polygons'] = preprocess_ops.pad_to_fixed_size(
data=data['groundtruth_polygons'],
pad_value=POLYGON_PAD_VALUE,
output_shape=[target_polygon_list_len, 1]
)
if 'groundtruth_area' in data:
labels['groundtruth_area'] = preprocess_ops.pad_to_fixed_size(
data=data['groundtruth_area'],
pad_value=0,
output_shape=[target_num_instances, 1]
)
return labels
# training
def augment_image(image, boxes, instance_masks, seed):
flipped_results = preprocess_ops.random_horizontal_flip(
image,
boxes=boxes,
masks=instance_masks,
seed=seed
)
if instance_masks is not None:
image, boxes, instance_masks = flipped_results
else:
image, boxes = flipped_results
# image = tf.image.random_brightness(image, max_delta=0.1, seed=seed)
# image = tf.image.random_contrast(image, lower=0.9, upper=1.1, seed=seed)
# image = tf.image.random_saturation(image, lower=0.9, upper=1.1, seed=seed)
# image = tf.image.random_jpeg_quality(image, min_jpeg_quality=80, max_jpeg_quality=100, seed=seed)
return image, boxes, instance_masks
def process_boxes_classes_indices_for_training(data, skip_crowd_during_training, use_category, use_instance_mask):
boxes = data['groundtruth_boxes']
classes = data['groundtruth_classes']
classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
indices = None
instance_masks = None
if not use_category:
classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)
if skip_crowd_during_training:
indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
classes = tf.gather_nd(classes, indices)
boxes = tf.gather_nd(boxes, indices)
if use_instance_mask:
instance_masks = tf.gather_nd(data['groundtruth_instance_masks'], indices)
return boxes, classes, indices, instance_masks
def process_gt_masks_for_training(instance_masks, boxes, gt_mask_size, padded_image_size, max_num_instances):
cropped_gt_masks = preprocess_ops.crop_gt_masks(
instance_masks=instance_masks,
boxes=boxes,
gt_mask_size=gt_mask_size,
image_size=padded_image_size
)
# cropped_gt_masks = tf.reshape(cropped_gt_masks, [max_num_instances, -1])
cropped_gt_masks = preprocess_ops.pad_to_fixed_size(
data=cropped_gt_masks,
pad_value=-1,
output_shape=[max_num_instances, (gt_mask_size + 4) ** 2]
)
return tf.reshape(cropped_gt_masks, [max_num_instances, gt_mask_size + 4, gt_mask_size + 4])
def process_labels_for_training(
image_info, boxes, classes,
score_targets, box_targets,
max_num_instances, min_level, max_level
):
labels = {}
# Pad groundtruth data.
# boxes *= image_info[2]
boxes = preprocess_ops.pad_to_fixed_size(boxes, -1, [max_num_instances, 4])
classes = preprocess_ops.pad_to_fixed_size(classes, -1, [max_num_instances, 1])
for level in range(min_level, max_level + 1):
labels['score_targets_%d' % level] = score_targets[level]
labels['box_targets_%d' % level] = box_targets[level]
labels['gt_boxes'] = boxes
labels['gt_classes'] = classes
return labels
def process_targets_for_training(padded_image_size, boxes, classes, params):
input_anchors = anchors.Anchors(
params['min_level'],
params['max_level'],
params['num_scales'],
params['aspect_ratios'],
params['anchor_scale'],
padded_image_size
)
anchor_labeler = anchors.AnchorLabeler(
input_anchors,
params['num_classes'],
params['rpn_positive_overlap'],
params['rpn_negative_overlap'],
params['rpn_batch_size_per_im'],
params['rpn_fg_fraction']
)
return anchor_labeler.label_anchors(boxes, classes), input_anchors