Commit d0d91e12 authored by huchen

Merge branch 'tf2' into 'main'

tf2 detection

See merge request dcutoolkit/deeplearing/dlexamples_new!2
parents 2795dc1f c320b6ef
# Default ignored files
/shelf/
/workspace.xml
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="format" value="PLAIN" />
<option name="myDocStringFormat" value="Plain" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/MaskRCNN.iml" filepath="$PROJECT_DIR$/.idea/MaskRCNN.iml" />
</modules>
</component>
</project>
\ No newline at end of file
import torch
import torchvision
from torchvision.ops import nms

# Print environment information to confirm the GPU build of PyTorch is usable.
print(torch.__version__)
print(torchvision.__version__)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

# Three boxes in (x1, y1, x2, y2) format; the last two overlap almost exactly.
boxes = torch.Tensor([[1, 1, 2, 2], [1, 1, 3.100001, 3], [1, 1, 3.1, 3]])
# Confidence score for each box.
scores = torch.Tensor([0.9, 0.98, 0.980005])
# Keep the indices of boxes that survive NMS with an IoU threshold of 0.4.
keep = nms(boxes, scores, 0.4)
print(keep)
print(boxes[keep])
\ No newline at end of file
#===============================================================================
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# ==============================================================================
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.06-tf1-py3
FROM ${FROM_IMAGE_NAME}
ENV DEBIAN_FRONTEND=noninteractive
RUN rm -rf /workspace && mkdir -p /workspace
ADD . /workspace
WORKDIR /workspace
RUN apt-get update && \
apt-get install -y libsm6 libxext6 libxrender-dev python3-tk cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Make sure python and pip point to python3 and pip3
RUN python -m pip install --upgrade pip && \
    pip --no-cache-dir install \
Cython \
matplotlib \
opencv-python-headless \
mpi4py \
Pillow \
pytest \
pyyaml && \
git clone https://github.com/pybind/pybind11 /opt/pybind11 && \
cd /opt/pybind11 && cmake . && make install && pip install . && \
    pip --no-cache-dir install \
'git+https://github.com/NVIDIA/cocoapi#egg=pycocotools&subdirectory=PythonAPI' && \
    pip --no-cache-dir install \
'git+https://github.com/NVIDIA/dllogger'
# Update protobuf 3 to 3.3.0
RUN \
curl -OL https://github.com/protocolbuffers/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip && \
unzip -u protoc-3.3.0-linux-x86_64.zip -d protoc3 && \
mv protoc3/bin/* /usr/local/bin/ && \
mv protoc3/include/* /usr/local/include/
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
\ No newline at end of file
# Introduction
* Train the Mask R-CNN model with TensorFlow
<br>
# Environment Setup
## 1) Install packages
* Install TensorFlow 1.15 in the ROCm 3.3 environment
* Install pycocotools
```
pip3 install pycocotools -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
```
* Update pandas
```
pip3 install -U pandas -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
```
* Install dllogger
```
git clone --recursive https://github.com/NVIDIA/dllogger.git
cd dllogger
python3 setup.py install
```
<br>
## 2) Data preprocessing (train and val)
```
cd dataset/
git clone http://github.com/tensorflow/models tf-models
cd tf-models/research
wget -O protobuf.zip https://github.com/google/protobuf/releases/download/v3.0.0/protoc-3.0.0-linux-x86_64.zip
unzip protobuf.zip
./bin/protoc object_detection/protos/*.proto --python_out=.
```
Return to the `dataset` directory, open `create_coco_tf_record.py` (for example with `vim create_coco_tf_record.py`), and comment out lines 310 and 316.
<br>
```
PYTHONPATH="tf-models:tf-models/research" python3 create_coco_tf_record.py \
--logtostderr \
--include_masks \
--train_image_dir=/path/to/COCO2017/images/train2017 \
--val_image_dir=/path/to/COCO2017/images/val2017 \
--train_object_annotations_file=/path/to/COCO2017/annotations/instances_train2017.json \
--val_object_annotations_file=/path/to/COCO2017/annotations/instances_val2017.json \
--train_caption_annotations_file=/path/to/COCO2017/annotations/captions_train2017.json \
--val_caption_annotations_file=/path/to/COCO2017/annotations/captions_val2017.json \
--output_dir=coco2017_tfrecord
```
This generates the `coco2017_tfrecord` folder.
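As an optional sanity check (a sketch, not part of the original instructions), you can count the records in one of the generated shards. The shard filename below assumes the script's default sharding (256 train shards), and iterating the dataset directly assumes eager execution (TensorFlow 2.x, or `tf.enable_eager_execution()` under TensorFlow 1.15).
```python
# Optional sanity check: count examples in one generated training shard.
# Filename assumes the default 256-shard naming used by create_coco_tf_record.py.
import tensorflow as tf

path = "coco2017_tfrecord/train-00000-of-00256.tfrecord"
num_examples = sum(1 for _ in tf.data.TFRecordDataset(path))
print(num_examples, "examples in", path)
```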
## 3) Download the pre-trained models
<br>
The downloaded model files should be organized as follows:
```
weights/
>mask-rcnn/1555659850/
https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850/saved_model.pb
>>variables/
https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850/variables/variables.data-00000-of-00001
https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850/variables/variables.index
>resnet/
>>extracted_from_maskrcnn/
>>resnet-nhwc-2018-02-07/
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/checkpoint
>>>model.ckpt-112603/
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/model.ckpt-112603.data-00000-of-00001
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/model.ckpt-112603.index
https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07/model.ckpt-112603.meta
>>resnet-nhwc-2018-10-14/
```
# Testing
## Single-card training
```
python3 scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4}
python3 scripts/benchmark_training.py --gpus 1 --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
```
## Multi-card training
```
python3 scripts/benchmark_training.py --gpus 2 --batch_size 4 --model_dir save_model_2dcu --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
```
## Inference
```
python3 scripts/benchmark_inference.py --batch_size 2 --model_dir save_model --data_dir /public/home/tianlh/AI-application/Tensorflow/MaskRCNN_tf2/dataset/coco2017_tfrecord --weights_dir weights
```
# References
[https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN](https://github.com/NVIDIA/DeepLearningExamples/tree/master/TensorFlow2/Segmentation/MaskRCNN)
\ No newline at end of file
# Mask R-CNN For TensorFlow
This repository provides a script and recipe to train the Mask R-CNN model for TensorFlow to achieve state-of-the-art accuracy; it is tested and maintained by NVIDIA.
## Table of Contents
- [Model overview](#model-overview)
* [Model architecture](#model-architecture)
* [Default configuration](#default-configuration)
* [Feature support matrix](#feature-support-matrix)
* [Features](#features)
* [Mixed precision training](#mixed-precision-training)
* [Enabling mixed precision](#enabling-mixed-precision)
* [Enabling TF32](#enabling-tf32)
- [Setup](#setup)
* [Requirements](#requirements)
- [Quick Start Guide](#quick-start-guide)
- [Advanced](#advanced)
* [Scripts and sample code](#scripts-and-sample-code)
* [Parameters](#parameters)
* [Command-line options](#command-line-options)
* [Getting the data](#getting-the-data)
* [Dataset guidelines](#dataset-guidelines)
* [Multi-dataset](#multi-dataset)
* [Training process](#training-process)
* [Inference process](#inference-process)
- [Performance](#performance)
* [Benchmarking](#benchmarking)
* [Training performance benchmark](#training-performance-benchmark)
* [Inference performance benchmark](#inference-performance-benchmark)
* [Results](#results)
* [Training accuracy results TensorFlow 1.1x](#training-accuracy-results-tensorflow-11x)
* [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16GB)
* [Training performance results Tensorflow 1.1x](#training-performance-results-tensorflow-11x)
* [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb)
* [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb)
* [Training accuracy results TensorFlow 2.x](#training-accuracy-results-tensorflow-2x)
* [Training accuracy: NVIDIA DGX A100 (8x A100 40GB)](#training-accuracy-nvidia-dgx-a100-8x-a100-40gb-1)
* [Training accuracy: NVIDIA DGX-1 (8x V100 16GB)](#training-accuracy-nvidia-dgx-1-8x-v100-16gb-1)
* [Training performance results Tensorflow 2.x](#training-performance-results-tensorflow-2x)
* [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb-1)
* [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb-1)
* [Inference performance results TensorFlow 1.1x](#inference-performance-results-tensorflow-11x)
* [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb)
* [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb)
* [Inference performance results TensorFlow 2.x](#inference-performance-results-tensorflow-2x)
* [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb-1)
* [Inference performance: NVIDIA DGX-1 (1x V100 16GB)](#inference-performance-nvidia-dgx-1-1x-v100-16gb-1)
- [Release notes](#release-notes)
* [Changelog](#changelog)
* [Known issues](#known-issues)
## Model overview
Mask R-CNN is a convolution-based neural network for the task of object instance segmentation. The paper describing the model can be found [here](https://arxiv.org/abs/1703.06870). NVIDIA’s Mask R-CNN 20.06 is an optimized version of [Google's TPU implementation](https://github.com/tensorflow/tpu/tree/master/models/official/mask_rcnn), leveraging mixed precision arithmetic using Tensor Cores on NVIDIA Volta, Turing, and Ampere GPUs while maintaining target accuracy.
This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Therefore, researchers can get results 2.2x faster than training without Tensor Cores, while experiencing the benefits of mixed precision training. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time.
This repository also contains scripts to interactively launch training,
benchmarking and inference routines in a Docker container.
The major differences between the official implementation of the paper and our version of Mask R-CNN are as follows:
- Mixed precision support with [TensorFlow AMP](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-user-guide/index.html#tfamp).
- Gradient accumulation to simulate larger batches.
- Custom fused CUDA kernels for faster computations.
There are other publicly available implementations of Mask R-CNN:
- [NVIDIA PyTorch implementation](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Segmentation/MaskRCNN)
- [Matterport](https://github.com/matterport/Mask_RCNN)
- [Tensorpack](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN)
### Model architecture
Mask R-CNN builds on top of Faster R-CNN, adding an additional mask head for the task of image segmentation.
The architecture consists of the following:
- ResNet-50 backbone with Feature Pyramid Network (FPN)
- Region proposal network (RPN) head
- RoI Align
- Bounding box and classification head
- Mask head
### Default configuration
The Mask R-CNN configuration and the hyper-parameters for training and testing purposes are in separate files.
The default configuration of this model can be found at `mask-rcnn/hyperparameters/mask_rcnn_params.py`.
The default configuration is as follows:
- Feature extractor:
- Images resized with aspect ratio maintained and smaller side length between [832,1344]
- Ground Truth mask size 112
- Backbone network weights are frozen after second epoch
- RPN:
- Anchor stride set to 16
- Anchor sizes set to (32, 64, 128, 256, 512)
- Foreground IOU Threshold set to 0.7, Background IOU Threshold set to 0.3
- RPN target fraction of positive proposals set to 0.5
- Train Pre-NMS Top proposals set to 2000 per FPN layer
- Train Post-NMS Top proposals set to 1000
- Test Pre-NMS Top proposals set to 1000 per FPN layer
- Test Post-NMS Top proposals set to 1000
- RPN NMS Threshold set to 0.7
- RoI heads:
- Foreground threshold set to 0.5
- Batch size per image set to 512
- Positive fraction of batch set to 0.25
The default hyper-parameters can be found at `mask-rcnn/hyperparameters/cmdline_utils.py`.
These hyperparameters can be overridden through the command-line options in the launch scripts.
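To make the shape of such a configuration concrete, the fragment below is purely illustrative: the key names are hypothetical and do not reflect the actual schema of `mask_rcnn_params.py`; only the values mirror the defaults listed above.
```python
# Illustrative only: hypothetical key names, values taken from the defaults above.
default_config = {
    "anchor_stride": 16,
    "anchor_sizes": (32, 64, 128, 256, 512),
    "rpn_fg_iou_threshold": 0.7,
    "rpn_bg_iou_threshold": 0.3,
    "rpn_nms_threshold": 0.7,
    "roi_fg_threshold": 0.5,
    "roi_batch_size_per_image": 512,
    "roi_positive_fraction": 0.25,
    "gt_mask_size": 112,
}

# A command-line override would replace individual entries, for example:
overrides = {"roi_batch_size_per_image": 256}
config = {**default_config, **overrides}
```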
### Feature support matrix
The following features are supported by this model:
| **Feature** | **Mask R-CNN** |
|-------------|----------------|
| Automatic mixed precision (AMP) | Yes |
| Horovod Multi-GPU (NCCL) | Yes |
| Accelerated Linear Algebra (XLA) | Yes |
#### Features
**Automatic Mixed Precision (AMP)**
This implementation of Mask R-CNN uses AMP to implement mixed precision training. It allows us to use FP16 training with FP32 master weights by modifying just a few lines of code.
**Horovod**
Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod).
**Multi-GPU training with Horovod**
Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage).
**XLA support (experimental)**
XLA is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. The results are improvements in speed and memory usage: most internal benchmarks run ~1.1-1.5x faster after XLA is enabled.
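As a brief illustration (not taken from this repository, which exposes XLA through its own `--xla` flag), the sketch below shows how XLA JIT compilation can be requested globally in TensorFlow; the exact API surface depends on the TensorFlow version.
```python
# Hedged sketch: enable XLA JIT compilation globally. Version-dependent; this
# repository's scripts normally control XLA via the --xla flag instead.
import tensorflow as tf

# TF 2.x (and late TF 1.x releases): turn on the JIT compiler for supported ops.
tf.config.optimizer.set_jit(True)

# TF 1.x graph/session style: request auto-clustering through the session config.
config = tf.compat.v1.ConfigProto()
config.graph_options.optimizer_options.global_jit_level = (
    tf.compat.v1.OptimizerOptions.ON_1)
session = tf.compat.v1.Session(config=config)
```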
### Mixed precision training
Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in the Volta and Turing architecture, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using [mixed precision training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) previously required two steps:
1. Porting the model to use the FP16 data type where appropriate.
2. Adding loss scaling to preserve small gradient values.
This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta and Turing GPUs automatically. The TensorFlow framework code makes all necessary model changes internally.
In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling.
For information about:
- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation.
- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog.
- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide.
#### Enabling mixed precision
Mixed precision is enabled in TensorFlow by using the Automatic Mixed Precision (TF-AMP) extension which casts variables to half-precision upon retrieval, while storing variables in single-precision format. Furthermore, to preserve small gradient magnitudes in backpropagation, a [loss scaling](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#lossscaling) step must be included when applying gradients. In TensorFlow, loss scaling can be applied statically by using simple multiplication of loss by a constant value or automatically, by TF-AMP. Automatic mixed precision makes all the adjustments internally in TensorFlow, providing two benefits over manual operations. First, programmers need not modify network model code, reducing development and maintenance effort. Second, using AMP maintains forward and backward compatibility with all the APIs for defining and running TensorFlow models.
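As a hedged sketch, assuming TensorFlow 1.15 (as in the tf1 NGC container) and placeholder optimizer hyperparameters, the snippet below shows the two generic ways TF-AMP is typically switched on; in this repository the launch scripts expose the same thing through the `--amp` flag.
```python
# Hedged sketch (TF 1.15): two generic ways to enable TF-AMP; placeholder values.
import os
import tensorflow as tf

# Option 1: set the environment variable before any graph is built.
os.environ["TF_ENABLE_AUTO_MIXED_PRECISION"] = "1"

# Option 2: wrap the optimizer explicitly; automatic loss scaling is applied.
optimizer = tf.train.MomentumOptimizer(learning_rate=0.01, momentum=0.9)
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
    optimizer, loss_scale="dynamic")
```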
#### Enabling TF32
TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs.
TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations.
For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post.
TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default.
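For completeness, the snippet below is a small, hedged sketch of how TF32 can be inspected or turned off programmatically; the API assumes a recent TensorFlow 2.x release (2.4 or later) and is not something the scripts in this repository call.
```python
# Hedged sketch (TF 2.4+ API): TF32 is on by default on Ampere GPUs; disable it
# to force full-FP32 math, e.g. when debugging numerical differences.
import tensorflow as tf

tf.config.experimental.enable_tensor_float_32_execution(False)
print(tf.config.experimental.tensor_float_32_execution_enabled())  # -> False
```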
## Setup
The following section lists the requirements that you need to meet in order to start training the Mask R-CNN model.
### Requirements
This repository contains a Dockerfile that extends the TensorFlow NGC container and encapsulates some dependencies.
Aside from these dependencies, ensure you have the following components:
- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker)
- TensorFlow 20.06-tf1-py3 [NGC container](https://ngc.nvidia.com/registry/nvidia-tensorflow)
- GPU-based architecture:
- [NVIDIA Volta](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/)
- [NVIDIA Turing](https://www.nvidia.com/en-us/geforce/turing/)
- [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/)
For more information about how to get started with NGC containers, see the following sections from the
NVIDIA GPU Cloud Documentation and the Deep Learning Documentation:
- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html)
- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html#accessing_registry)
- Running [TensorFlow](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running)
For those unable to use the TensorFlow NGC container, to set up the required environment or create your own
container, see the versioned
[NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html).
## Quick Start Guide
To train your model using mixed precision with Tensor Cores or using 32-bit, perform the following steps using
the default parameters of the Mask R-CNN model on the COCO 2017 dataset.
1. Clone the repository.
```bash
git clone https://github.com/NVIDIA/DeepLearningExamples.git
cd DeepLearningExamples/TensorFlow/Segmentation/MaskRCNN
```
2. Build the Mask R-CNN TensorFlow NGC container.
**For TensorFlow 1.1x:** `bash ./scripts/docker/build_tf1.sh`
**For TensorFlow 2.x:** `bash ./scripts/docker/build_tf2.sh`
3. Start an interactive session in the NGC container to run training/inference.
Run the following command to launch the Docker container; the only argument is the *absolute path* to the
`data directory`, which holds or will hold the `tfrecords` data. If the data has not already been downloaded to the `data directory`, download it in step 4; otherwise, step 4 can be skipped.
**For TensorFlow 1.1x:** `bash ./scripts/docker/launch_tf1.sh [data directory]`
**For TensorFlow 2.x:** `bash ./scripts/docker/launch_tf2.sh [data directory]`
4. Download and preprocess the dataset.
This repository provides scripts to download and extract the [COCO 2017 dataset](http://cocodataset.org/#download).
If you already have the data, you do not need to run the following script; proceed to downloading the pre-trained weights.
Data will be downloaded to the `data directory` provided in step 3.
```bash
cd dataset
bash download_and_preprocess_coco.sh /data
```
By default, the data is organized into the following structure:
```bash
<data/dir>
annotations/
instances_train2017.json
instances_val2017.json
train2017/
COCO_train2017_*.jpg
val2017/
COCO_val2017_*.jpg
```
This repository also provides scripts to download the pre-trained weights of the ResNet-50 backbone.
The script will create a new directory named `weights` in the current directory and
download the pre-trained weights into it.
```bash
./download_and_process_pretrained_weights.sh
```
Ensure that the `weights` folder created has a `resnet` folder in it. Inside the `resnet` folder there
should be 3 folders for checkpoints and weights: `extracted_from_maskrcnn`, `resnet-nhwc-2018-02-07` and
`resnet-nhwc-2018-10-14`. Before moving to the next step, ensure the above folders are not empty.
5. Start training.
To run training for a default configuration (on 1/4/8 GPUs, AMP/32-bit), run one of the scripts in the
`./scripts` directory called `./scripts/train{_AMP}_{1,4,8}GPU.sh`. For example:
`bash ./scripts/train_AMP_8GPU.sh`
The above script trains a model and performs an evaluation on the COCO 2017 dataset. By default, this training script:
- Uses 8 GPUs.
- Saves a checkpoint every 3696 iterations and at the end of training. All checkpoints, evaluation results and training logs are saved to the `/results` directory (in the container, which can be mounted to a local directory).
- Uses mixed precision training with Tensor Cores.
6. Start validation/evaluation.
- For evaluation with AMP precision: `bash ./scripts/evaluation_AMP.sh`
- For evaluation with 32-bit precision: `bash ./scripts/evaluation.sh`
## Advanced
The following sections provide greater details of the dataset, running training and inference, and the training results.
### Scripts and sample code
Descriptions of the key scripts and folders are provided below.
- `mask_rcnn` - Contains code to build the individual components of the model, such as the backbone, FPN, RPN, and the mask and bbox heads.
- `download_and_process_pretrained_weights.sh` - Can be used to download backbone pre-trained weights.
- `scripts/` - A folder that contains shell scripts to train the model and perform inferences.
- `train{_AMP}_{1,4,8}GPU.sh` - Training script on 1, 4, 8 GPUs with AMP or 32-bit precision.
- `evaluation_{AMP}.sh` - Evaluation script with either AMP or 32-bit precision.
- `benchmark_training.py` - Script for running train performance benchmarks.
- `benchmark_inference.py` - Script for running inference performance benchmarks.
- `dataset/` - A folder that contains shell scripts and Python files to download the dataset.
- `mask_rcnn_main.py` - The main script and entry point for the training and evaluation process.
- `docker/` - A folder that contains scripts to build a Docker image and start an interactive session.
### Parameters
#### `mask_rcnn_main.py` script parameters
You can modify the training behavior through the various flags in the `mask_rcnn_main.py` script and by overriding specific parameters in the config files. The flags in the `mask_rcnn_main.py` script are as follows (an illustrative invocation follows the list):
- `--mode` - Specifies the action to take like `train`, `train_and_eval` or `eval`.
- `--checkpoint` - The checkpoint of the backbone.
- `--eval_samples` - Number of samples to evaluate.
- `--init_learning_rate` - Initial learning rate.
- `--learning_rate_steps` - Specifies at which steps to reduce the learning rate.
- `--num_steps_per_eval` - Specifies after how many steps of training evaluation should be performed.
- `--total_steps` - Specifies the total number of steps for which training should be run.
- `--train_batch_size` - Training batch size per GPU.
- `--eval_batch_size` - Evaluation batch size per GPU.
- `--amp` - Whether to use AMP (mixed) precision or 32-bit precision.
- `--xla` - Whether to enable TensorFlow XLA (Accelerated Linear Algebra).
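Putting these flags together, a training invocation might look like the following sketch (launched here through Python's `subprocess` purely for illustration; every value is a placeholder rather than a tuned or recommended setting).
```python
# Illustrative launch of mask_rcnn_main.py with the flags documented above.
# All values below are placeholders, not recommended settings.
import subprocess

subprocess.run(
    [
        "python", "mask_rcnn_main.py",
        "--mode=train_and_eval",
        "--checkpoint=weights/resnet/resnet-nhwc-2018-02-07/model.ckpt-112603",
        "--train_batch_size=4",
        "--eval_batch_size=8",
        "--init_learning_rate=0.01",
        "--total_steps=90000",
        "--num_steps_per_eval=10000",
        "--amp",
        "--xla",
    ],
    check=True,
)
```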
### Command-line options
To see the full list of available options and their descriptions, use the `-h` or `--help` command-line option, for example:
`python mask_rcnn_main.py --helpfull`
### Getting the data
The Mask R-CNN model was trained on the COCO 2017 dataset. This dataset comes with a training and validation set.
This repository contains the `./dataset/download_and_preprocess_coco.sh` script which automatically downloads and preprocesses the training and validation sets. The helper scripts are also present in the `dataset/` folder.
#### Dataset guidelines
The data should be organized into the following structure:
```bash
<data/dir>
annotations/
instances_train2017.json
instances_val2017.json
train2017/
COCO_train2017_*.jpg
val2017/
COCO_val2017_*.jpg
```
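After preprocessing converts this layout to TFRecords, the records can be read back with standard `tf.data` tooling. The snippet below is a minimal sketch that parses a subset of the feature keys written by `dataset/create_coco_tf_record.py`; the shard filename pattern is an assumption based on that script's output naming, and eager execution (TensorFlow 2.x) is assumed.
```python
# Minimal sketch: parse a few fields from the generated COCO TFRecords.
# Feature keys follow dataset/create_coco_tf_record.py; the glob pattern assumes
# shards named like train-00000-of-00256.tfrecord under <data/dir>.
import tensorflow as tf

features = {
    "image/encoded": tf.io.FixedLenFeature([], tf.string),
    "image/height": tf.io.FixedLenFeature([], tf.int64),
    "image/width": tf.io.FixedLenFeature([], tf.int64),
    "image/object/bbox/xmin": tf.io.VarLenFeature(tf.float32),
    "image/object/bbox/ymin": tf.io.VarLenFeature(tf.float32),
    "image/object/bbox/xmax": tf.io.VarLenFeature(tf.float32),
    "image/object/bbox/ymax": tf.io.VarLenFeature(tf.float32),
    "image/object/class/label": tf.io.VarLenFeature(tf.int64),
}

files = tf.io.gfile.glob("<data/dir>/train-*.tfrecord")
for raw in tf.data.TFRecordDataset(files).take(1):
    example = tf.io.parse_single_example(raw, features)
    image = tf.io.decode_jpeg(example["image/encoded"])
    xmin = tf.sparse.to_dense(example["image/object/bbox/xmin"])
    print(image.shape, example["image/height"].numpy(), xmin.numpy())
```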
### Training process
Training is performed using the `mask_rcnn_main.py` script along with parameters defined in the config files.
The default config files can be found in
`mask_rcnn_tf/mask_rcnn/mask_rcnn_params.py` and `mask_rcnn_tf/mask_rcnn/cmd_utils.py`. To specify which GPUs to train on, the `CUDA_VISIBLE_DEVICES` variable can be changed in the training scripts
provided in the `scripts` folder.
This script outputs results to the `/results` directory by default. The training log will contain information about:
- Loss, time per iteration, learning rate and memory metrics
- Performance values such as throughput per step
- Test accuracy and test performance values after evaluation
### Inference process
To run inference, run `mask_rcnn_main.py` with the command-line parameter
`--mode=eval`. To run inference with a checkpoint, set the command-line
parameter `--model_dir` to `[absolute path of checkpoint folder]`.
The inference log will contain information about:
- Inference time per step
- Inference throughput per step
- Evaluation accuracy and performance values
## Performance
### Benchmarking
The following section shows how to run benchmarks measuring the model performance in training and inference modes.
#### Training performance benchmark
To run training benchmarking on a selected number of GPUs with either AMP or 32-bit precision, run the following script:
```bash
python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]
```
#### Inference performance benchmark
To run inference benchmarking on a single GPU with either AMP or 32-bit precision, run the following script:
```bash
python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]
```
### Results
The following sections provide details on how we achieved our performance and accuracy in training and inference.
#### Training accuracy results Tensorflow 1.1x
##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by building and launching the TensorFlow 1.1x Docker container (`bash ./scripts/docker/build_tf1.sh` and `bash ./scripts/docker/launch_tf1.sh [data directory]`, respectively) and running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script on NVIDIA DGX A100 (8x A100 40GB) GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | TF32 | 0.3777 | 0.3435 | 5 h | - |
| 8 | 4 | AMP | 0.3782 | 0.3432 | 4 h | 1.25 |
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by building and launching the TensorFlow 1.1x Docker container (`bash ./scripts/docker/build_tf1.sh` and `bash ./scripts/docker/launch_tf1.sh [data directory]`, respectively) and running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script on NVIDIA DGX-1 with 8x V100 16GB GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | FP32 | 0.3767 | 0.3420 | 14 h | - |
| 8 | 4 | AMP | 0.3770 | 0.3423 | 9 h | 1.50 |
**Learning curves**
The following image shows the training loss as a function of iteration for training using DGX A100 (TF32 and TF-AMP) and DGX-1 V100 (FP32 and TF-AMP).
![LearningCurvesTF1](images/MaskRCNN_TF1_conv.png)
#### Training performance results Tensorflow 1.1x
##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - TF32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|---|---|-------|--------|------|------|------|
| 1 | 2 | 11.38 | 18.51 | 1.63 | - | - |
| 1 | 4 | 12.49 | 21.20 | 1.70 | - | - |
| 4 | 2 | 43.95 | 65.74 | 1.50 | 3.86 | 3.55 |
| 4 | 4 | 48.26 | 72.96 | 1.51 | 3.86 | 3.44 |
| 8 | 2 | 81.69 | 114.59 | 1.40 | 7.18 | 6.19 |
| 8 | 4 | 89.02 | 132.31 | 1.49 | 7.13 | 6.24 |
##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (8x V100 16GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - FP32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|---|---|-------|-------|------|------|------|
| 1 | 2 | 6.37 | 12.19 | 1.91 | - | - |
| 1 | 4 | 6.79 | 12.79 | 1.88 | - | - |
| 4 | 2 | 23.32 | 30.82 | 1.32 | 3.66 | 2.53 |
| 4 | 4 | 22.96 | 36.45 | 1.59 | 3.38 | 2.85 |
| 8 | 2 | 40.18 | 58.41 | 1.45 | 6.31 | 4.79 |
| 8 | 4 | 42.65 | 62.80 | 1.47 | 6.28 | 4.91 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Training accuracy results Tensorflow 2.x
##### Training accuracy: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script in the
TensorFlow 20.06-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | TF32 | 0.3783 | 0.3400 | 5 h | - |
| 8 | 4 | AMP | 0.3796 | 0.3415 | 4 h | 1.25 |
##### Training accuracy: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `scripts/train{_AMP}_{1,4,8}GPU.sh` training script in the
TensorFlow 20.06-py3 NGC container on NVIDIA DGX-1 V100 (8x V100 16GB) GPUs.
| GPUs | Batch size / GPU | Precision | Final AP BBox | Final AP Segm | Time to train | Time to train speedup |
|------|------------------|-----------|---------------|---------------|----------------|-----------------------|
| 8 | 4 | FP32 | 0.3784 | 0.3400 | 14 h | - |
| 8 | 4 | AMP | 0.3786 | 0.3410 | 9 h | 1.50 |
**Learning curves**
The following image shows the training loss as a function of iteration for training using DGX A100 (TF32 and TF-AMP) and DGX-1 V100 (FP32 and TF-AMP).
![LearningCurvesTF2](images/MaskRCNN_TF2_conv.png)
#### Training performance results Tensorflow 2.x
##### Training performance: NVIDIA DGX A100 (8x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - TF32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision |
|---|---|-------------|-------------|------|------|------|
| 1 | 2 | 11.84 | 18.51 | 1.56 | - | - |
| 1 | 4 | 12.68 | 19.94 | 1.57 | - | - |
| 4 | 2 | 44.51 | 58.11 | 1.31 | 3.76 | 3.14 |
| 4 | 4 | 47.39 | 64.67 | 1.36 | 3.74 | 3.24 |
| 8 | 2 | 80.21 | 110.97 | 1.38 | 6.78 | 5.99 |
| 8 | 4 | 89.93 | 150.02 | 1.67 | 7.09 | 7.52 |
##### Training performance: NVIDIA DGX-1 (8x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_training.py --gpus {1,4,8} --batch_size {2,4} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (8x V100 16GB) GPUs. Performance numbers (in images per second) were averaged over 200 steps, omitting the first 100 warm-up steps.
| GPUs | Batch size / GPU | Throughput - FP32 [img/s] | Throughput - mixed precision [img/s] | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision |
|---|---|-------|-------|------|------|------|
| 1 | 2 | 5.70 | 11.63 | 2.04 | - | - |
| 1 | 4 | 6.20 | 12.63 | 2.04 | - | - |
| 4 | 2 | 21.22 | 25.18 | 1.19 | 3.72 | 2.16 |
| 4 | 4 | 21.79 | 30.63 | 1.41 | 3.51 | 2.42 |
| 8 | 2 | 38.64 | 52.13 | 1.35 | 6.78 | 4.48 |
| 8 | 4 | 40.76 | 59.62 | 1.46 | 6.57 | 4.72 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results TensorFlow 1.1x
##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX A100 (1x A100 40GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 28.37 |
| 4 | 31.35 |
| 8 | 33.79 |
TF32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 21.81 |
| 4 | 23.77 |
| 8 | 24.59 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 1.1x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (1x V100 16GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 23.52 |
| 4 | 24.64 |
| 8 | 26.83 |
FP32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 14.85 |
| 4 | 15.45 |
| 8 | 16.00 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
#### Inference performance results TensorFlow 2.x
##### Inference performance: NVIDIA DGX A100 (1x A100 40GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX A100 (1x A100 40GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 26.28 |
| 4 | 36.23 |
| 8 | 40.84 |
TF32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 20.20 |
| 4 | 24.94 |
| 8 | 31.38 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
##### Inference performance: NVIDIA DGX-1 (1x V100 16GB)
Our results were obtained by running the `python scripts/benchmark_inference.py --batch_size {2,4,8} [--amp]` benchmark script in the TensorFlow 2.x 20.06-py3
NGC container on NVIDIA DGX-1 V100 (1x V100 16GB) GPU.
FP16
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 23.63 |
| 4 | 27.64 |
| 8 | 33.60 |
FP32
| Batch size | Throughput Avg [img/s] |
|:----------:|:----------------------:|
| 2 | 15.45 |
| 4 | 16.71 |
| 8 | 18.78 |
To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide).
## Release notes
### Changelog
June 2020
- Updated accuracy tables with A100 results
- Updated training and inference performance tables with A100 results
March 2020
- Initial release
### Known issues
There are no known issues with this model.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""Convert raw COCO dataset to TFRecord for object_detection.
Example usage:
python create_coco_tf_record.py --logtostderr \
--train_image_dir="${TRAIN_IMAGE_DIR}" \
--val_image_dir="${VAL_IMAGE_DIR}" \
--test_image_dir="${TEST_IMAGE_DIR}" \
--train_annotations_file="${TRAIN_ANNOTATIONS_FILE}" \
--val_annotations_file="${VAL_ANNOTATIONS_FILE}" \
--testdev_annotations_file="${TESTDEV_ANNOTATIONS_FILE}" \
--output_dir="${OUTPUT_DIR}"
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import hashlib
import io
import json
import multiprocessing
import os
from absl import app
from absl import flags
import numpy as np
import PIL.Image
from pycocotools import mask
from research.object_detection.utils import dataset_util
from research.object_detection.utils import label_map_util
import tensorflow as tf
flags.DEFINE_boolean('include_masks', False,
'Whether to include instance segmentations masks '
'(PNG encoded) in the result. default: False.')
flags.DEFINE_string('train_image_dir', '', 'Training image directory.')
flags.DEFINE_string('val_image_dir', '', 'Validation image directory.')
flags.DEFINE_string('test_image_dir', '', 'Test image directory.')
flags.DEFINE_string('train_object_annotations_file', '', '')
flags.DEFINE_string('val_object_annotations_file', '', '')
flags.DEFINE_string('train_caption_annotations_file', '', '')
flags.DEFINE_string('val_caption_annotations_file', '', '')
flags.DEFINE_string('testdev_annotations_file', '',
'Test-dev annotations JSON file.')
flags.DEFINE_string('output_dir', '/tmp/', 'Output data directory.')
FLAGS = flags.FLAGS
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
def create_tf_example(image,
bbox_annotations,
caption_annotations,
image_dir,
category_index,
include_masks=False):
"""Converts image and annotations to a tf.Example proto.
Args:
image: dict with keys:
[u'license', u'file_name', u'coco_url', u'height', u'width',
u'date_captured', u'flickr_url', u'id']
bbox_annotations:
list of dicts with keys:
[u'segmentation', u'area', u'iscrowd', u'image_id',
u'bbox', u'category_id', u'id']
Notice that bounding box coordinates in the official COCO dataset are
given as [x, y, width, height] tuples using absolute coordinates where
x, y represent the top-left (0-indexed) corner. This function converts
to the format expected by the Tensorflow Object Detection API (which is
[ymin, xmin, ymax, xmax] with coordinates normalized relative
to image size).
image_dir: directory containing the image files.
category_index: a dict containing COCO category information keyed
by the 'id' field of each category. See the
label_map_util.create_category_index function.
include_masks: Whether to include instance segmentations masks
(PNG encoded) in the result. default: False.
Returns:
example: The converted tf.Example
num_annotations_skipped: Number of (invalid) annotations that were ignored.
Raises:
ValueError: if the image pointed to by data['filename'] is not a valid JPEG
"""
image_height = image['height']
image_width = image['width']
filename = image['file_name']
image_id = image['id']
full_path = os.path.join(image_dir, filename)
with tf.io.gfile.GFile(full_path, 'rb') as fid:
encoded_jpg = fid.read()
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = PIL.Image.open(encoded_jpg_io)
key = hashlib.sha256(encoded_jpg).hexdigest()
xmin = []
xmax = []
ymin = []
ymax = []
is_crowd = []
category_names = []
category_ids = []
area = []
encoded_mask_png = []
num_annotations_skipped = 0
for object_annotations in bbox_annotations:
(x, y, width, height) = tuple(object_annotations['bbox'])
if width <= 0 or height <= 0:
num_annotations_skipped += 1
continue
if x + width > image_width or y + height > image_height:
num_annotations_skipped += 1
continue
xmin.append(float(x) / image_width)
xmax.append(float(x + width) / image_width)
ymin.append(float(y) / image_height)
ymax.append(float(y + height) / image_height)
is_crowd.append(object_annotations['iscrowd'])
category_id = int(object_annotations['category_id'])
category_ids.append(category_id)
category_names.append(category_index[category_id]['name'].encode('utf8'))
area.append(object_annotations['area'])
if include_masks:
run_len_encoding = mask.frPyObjects(object_annotations['segmentation'],
image_height, image_width)
binary_mask = mask.decode(run_len_encoding)
if not object_annotations['iscrowd']:
binary_mask = np.amax(binary_mask, axis=2)
pil_image = PIL.Image.fromarray(binary_mask)
output_io = io.BytesIO()
pil_image.save(output_io, format='PNG')
encoded_mask_png.append(output_io.getvalue())
captions = []
for caption_annotation in caption_annotations:
captions.append(caption_annotation['caption'].encode('utf8'))
feature_dict = {
'image/height':
dataset_util.int64_feature(image_height),
'image/width':
dataset_util.int64_feature(image_width),
'image/filename':
dataset_util.bytes_feature(filename.encode('utf8')),
'image/source_id':
dataset_util.bytes_feature(str(image_id).encode('utf8')),
'image/key/sha256':
dataset_util.bytes_feature(key.encode('utf8')),
'image/encoded':
dataset_util.bytes_feature(encoded_jpg),
'image/caption':
dataset_util.bytes_list_feature(captions),
'image/format':
dataset_util.bytes_feature('jpeg'.encode('utf8')),
'image/object/bbox/xmin':
dataset_util.float_list_feature(xmin),
'image/object/bbox/xmax':
dataset_util.float_list_feature(xmax),
'image/object/bbox/ymin':
dataset_util.float_list_feature(ymin),
'image/object/bbox/ymax':
dataset_util.float_list_feature(ymax),
'image/object/class/text':
dataset_util.bytes_list_feature(category_names),
'image/object/class/label':
dataset_util.int64_list_feature(category_ids),
'image/object/is_crowd':
dataset_util.int64_list_feature(is_crowd),
'image/object/area':
dataset_util.float_list_feature(area),
}
if include_masks:
feature_dict['image/object/mask'] = (
dataset_util.bytes_list_feature(encoded_mask_png))
example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
return key, example, num_annotations_skipped
def _pool_create_tf_example(args):
return create_tf_example(*args)
def _load_object_annotations(object_annotations_file):
with tf.io.gfile.GFile(object_annotations_file, 'r') as fid:
obj_annotations = json.load(fid)
images = obj_annotations['images']
category_index = label_map_util.create_category_index(
obj_annotations['categories'])
img_to_obj_annotation = collections.defaultdict(list)
tf.compat.v1.logging.info('Building bounding box index.')
for annotation in obj_annotations['annotations']:
image_id = annotation['image_id']
img_to_obj_annotation[image_id].append(annotation)
missing_annotation_count = 0
for image in images:
image_id = image['id']
if image_id not in img_to_obj_annotation:
missing_annotation_count += 1
tf.compat.v1.logging.info('%d images are missing bboxes.', missing_annotation_count)
return images, img_to_obj_annotation, category_index
def _load_caption_annotations(caption_annotations_file):
with tf.io.gfile.GFile(caption_annotations_file, 'r') as fid:
caption_annotations = json.load(fid)
img_to_caption_annotation = collections.defaultdict(list)
tf.compat.v1.logging.info('Building caption index.')
for annotation in caption_annotations['annotations']:
image_id = annotation['image_id']
img_to_caption_annotation[image_id].append(annotation)
missing_annotation_count = 0
images = caption_annotations['images']
for image in images:
image_id = image['id']
if image_id not in img_to_caption_annotation:
missing_annotation_count += 1
tf.compat.v1.logging.info('%d images are missing captions.', missing_annotation_count)
return img_to_caption_annotation
def _create_tf_record_from_coco_annotations(
object_annotations_file,
caption_annotations_file,
image_dir, output_path, include_masks, num_shards):
"""Loads COCO annotation json files and converts to tf.Record format.
Args:
object_annotations_file: JSON file containing bounding box annotations.
caption_annotations_file: JSON file containing caption annotations.
image_dir: Directory containing the image files.
output_path: Path to output tf.Record file.
include_masks: Whether to include instance segmentation masks
(PNG encoded) in the result. Default: False.
num_shards: Number of output files to create.
"""
tf.compat.v1.logging.info('writing to output path: %s', output_path)
writers = [
tf.io.TFRecordWriter(output_path + '-%05d-of-%05d.tfrecord' %
(i, num_shards)) for i in range(num_shards)
]
images, img_to_obj_annotation, category_index = (
_load_object_annotations(object_annotations_file))
img_to_caption_annotation = (
_load_caption_annotations(caption_annotations_file))
pool = multiprocessing.Pool()
total_num_annotations_skipped = 0
for idx, (_, tf_example, num_annotations_skipped) in enumerate(
pool.imap(_pool_create_tf_example,
[(image,
img_to_obj_annotation[image['id']],
img_to_caption_annotation[image['id']],
image_dir,
category_index,
include_masks)
for image in images])):
if idx % 100 == 0:
tf.compat.v1.logging.info('On image %d of %d', idx, len(images))
total_num_annotations_skipped += num_annotations_skipped
writers[idx % num_shards].write(tf_example.SerializeToString())
pool.close()
pool.join()
for writer in writers:
writer.close()
tf.compat.v1.logging.info('Finished writing, skipped %d annotations.',
total_num_annotations_skipped)
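# Illustration only (hypothetical helper, not used by this script): the shards
# written above follow the pattern '<output_path>-%05d-of-%05d.tfrecord', so a
# reader could recover them with a glob along these lines.
def _read_output_shards_sketch(output_path):
    filenames = tf.io.gfile.glob(output_path + '-*-of-*.tfrecord')
    return tf.data.TFRecordDataset(filenames)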
def main(_):
assert FLAGS.train_image_dir, '`train_image_dir` missing.'
assert FLAGS.val_image_dir, '`val_image_dir` missing.'
assert FLAGS.test_image_dir, '`test_image_dir` missing.'
if not tf.io.gfile.isdir(FLAGS.output_dir):
tf.io.gfile.makedirs(FLAGS.output_dir)
train_output_path = os.path.join(FLAGS.output_dir, 'train')
val_output_path = os.path.join(FLAGS.output_dir, 'val')
testdev_output_path = os.path.join(FLAGS.output_dir, 'test-dev')
_create_tf_record_from_coco_annotations(
FLAGS.train_object_annotations_file,
FLAGS.train_caption_annotations_file,
FLAGS.train_image_dir,
train_output_path,
FLAGS.include_masks,
num_shards=256)
_create_tf_record_from_coco_annotations(
FLAGS.val_object_annotations_file,
FLAGS.val_caption_annotations_file,
FLAGS.val_image_dir,
val_output_path,
FLAGS.include_masks,
num_shards=32)
if __name__ == '__main__':
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
app.run(main)
#!/bin/bash
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Script to download and preprocess the COCO data set for detection.
#
# The outputs of this script are TFRecord files containing serialized
# tf.Example protocol buffers. See create_coco_tf_record.py for details of how
# the tf.Example protocol buffers are constructed and see
# http://cocodataset.org/#overview for an overview of the dataset.
#
# usage:
# bash download_and_preprocess_coco.sh /data-dir/coco
set -e
set -x
if [ -z "$1" ]; then
echo "usage download_and_preprocess_coco.sh [data dir]"
exit
fi
#sudo apt install -y protobuf-compiler python-pil python-lxml\
# python-pip python-dev git unzip
#pip install Cython git+https://github.com/cocodataset/cocoapi#subdirectory=PythonAPI
echo "Cloning Tensorflow models directory (for conversion utilities)"
if [ ! -e tf-models ]; then
git clone http://github.com/tensorflow/models tf-models
fi
(cd tf-models/research && protoc object_detection/protos/*.proto --python_out=.)
UNZIP="unzip -nq"
# Create the output directories.
OUTPUT_DIR="${1%/}"
SCRATCH_DIR="${OUTPUT_DIR}/raw-data"
mkdir -p "${OUTPUT_DIR}"
mkdir -p "${SCRATCH_DIR}"
CURRENT_DIR=$(pwd)
# Helper function to download and unpack a .zip file.
function download_and_unzip() {
local BASE_URL=${1}
local FILENAME=${2}
if [ ! -f ${FILENAME} ]; then
echo "Downloading ${FILENAME} to $(pwd)"
wget -nd -c "${BASE_URL}/${FILENAME}"
else
echo "Skipping download of ${FILENAME}"
fi
echo "Unzipping ${FILENAME}"
${UNZIP} ${FILENAME}
}
cd ${SCRATCH_DIR}
# Download the images.
BASE_IMAGE_URL="http://images.cocodataset.org/zips"
TRAIN_IMAGE_FILE="train2017.zip"
download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE}
TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2017"
VAL_IMAGE_FILE="val2017.zip"
download_and_unzip ${BASE_IMAGE_URL} ${VAL_IMAGE_FILE}
VAL_IMAGE_DIR="${SCRATCH_DIR}/val2017"
TEST_IMAGE_FILE="test2017.zip"
download_and_unzip ${BASE_IMAGE_URL} ${TEST_IMAGE_FILE}
TEST_IMAGE_DIR="${SCRATCH_DIR}/test2017"
# Download the annotations.
BASE_INSTANCES_URL="http://images.cocodataset.org/annotations"
INSTANCES_FILE="annotations_trainval2017.zip"
download_and_unzip ${BASE_INSTANCES_URL} ${INSTANCES_FILE}
TRAIN_OBJ_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/instances_train2017.json"
VAL_OBJ_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/instances_val2017.json"
TRAIN_CAPTION_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/captions_train2017.json"
VAL_CAPTION_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/captions_val2017.json"
# Download the test image info.
BASE_IMAGE_INFO_URL="http://images.cocodataset.org/annotations"
IMAGE_INFO_FILE="image_info_test2017.zip"
download_and_unzip ${BASE_IMAGE_INFO_URL} ${IMAGE_INFO_FILE}
TESTDEV_ANNOTATIONS_FILE="${SCRATCH_DIR}/annotations/image_info_test-dev2017.json"
# Build TFRecords of the image data.
cd "${CURRENT_DIR}"
# Setup packages
touch tf-models/__init__.py
touch tf-models/research/__init__.py
# Run our conversion
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
PYTHONPATH="tf-models:tf-models/research" python $SCRIPT_DIR/create_coco_tf_record.py \
--logtostderr \
--include_masks \
--train_image_dir="${TRAIN_IMAGE_DIR}" \
--val_image_dir="${VAL_IMAGE_DIR}" \
--test_image_dir="${TEST_IMAGE_DIR}" \
--train_object_annotations_file="${TRAIN_OBJ_ANNOTATIONS_FILE}" \
--val_object_annotations_file="${VAL_OBJ_ANNOTATIONS_FILE}" \
--train_caption_annotations_file="${TRAIN_CAPTION_ANNOTATIONS_FILE}" \
--val_caption_annotations_file="${VAL_CAPTION_ANNOTATIONS_FILE}" \
--testdev_annotations_file="${TESTDEV_ANNOTATIONS_FILE}" \
--output_dir="${OUTPUT_DIR}"
mv ${SCRATCH_DIR}/annotations/ ${OUTPUT_DIR}
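# Sketch of the expected layout under ${OUTPUT_DIR} after a successful run
# (shard names follow create_coco_tf_record.py with 256 train / 32 val shards):
#   train-00000-of-00256.tfrecord ... train-00255-of-00256.tfrecord
#   val-00000-of-00032.tfrecord   ... val-00031-of-00032.tfrecord
#   annotations/   (moved from raw-data by the mv above)
#   raw-data/      (downloaded zips and extracted images)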
#!/usr/bin/env bash
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
mkdir -p /model
cd /model
# DOWNLOAD CHECKPOINTS
## Mask RCNN
## ====================== Mask RCNN ====================== ##
BASE_URL="https://storage.googleapis.com/cloud-tpu-checkpoints/mask-rcnn/1555659850"
DEST_DIR="mask-rcnn/1555659850"
wget -N ${BASE_URL}/saved_model.pb -P ${DEST_DIR}
wget -N ${BASE_URL}/variables/variables.data-00000-of-00001 -P ${DEST_DIR}/variables
wget -N ${BASE_URL}/variables/variables.index -P ${DEST_DIR}/variables
## ====================== resnet-nhwc-2018-02-07 ====================== ##
BASE_URL="https://storage.googleapis.com/cloud-tpu-checkpoints/retinanet/resnet50-checkpoint-2018-02-07"
DEST_DIR="resnet/resnet-nhwc-2018-02-07"
wget -N ${BASE_URL}/checkpoint -P ${DEST_DIR}
wget -N ${BASE_URL}/model.ckpt-112603.data-00000-of-00001 -P ${DEST_DIR}
wget -N ${BASE_URL}/model.ckpt-112603.index -P ${DEST_DIR}
wget -N ${BASE_URL}/model.ckpt-112603.meta -P ${DEST_DIR}
## ====================== resnet-nhwc-2018-10-14 ====================== ##
#BASE_URL="https://storage.googleapis.com/cloud-tpu-artifacts/resnet/resnet-nhwc-2018-10-14"
#DEST_DIR="resnet/resnet-nhwc-2018-10-14"
#
#wget -N ${BASE_URL}/model.ckpt-112602.data-00000-of-00001 -P ${DEST_DIR}
#wget -N ${BASE_URL}/model.ckpt-112602.index -P ${DEST_DIR}
#wget -N ${BASE_URL}/model.ckpt-112602.meta -P ${DEST_DIR}
# VERIFY CHECKPOINTS
echo "Verifying and Processing Checkpoints..."
python pb_to_ckpt.py \
--frozen_model_filename=mask-rcnn/1555659850/ \
--output_filename=mask-rcnn/1555659850/ckpt/model.ckpt
python extract_RN50_weights.py \
--checkpoint_dir=mask-rcnn/1555659850/ckpt/model.ckpt \
--save_to=resnet/extracted_from_maskrcnn
echo "Generating list of tensors and their shape..."
python inspect_checkpoint.py --file_name=mask-rcnn/1555659850/ckpt/model.ckpt \
> mask-rcnn/1555659850/tensors_and_shape.txt
python inspect_checkpoint.py --file_name=resnet/resnet-nhwc-2018-02-07/model.ckpt-112603 \
> resnet/resnet-nhwc-2018-02-07/tensors_and_shape.txt
#python inspect_checkpoint.py --file_name=resnet/resnet-nhwc-2018-10-14/model.ckpt-112602 \
# > resnet/resnet-nhwc-2018-10-14/tensors_and_shape.txt
python inspect_checkpoint.py --file_name=resnet/extracted_from_maskrcnn/resnet50.ckpt \
> resnet/extracted_from_maskrcnn/tensors_and_shape.txt
echo "Script Finished with Success"
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Mask-RCNN anchor definition."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
import numpy as np
import tensorflow as tf
from mask_rcnn.object_detection import argmax_matcher
from mask_rcnn.object_detection import balanced_positive_negative_sampler
from mask_rcnn.object_detection import box_list
from mask_rcnn.object_detection import faster_rcnn_box_coder
from mask_rcnn.object_detection import region_similarity_calculator
from mask_rcnn.object_detection import target_assigner
def _generate_anchor_configs(min_level, max_level, num_scales, aspect_ratios):
"""Generates mapping from output level to a list of anchor configurations.
A configuration is a tuple of (stride, octave_scale, aspect_ratio).
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added
on each level. For instance, num_scales=2 adds two additional
anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: list of tuples representing the aspect ratio anchors added
on each level. For instance, aspect_ratios =
[(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
Returns:
anchor_configs: a dictionary with keys as the levels of anchors and
values as a list of anchor configuration.
"""
anchor_configs = {}
for level in range(min_level, max_level + 1):
anchor_configs[level] = []
for scale_octave in range(num_scales):
for aspect in aspect_ratios:
anchor_configs[level].append(
(2**level, scale_octave / float(num_scales), aspect))
return anchor_configs
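# Worked example (values follow directly from the loops above): with
# min_level=2, max_level=3, num_scales=2 and aspect_ratios=[(1, 1), (1.4, 0.7)],
# the returned dictionary is
#   {2: [(4, 0.0, (1, 1)), (4, 0.0, (1.4, 0.7)),
#        (4, 0.5, (1, 1)), (4, 0.5, (1.4, 0.7))],
#    3: [(8, 0.0, (1, 1)), (8, 0.0, (1.4, 0.7)),
#        (8, 0.5, (1, 1)), (8, 0.5, (1.4, 0.7))]}
# i.e. (stride 2^level, octave offset scale_octave/num_scales, aspect ratio).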
def _generate_anchor_boxes(image_size, anchor_scale, anchor_configs):
"""Generates multiscale anchor boxes.
Args:
image_size: a (height, width) pair giving the input image size. Each
dimension of image_size should be divisible by the largest feature
stride 2^max_level.
anchor_scale: float number representing the scale of size of the base
anchor to the feature stride 2^level.
anchor_configs: a dictionary with keys as the levels of anchors and
values as a list of anchor configuration.
Returns:
anchor_boxes: a numpy array with shape [N, 4], which stacks anchors on all
feature levels.
Raises:
ValueError: input size must be a multiple of the largest feature stride.
"""
boxes_all = []
for _, configs in anchor_configs.items():
boxes_level = []
for config in configs:
stride, octave_scale, aspect = config
if image_size[0] % stride != 0 or image_size[1] % stride != 0:
raise ValueError('input size must be divisible by the stride.')
base_anchor_size = anchor_scale * stride * 2**octave_scale
anchor_size_x_2 = base_anchor_size * aspect[0] / 2.0
anchor_size_y_2 = base_anchor_size * aspect[1] / 2.0
x = np.arange(stride / 2, image_size[1], stride)
y = np.arange(stride / 2, image_size[0], stride)
xv, yv = np.meshgrid(x, y)
xv = xv.reshape(-1)
yv = yv.reshape(-1)
boxes = np.vstack((yv - anchor_size_y_2, xv - anchor_size_x_2,
yv + anchor_size_y_2, xv + anchor_size_x_2))
boxes = np.swapaxes(boxes, 0, 1)
boxes_level.append(np.expand_dims(boxes, axis=1))
# concat anchors on the same level into shape [N, A, 4]
boxes_level = np.concatenate(boxes_level, axis=1)
boxes_all.append(boxes_level.reshape([-1, 4]))
anchor_boxes = np.vstack(boxes_all)
return anchor_boxes
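# Worked example (counts implied by the code above): for image_size=(1024, 1024),
# levels 2..6, num_scales=1 and 3 aspect ratios, each level contributes
# (1024 / 2^level)^2 * 3 anchors:
#   level 2: 256*256*3 = 196608    level 3: 128*128*3 = 49152
#   level 4:  64*64*3  =  12288    level 5:  32*32*3  =  3072
#   level 6:  16*16*3  =    768
# so anchor_boxes has shape [261888, 4] under these settings.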
class Anchors(object):
"""Mask-RCNN Anchors class."""
def __init__(self, min_level, max_level, num_scales, aspect_ratios, anchor_scale, image_size):
"""Constructs multiscale Mask-RCNN anchors.
Args:
min_level: integer number of minimum level of the output feature pyramid.
max_level: integer number of maximum level of the output feature pyramid.
num_scales: integer number representing intermediate scales added
on each level. For instance, num_scales=2 adds two additional
anchor scales [2^0, 2^0.5] on each level.
aspect_ratios: list of tuples representing the aspect ratio anchors added
on each level. For instance, aspect_ratios =
[(1, 1), (1.4, 0.7), (0.7, 1.4)] adds three anchors on each level.
anchor_scale: float number representing the scale of size of the base
anchor to the feature stride 2^level.
image_size: a (height, width) pair giving the input image size. Each
dimension of image_size should be divisible by the largest feature
stride 2^max_level.
"""
self.min_level = min_level
self.max_level = max_level
self.num_scales = num_scales
self.aspect_ratios = aspect_ratios
self.anchor_scale = anchor_scale
self.image_size = image_size
self.config = self._generate_configs()
self.boxes = self._generate_boxes()
def _generate_configs(self):
"""Generate configurations of anchor boxes."""
return _generate_anchor_configs(self.min_level, self.max_level,
self.num_scales, self.aspect_ratios)
def _generate_boxes(self):
"""Generates multiscale anchor boxes."""
boxes = _generate_anchor_boxes(self.image_size, self.anchor_scale,
self.config)
boxes = tf.convert_to_tensor(value=boxes, dtype=tf.float32)
return boxes
def get_anchors_per_location(self):
return self.num_scales * len(self.aspect_ratios)
def get_unpacked_boxes(self):
return self.unpack_labels(self.boxes)
def unpack_labels(self, labels):
"""Unpacks an array of labels into multiscales labels."""
labels_unpacked = OrderedDict()
count = 0
for level in range(self.min_level, self.max_level + 1):
feat_size0 = int(self.image_size[0] / 2**level)
feat_size1 = int(self.image_size[1] / 2**level)
steps = feat_size0 * feat_size1 * self.get_anchors_per_location()
indices = tf.range(count, count + steps)
count += steps
labels_unpacked[level] = tf.reshape(
tf.gather(labels, indices), [feat_size0, feat_size1, -1])
return labels_unpacked
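# Worked example (same settings as above: 1024x1024 input, levels 2..6,
# 3 anchors per location): level 2 takes the first 256*256*3 = 196608 entries
# and reshapes them to [256, 256, 3] for score targets (or [256, 256, 12] for
# box targets), level 3 takes the next 128*128*3 entries, and so on down to
# [16, 16, ...] at level 6.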
class AnchorLabeler(object):
"""Labeler for multiscale anchor boxes."""
def __init__(self, anchors, num_classes, match_threshold=0.7,
unmatched_threshold=0.3, rpn_batch_size_per_im=256,
rpn_fg_fraction=0.5):
"""Constructs anchor labeler to assign labels to anchors.
Args:
anchors: an instance of class Anchors.
num_classes: integer number representing number of classes in the dataset.
match_threshold: a float number between 0 and 1 representing the
lower-bound threshold to assign positive labels for anchors. An anchor
with a score over the threshold is labeled positive.
unmatched_threshold: a float number between 0 and 1 representing the
upper-bound threshold to assign negative labels for anchors. An anchor
with a score below the threshold is labeled negative.
rpn_batch_size_per_im: an integer number that represents the number of
sampled anchors per image in the first stage (region proposal network).
rpn_fg_fraction: a float number between 0 and 1 representing the fraction
of positive anchors (foreground) in the first stage.
"""
similarity_calc = region_similarity_calculator.IouSimilarity()
matcher = argmax_matcher.ArgMaxMatcher(
match_threshold,
unmatched_threshold=unmatched_threshold,
negatives_lower_than_unmatched=True,
force_match_for_each_row=True)
box_coder = faster_rcnn_box_coder.FasterRcnnBoxCoder()
self._target_assigner = target_assigner.TargetAssigner(
similarity_calc, matcher, box_coder)
self._anchors = anchors
self._match_threshold = match_threshold
self._unmatched_threshold = unmatched_threshold
self._rpn_batch_size_per_im = rpn_batch_size_per_im
self._rpn_fg_fraction = rpn_fg_fraction
self._num_classes = num_classes
def _get_rpn_samples(self, match_results):
"""Computes anchor labels.
This function performs subsampling for foreground (fg) and background (bg)
anchors.
Args:
match_results: An integer tensor with shape [N] representing the
matching results of anchors. (1) match_results[i]>=0,
meaning that column i is matched with row match_results[i].
(2) match_results[i]=-1, meaning that column i is not matched.
(3) match_results[i]=-2, meaning that column i is ignored.
Returns:
score_targets: an integer tensor with shape [N].
(1) score_targets[i]=1, the anchor is a positive sample.
(2) score_targets[i]=0, the anchor is a negative sample.
(3) score_targets[i]=-1, the anchor is ignored (don't care).
"""
sampler = (
balanced_positive_negative_sampler.BalancedPositiveNegativeSampler(
positive_fraction=self._rpn_fg_fraction, is_static=False))
# indicator includes both positive and negative labels.
# labels includes only positives labels.
# positives = indicator & labels.
# negatives = indicator & !labels.
# ignore = !indicator.
indicator = tf.greater(match_results, -2)
labels = tf.greater(match_results, -1)
samples = sampler.subsample(
indicator, self._rpn_batch_size_per_im, labels)
positive_labels = tf.where(
tf.logical_and(samples, labels),
tf.constant(2, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
negative_labels = tf.where(
tf.logical_and(samples, tf.logical_not(labels)),
tf.constant(1, dtype=tf.int32, shape=match_results.shape),
tf.constant(0, dtype=tf.int32, shape=match_results.shape))
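# ignore_labels (defined next) is -1 everywhere; summing it with
# positive_labels (+2) and negative_labels (+1) yields 1 for sampled
# foreground anchors, 0 for sampled background anchors, and -1 for anchors
# that were not sampled (ignored).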
ignore_labels = tf.fill(match_results.shape, -1)
return (ignore_labels + positive_labels + negative_labels,
positive_labels, negative_labels)
def label_anchors(self, gt_boxes, gt_labels):
"""Labels anchors with ground truth inputs.
Args:
gt_boxes: A float tensor with shape [N, 4] representing groundtruth boxes.
For each row, it stores [y0, x0, y1, x1] for four corners of a box.
gt_labels: An integer tensor with shape [N, 1] representing groundtruth
classes.
Returns:
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of class logits at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
"""
gt_box_list = box_list.BoxList(gt_boxes)
anchor_box_list = box_list.BoxList(self._anchors.boxes)
# cls_targets, cls_weights, box_weights are not used
_, _, box_targets, _, matches = self._target_assigner.assign(
anchor_box_list, gt_box_list, gt_labels)
# score_targets contains the subsampled positive and negative anchors.
score_targets, _, _ = self._get_rpn_samples(matches.match_results)
# Unpack labels.
score_targets_dict = self._anchors.unpack_labels(score_targets)
box_targets_dict = self._anchors.unpack_labels(box_targets)
return score_targets_dict, box_targets_dict
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""COCO-style evaluation metrics.
Implements the interface of COCO API and metric_fn in tf.TPUEstimator.
COCO API: github.com/cocodataset/cocoapi/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import atexit
import copy
import tempfile
import numpy as np
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
import pycocotools.mask as maskUtils
import cv2
class MaskCOCO(COCO):
"""COCO object for mask evaluation.
"""
def reset(self, dataset):
"""Reset the dataset and groundtruth data index in this object.
Args:
dataset: dict of groundtruth data. It should have a similar structure to the
COCO groundtruth JSON file and must contain three keys: {'images',
'annotations', 'categories'}.
'images': list of image information dictionary. Required keys: 'id',
'width' and 'height'.
'annotations': list of dict. Bounding boxes and segmentations related
information. Required keys: {'id', 'image_id', 'category_id', 'bbox',
'iscrowd', 'area', 'segmentation'}.
'categories': list of dict of the category information.
Required key: 'id'.
Refer to http://cocodataset.org/#format-data for more details.
Raises:
AttributeError: If the dataset is empty or not a dict.
"""
assert dataset, 'Groundtruth should not be empty.'
assert isinstance(dataset,
dict), 'annotation file format {} not supported'.format(
type(dataset))
self.anns, self.cats, self.imgs = dict(), dict(), dict()
self.dataset = copy.deepcopy(dataset)
self.createIndex()
def loadRes(self, detection_results, include_mask, is_image_mask=False):
"""Load result file and return a result api object.
Args:
detection_results: a dictionary containing predictions results.
include_mask: a boolean, whether to include mask in detection results.
is_image_mask: a boolean, whether the predicted mask is a whole-image mask.
Returns:
res: result MaskCOCO api object
"""
res = MaskCOCO()
res.dataset['images'] = [img for img in self.dataset['images']]
logging.info('Loading and preparing results...')
predictions = self.load_predictions(
detection_results,
include_mask=include_mask,
is_image_mask=is_image_mask)
assert isinstance(predictions, list), 'results is not an array of objects'
if predictions:
image_ids = [pred['image_id'] for pred in predictions]
assert set(image_ids) == (set(image_ids) & set(self.getImgIds())), \
'Results do not correspond to current coco set'
if (predictions and 'bbox' in predictions[0] and predictions[0]['bbox']):
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for idx, pred in enumerate(predictions):
bb = pred['bbox']
x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
if 'segmentation' not in pred:
pred['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
pred['area'] = bb[2] * bb[3]
pred['id'] = idx + 1
pred['iscrowd'] = 0
elif 'segmentation' in predictions[0]:
res.dataset['categories'] = copy.deepcopy(self.dataset['categories'])
for idx, pred in enumerate(predictions):
# now only support compressed RLE format as segmentation results
pred['area'] = maskUtils.area(pred['segmentation'])
if 'bbox' not in pred:
pred['bbox'] = maskUtils.toBbox(pred['segmentation'])
pred['id'] = idx + 1
pred['iscrowd'] = 0
res.dataset['annotations'] = predictions
res.createIndex()
return res
def load_predictions(self,
detection_results,
include_mask,
is_image_mask=False):
"""Create prediction dictionary list from detection and mask results.
Args:
detection_results: a dictionary containing numpy arrays which correspond
to prediction results.
include_mask: a boolean, whether to include mask in detection results.
is_image_mask: a boolean, whether the predicted mask is a whole-image mask.
Returns:
a list of dictionary including different prediction results from the model
in numpy form.
"""
predictions = []
num_detections = detection_results['detection_scores'].size
current_index = 0
for i, image_id in enumerate(detection_results['source_id']):
if include_mask:
box_coordinates_in_image = detection_results['detection_boxes'][i]
segments = generate_segmentation_from_masks(
detection_results['detection_masks'][i],
box_coordinates_in_image,
int(detection_results['image_info'][i][3]),
int(detection_results['image_info'][i][4]),
is_image_mask=is_image_mask
)
# Convert the mask to uint8 and then to fortranarray for RLE encoder.
encoded_masks = [
maskUtils.encode(np.asfortranarray(instance_mask.astype(np.uint8)))
for instance_mask in segments
]
for box_index in range(int(detection_results['num_detections'][i])):
if current_index % 1000 == 0:
logging.info('{}/{}'.format(current_index, num_detections))
current_index += 1
prediction = {
'image_id': int(image_id),
'bbox': detection_results['detection_boxes'][i][box_index].tolist(),
'score': detection_results['detection_scores'][i][box_index],
'category_id': int(
detection_results['detection_classes'][i][box_index]),
}
if include_mask:
prediction['segmentation'] = encoded_masks[box_index]
predictions.append(prediction)
return predictions
def generate_segmentation_from_masks(masks,
detected_boxes,
image_height,
image_width,
is_image_mask=False):
"""Generates segmentation result from instance masks.
Args:
masks: a numpy array of shape [N, mask_height, mask_width] representing the
instance masks w.r.t. the `detected_boxes`.
detected_boxes: a numpy array of shape [N, 4] representing the reference
bounding boxes.
image_height: an integer representing the height of the image.
image_width: an integer representing the width of the image.
is_image_mask: bool. True: input masks are whole-image masks. False: input
masks are bounding-box level masks.
Returns:
segms: a numpy array of shape [N, image_height, image_width] representing
the instance masks *pasted* on the image canvas.
"""
def expand_boxes(boxes, scale):
"""Expands an array of boxes by a given scale."""
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/boxes.py#L227
# The `boxes` in the reference implementation is in [x1, y1, x2, y2] form,
# whereas `boxes` here is in [x1, y1, w, h] form
w_half = boxes[:, 2] * .5
h_half = boxes[:, 3] * .5
x_c = boxes[:, 0] + w_half
y_c = boxes[:, 1] + h_half
w_half *= scale
h_half *= scale
boxes_exp = np.zeros(boxes.shape)
boxes_exp[:, 0] = x_c - w_half
boxes_exp[:, 2] = x_c + w_half
boxes_exp[:, 1] = y_c - h_half
boxes_exp[:, 3] = y_c + h_half
return boxes_exp
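# Worked example (values follow directly from the formulas above): the box
# [10, 20, 4, 6] in [x1, y1, w, h] form with scale=1.25 has center (12, 23)
# and half sizes (2, 3); scaling gives (2.5, 3.75), so the expanded box is
# [9.5, 19.25, 14.5, 26.75] in [x1, y1, x2, y2] form.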
# Reference: https://github.com/facebookresearch/Detectron/blob/master/detectron/core/test.py#L812
# To work around an issue with cv2.resize (it seems to automatically pad
# with repeated border values), we manually zero-pad the masks by 1 pixel
# prior to resizing back to the original image resolution. This prevents
# "top hat" artifacts. We therefore need to expand the reference boxes by an
# appropriate factor.
_, mask_height, mask_width = masks.shape
scale = max((mask_width + 2.0) / mask_width,
(mask_height + 2.0) / mask_height)
ref_boxes = expand_boxes(detected_boxes, scale)
ref_boxes = ref_boxes.astype(np.int32)
padded_mask = np.zeros((mask_height + 2, mask_width + 2), dtype=np.float32)
segms = []
for mask_ind, mask in enumerate(masks):
im_mask = np.zeros((image_height, image_width), dtype=np.uint8)
if is_image_mask:
# Process whole-image masks.
im_mask[:, :] = mask[:, :]
else:
# Process mask inside bounding boxes.
padded_mask[1:-1, 1:-1] = mask[:, :]
ref_box = ref_boxes[mask_ind, :]
w = ref_box[2] - ref_box[0] + 1
h = ref_box[3] - ref_box[1] + 1
w = np.maximum(w, 1)
h = np.maximum(h, 1)
mask = cv2.resize(padded_mask, (w, h))
mask = np.array(mask > 0.5, dtype=np.uint8)
x_0 = max(ref_box[0], 0)
x_1 = min(ref_box[2] + 1, image_width)
y_0 = max(ref_box[1], 0)
y_1 = min(ref_box[3] + 1, image_height)
im_mask[y_0:y_1, x_0:x_1] = mask[(y_0 - ref_box[1]):(y_1 - ref_box[1]), (
x_0 - ref_box[0]):(x_1 - ref_box[0])]
segms.append(im_mask)
segms = np.array(segms)
assert masks.shape[0] == segms.shape[0]
return segms
class EvaluationMetric(object):
"""COCO evaluation metric class."""
def __init__(self, filename, include_mask):
"""Constructs COCO evaluation class.
The class provides the interface to metrics_fn in TPUEstimator. The
_evaluate() loads a JSON file in COCO annotation format as the
groundtruths and runs COCO evaluation.
Args:
filename: Ground truth JSON file name. If filename is None, use
groundtruth data passed from the dataloader for evaluation.
include_mask: boolean to indicate whether or not to include mask eval.
"""
if filename:
if filename.startswith('gs://'):
_, local_val_json = tempfile.mkstemp(suffix='.json')
tf.io.gfile.remove(local_val_json)
tf.io.gfile.copy(filename, local_val_json)
atexit.register(tf.io.gfile.remove, local_val_json)
else:
local_val_json = filename
self.coco_gt = MaskCOCO(local_val_json)
self.filename = filename
self.metric_names = ['AP', 'AP50', 'AP75', 'APs', 'APm', 'APl', 'ARmax1',
'ARmax10', 'ARmax100', 'ARs', 'ARm', 'ARl']
self._include_mask = include_mask
if self._include_mask:
mask_metric_names = ['mask_' + x for x in self.metric_names]
self.metric_names.extend(mask_metric_names)
self._reset()
def _reset(self):
"""Reset COCO API object."""
if self.filename is None and not hasattr(self, 'coco_gt'):
self.coco_gt = MaskCOCO()
def predict_metric_fn(self,
predictions,
is_predict_image_mask=False,
groundtruth_data=None):
"""Generates COCO metrics."""
image_ids = list(set(predictions['source_id']))
if groundtruth_data is not None:
self.coco_gt.reset(groundtruth_data)
coco_dt = self.coco_gt.loadRes(
predictions, self._include_mask, is_image_mask=is_predict_image_mask)
coco_eval = COCOeval(self.coco_gt, coco_dt, iouType='bbox')
coco_eval.params.imgIds = image_ids
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
coco_metrics = coco_eval.stats
if self._include_mask:
# Create another object for instance segmentation metric evaluation.
mcoco_eval = COCOeval(self.coco_gt, coco_dt, iouType='segm')
mcoco_eval.params.imgIds = image_ids
mcoco_eval.evaluate()
mcoco_eval.accumulate()
mcoco_eval.summarize()
mask_coco_metrics = mcoco_eval.stats
if self._include_mask:
metrics = np.hstack((coco_metrics, mask_coco_metrics))
else:
metrics = coco_metrics
# clean up after evaluation is done.
self._reset()
metrics = metrics.astype(np.float32)
metrics_dict = {}
for i, name in enumerate(self.metric_names):
metrics_dict[name] = metrics[i]
return metrics_dict
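# Usage note (derived from load_predictions/predict_metric_fn above): the
# `predictions` argument is a dictionary of numpy arrays keyed by 'source_id',
# 'num_detections', 'detection_boxes', 'detection_classes', 'detection_scores',
# and, when include_mask is True, 'image_info' and 'detection_masks'. A hedged
# sketch of a call (the file name is an example, not a requirement):
#   metric = EvaluationMetric(filename='instances_val2017.json', include_mask=True)
#   metrics_dict = metric.predict_metric_fn(predictions)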
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Data loader and processing.
Defines input_fn of Mask-RCNN for TF Estimator. The input_fn includes training
data for category classification, bounding box regression, and number of
positive examples to normalize the loss during training.
"""
import functools
import math
import multiprocessing
import tensorflow as tf
from mask_rcnn.utils.logging_formatter import logging
from mask_rcnn.utils.distributed_utils import MPI_is_distributed
from mask_rcnn.utils.distributed_utils import MPI_rank_and_size
from mask_rcnn.utils.distributed_utils import MPI_rank
from mask_rcnn.utils.distributed_utils import MPI_size
# common functions
from mask_rcnn.dataloader_utils import dataset_parser
from distutils.version import LooseVersion
class InputReader(object):
"""Input reader for dataset."""
def __init__(
self,
file_pattern,
mode=tf.estimator.ModeKeys.TRAIN,
num_examples=0,
use_fake_data=False,
use_instance_mask=False,
seed=None
):
self._mode = mode
self._file_pattern = file_pattern
self._num_examples = num_examples
self._use_fake_data = use_fake_data
self._use_instance_mask = use_instance_mask
self._seed = seed
def _create_dataset_parser_fn(self, params):
"""Create parser for parsing input data (dictionary)."""
return functools.partial(
dataset_parser,
mode=self._mode,
params=params,
use_instance_mask=self._use_instance_mask,
seed=self._seed
)
def __call__(self, params, input_context=None):
batch_size = params['batch_size'] if 'batch_size' in params else 1
try:
seed = params['seed'] if not MPI_is_distributed() else params['seed'] * MPI_rank()
except (KeyError, TypeError):
seed = None
if MPI_is_distributed():
n_gpus = MPI_size()
elif input_context is not None:
n_gpus = input_context.num_input_pipelines
else:
n_gpus = 1
##################################################
dataset = tf.data.Dataset.list_files(
self._file_pattern,
shuffle=False
)
if self._mode == tf.estimator.ModeKeys.TRAIN:
if input_context is not None:
logging.info("Using Dataset Sharding with TF Distributed")
_num_shards = input_context.num_input_pipelines
_shard_idx = input_context.input_pipeline_id
elif MPI_is_distributed():
logging.info("Using Dataset Sharding with Horovod")
_shard_idx, _num_shards = MPI_rank_and_size()
try:
dataset = dataset.shard(
num_shards=_num_shards,
index=_shard_idx
)
dataset = dataset.shuffle(math.ceil(256 / _num_shards))
except NameError: # Not a distributed training setup
pass
def _prefetch_dataset(filename):
return tf.data.TFRecordDataset(filename).prefetch(1)
dataset = dataset.interleave(
map_func=_prefetch_dataset,
cycle_length=32,
block_length=64,
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
if self._num_examples is not None and self._num_examples > 0:
logging.info("[*] Limiting the amount of sample to: %d" % self._num_examples)
dataset = dataset.take(self._num_examples)
dataset = dataset.cache()
if self._mode == tf.estimator.ModeKeys.TRAIN:
dataset = dataset.shuffle(
buffer_size=4096,
reshuffle_each_iteration=True,
seed=seed
)
dataset = dataset.repeat()
# Parse the fetched records to input tensors for model function.
dataset = dataset.map(
map_func=self._create_dataset_parser_fn(params),
num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
dataset = dataset.batch(
batch_size=batch_size,
drop_remainder=True
)
if self._use_fake_data:
# Turn this dataset into a semi-fake dataset which always loops over the
# first batch. This reduces variance in performance and is useful for
# testing.
logging.info("Using Fake Dataset Loop...")
dataset = dataset.take(1).cache().repeat()
if self._mode != tf.estimator.ModeKeys.TRAIN:
dataset = dataset.take(int(5000 / batch_size))
dataset = dataset.prefetch(
buffer_size=tf.data.experimental.AUTOTUNE,
)
if self._mode == tf.estimator.ModeKeys.PREDICT or n_gpus > 1:
if not tf.distribute.has_strategy():
dataset = dataset.apply(
tf.data.experimental.prefetch_to_device(
'/gpu:0', # With Horovod the local GPU is always 0
buffer_size=1,
)
)
data_options = tf.data.Options()
data_options.experimental_deterministic = seed is not None
if LooseVersion(tf.__version__) <= LooseVersion("2.0.0"):
data_options.experimental_distribute.auto_shard = False
else:
data_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
# data_options.experimental_distribute.auto_shard = False
data_options.experimental_slack = True
data_options.experimental_threading.max_intra_op_parallelism = 1
# data_options.experimental_threading.private_threadpool_size = int(multiprocessing.cpu_count() / n_gpus) * 2
# ================= experimental_optimization ================= #
data_options.experimental_optimization.apply_default_optimizations = False
# data_options.experimental_optimization.autotune = True
data_options.experimental_optimization.filter_fusion = True
data_options.experimental_optimization.map_and_batch_fusion = True
data_options.experimental_optimization.map_and_filter_fusion = True
data_options.experimental_optimization.map_fusion = True
data_options.experimental_optimization.map_parallelization = True
map_vectorization_options = tf.data.experimental.MapVectorizationOptions()
map_vectorization_options.enabled = True
map_vectorization_options.use_choose_fastest = True
data_options.experimental_optimization.map_vectorization = map_vectorization_options
data_options.experimental_optimization.noop_elimination = True
data_options.experimental_optimization.parallel_batch = True
data_options.experimental_optimization.shuffle_and_repeat_fusion = True
# ========== Stats on TF Data =============
# aggregator = tf.data.experimental.StatsAggregator()
# data_options.experimental_stats.aggregator = aggregator
# data_options.experimental_stats.latency_all_edges = True
dataset = dataset.with_options(data_options)
return dataset
if __name__ == "__main__":
'''
Data Loading Benchmark Usage:
# Real Data - Training
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=2 \
--warmup_steps=200 \
--benchmark_steps=2000 \
--training
# Real Data - Inference
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=8 \
--warmup_steps=200 \
--benchmark_steps=2000
# --------------- #
# Synthetic Data - Training
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=2 \
--warmup_steps=200 \
--benchmark_steps=2000 \
--training \
--use_synthetic_data
# Synthetic Data - Inference
python -m mask_rcnn.dataloader \
--data_dir="/data/" \
--batch_size=8 \
--warmup_steps=200 \
--benchmark_steps=2000 \
--use_synthetic_data
# --------------- #
'''
import os
import time
import argparse
import numpy as np
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.disable_eager_execution()
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
logging.set_verbosity(logging.INFO)
parser = argparse.ArgumentParser(description="MaskRCNN Dataloader Benchmark")
parser.add_argument(
'--data_dir', required=True, type=str, help="Directory path which contains the preprocessed COCO dataset in TFRecord format"
)
parser.add_argument(
'--batch_size', default=64, type=int, required=True, help="""Batch size used to measure performance."""
)
parser.add_argument(
'--warmup_steps',
default=200,
type=int,
required=True,
help="""Number of steps considered as warmup and not taken into account for performance measurements."""
)
parser.add_argument(
'--benchmark_steps',
default=200,
type=int,
required=True,
help="Number of steps used to benchmark dataloading performance. Only used in training"
)
parser.add_argument(
'--seed',
default=666,
type=int,
required=False,
help="""Reproducibility Seed."""
)
parser.add_argument("--training", default=False, action="store_true", help="Benchmark in training mode")
parser.add_argument("--use_synthetic_data", default=False, action="store_true", help="Use synthetic dataset")
FLAGS, unknown_args = parser.parse_known_args()
if len(unknown_args) > 0:
for bad_arg in unknown_args:
print("ERROR: Unknown command line arg: %s" % bad_arg)
raise ValueError("Invalid command line arg(s)")
BURNIN_STEPS = FLAGS.warmup_steps
if FLAGS.training:
TOTAL_STEPS = FLAGS.warmup_steps + FLAGS.benchmark_steps
else:
TOTAL_STEPS = int(1e6) # Wait for end of dataset
if FLAGS.training:
input_dataset = InputReader(
file_pattern=os.path.join(FLAGS.data_dir, "train*.tfrecord"),
mode=tf.estimator.ModeKeys.TRAIN,
use_fake_data=FLAGS.use_synthetic_data,
use_instance_mask=True,
seed=FLAGS.seed
)
else:
input_dataset = InputReader(
file_pattern=os.path.join(FLAGS.data_dir, "val*.tfrecord"),
mode=tf.estimator.ModeKeys.PREDICT,
num_examples=5000,
use_fake_data=FLAGS.use_synthetic_data,
use_instance_mask=True,
seed=FLAGS.seed
)
logging.info("[*] Executing Benchmark in %s mode" % ("training" if FLAGS.training else "inference"))
logging.info("[*] Benchmark using %s data" % ("synthetic" if FLAGS.use_synthetic_data else "real"))
time.sleep(1)
# Build the data input
dataset = input_dataset(
params={
"anchor_scale": 8.0,
"aspect_ratios": [[1.0, 1.0], [1.4, 0.7], [0.7, 1.4]],
"batch_size": FLAGS.batch_size,
"gt_mask_size": 112,
"image_size": [1024, 1024],
"include_groundtruth_in_features": False,
"augment_input_data": True,
"max_level": 6,
"min_level": 2,
"num_classes": 91,
"num_scales": 1,
"rpn_batch_size_per_im": 256,
"rpn_fg_fraction": 0.5,
"rpn_min_size": 0.,
"rpn_nms_threshold": 0.7,
"rpn_negative_overlap": 0.3,
"rpn_positive_overlap": 0.7,
"rpn_post_nms_topn": 1000,
"rpn_pre_nms_topn": 2000,
"skip_crowd_during_training": True,
"use_category": True,
"visualize_images_summary": False,
}
)
dataset_iterator = dataset.make_initializable_iterator()
if FLAGS.training:
X, Y = dataset_iterator.get_next()
else:
X = dataset_iterator.get_next()
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
config.log_device_placement = False
with tf.device("gpu:0"):
X_gpu_ops = list()
Y_gpu_ops = list()
if FLAGS.training:
for _, _x in X.items():
X_gpu_ops.append(tf.identity(_x))
for _, _y in Y.items():
Y_gpu_ops.append(tf.identity(_y))
else:
for _, _x in X["features"].items():
X_gpu_ops.append(tf.identity(_x))
with tf.control_dependencies(X_gpu_ops + Y_gpu_ops):
input_op = tf.constant(1.0)
with tf.compat.v1.Session(config=config) as sess:
sess.run(dataset_iterator.initializer)
sess.run(tf.compat.v1.global_variables_initializer())
total_files_processed = 0
img_per_sec_arr = []
processing_time_arr = []
processing_start_time = time.time()
for step in range(TOTAL_STEPS):
try:
start_time = time.time()
sess.run(input_op)
elapsed_time = (time.time() - start_time) * 1000
imgs_per_sec = (FLAGS.batch_size / elapsed_time) * 1000
total_files_processed += FLAGS.batch_size
if (step + 1) > BURNIN_STEPS:
processing_time_arr.append(elapsed_time)
img_per_sec_arr.append(imgs_per_sec)
if (step + 1) % 20 == 0 or (step + 1) == TOTAL_STEPS:
print(
"[STEP %04d] # Batch Size: %03d - Time: %03d msecs - Speed: %6d img/s" %
(step + 1, FLAGS.batch_size, elapsed_time, imgs_per_sec)
)
except tf.errors.OutOfRangeError:
break
processing_time = time.time() - processing_start_time
avg_processing_speed = np.mean(img_per_sec_arr)
print("\n###################################################################")
print("*** Data Loading Performance Metrics ***\n")
print("\t=> Number of Steps: %d" % (step + 1))
print("\t=> Batch Size: %d" % FLAGS.batch_size)
print("\t=> Files Processed: %d" % total_files_processed)
print("\t=> Total Execution Time: %d secs" % processing_time)
print("\t=> Median Time per step: %3d msecs" % np.median(processing_time_arr))
print("\t=> Median Processing Speed: %d images/secs" % np.median(img_per_sec_arr))
print("\t=> Median Processing Time: %.2f msecs/image" % (1 / float(np.median(img_per_sec_arr)) * 1000))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Data loader and processing.
Defines input_fn of Mask-RCNN for TF Estimator. The input_fn includes training
data for category classification, bounding box regression, and number of
positive examples to normalize the loss during training.
"""
import tensorflow as tf
from mask_rcnn import anchors
from mask_rcnn.utils import coco_utils
from mask_rcnn.ops import preprocess_ops
from mask_rcnn.object_detection import tf_example_decoder
MAX_NUM_INSTANCES = 100
MAX_NUM_VERTICES_PER_INSTANCE = 1500
MAX_NUM_POLYGON_LIST_LEN = 2 * MAX_NUM_VERTICES_PER_INSTANCE * MAX_NUM_INSTANCES
POLYGON_PAD_VALUE = coco_utils.POLYGON_PAD_VALUE
__all__ = [
# dataset parser
"dataset_parser",
# common functions
"preprocess_image",
"process_groundtruth_is_crowd",
"process_source_id",
# eval
"prepare_labels_for_eval",
# training
"augment_image",
"process_boxes_classes_indices_for_training",
"process_gt_masks_for_training",
"process_labels_for_training",
"process_targets_for_training"
]
###############################################################################################################
def dataset_parser(value, mode, params, use_instance_mask, seed=None, regenerate_source_id=False):
"""Parse data to a fixed dimension input image and learning targets.
Args:
value: a serialized tf.Example record containing an image and groundtruth annotations.
Returns:
features: a dictionary that contains the image and auxiliary
information. The following describes {key: value} pairs in the
dictionary.
image: Image tensor that is preprocessed to have normalized value and
fixed dimension [image_size, image_size, 3]
image_info: image information that includes the original height and
width, the scale of the processed image to the original image, and
the scaled height and width.
source_ids: Source image id. Default value -1 if the source id is
empty in the groundtruth annotation.
labels: a dictionary that contains auxiliary information plus (optional)
labels. The following describes {key: value} pairs in the dictionary.
`labels` is only for training.
score_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors]. The height_l and width_l
represent the dimension of objectiveness score at l-th level.
box_targets_dict: ordered dictionary with keys
[min_level, min_level+1, ..., max_level]. The values are tensor with
shape [height_l, width_l, num_anchors * 4]. The height_l and
width_l represent the dimension of bounding box regression output at
l-th level.
gt_boxes: Groundtruth bounding box annotations. The box is represented
in [y1, x1, y2, x2] format. The tensor is padded with -1 to the
fixed dimension [MAX_NUM_INSTANCES, 4].
gt_classes: Groundtruth classes annotations. The tensor is padded
with -1 to the fixed dimension [MAX_NUM_INSTANCES].
cropped_gt_masks: groundtruth masks cropped by the bounding box and
resized to a fixed size determined by params['gt_mask_size']
regenerate_source_id: `bool`, if True TFExampleParser will use hashed
value of `image/encoded` for `image/source_id`.
"""
if mode not in [tf.estimator.ModeKeys.TRAIN, tf.estimator.ModeKeys.PREDICT, tf.estimator.ModeKeys.EVAL]:
raise ValueError("Unknown execution mode received: %s" % mode)
def create_example_decoder():
return tf_example_decoder.TfExampleDecoder(
use_instance_mask=use_instance_mask,
regenerate_source_id=regenerate_source_id
)
example_decoder = create_example_decoder()
with tf.xla.experimental.jit_scope(compile_ops=True):
with tf.name_scope('parser'):
data = example_decoder.decode(value)
data['groundtruth_is_crowd'] = process_groundtruth_is_crowd(data)
image = tf.image.convert_image_dtype(data['image'], dtype=tf.float32)
source_id = process_source_id(data['source_id'])
if mode == tf.estimator.ModeKeys.PREDICT:
features = {
'source_ids': source_id,
}
if params['visualize_images_summary']:
features['orig_images'] = tf.image.resize(image, params['image_size'])
features["images"], features["image_info"], _, _ = preprocess_image(
image,
boxes=None,
instance_masks=None,
image_size=params['image_size'],
max_level=params['max_level'],
augment_input_data=False,
seed=seed
)
if params['include_groundtruth_in_features']:
labels = prepare_labels_for_eval(
data,
target_num_instances=MAX_NUM_INSTANCES,
target_polygon_list_len=MAX_NUM_POLYGON_LIST_LEN,
use_instance_mask=params['include_mask']
)
return {'features': features, 'labels': labels}
else:
return {'features': features}
elif mode == tf.estimator.ModeKeys.TRAIN:
labels = {}
features = {
'source_ids': source_id
}
boxes, classes, indices, instance_masks = process_boxes_classes_indices_for_training(
data,
skip_crowd_during_training=params['skip_crowd_during_training'],
use_category=params['use_category'],
use_instance_mask=use_instance_mask
)
image, image_info, boxes, instance_masks = preprocess_image(
image,
boxes=boxes,
instance_masks=instance_masks,
image_size=params['image_size'],
max_level=params['max_level'],
augment_input_data=params['augment_input_data'],
seed=seed
)
features.update({
'images': image,
'image_info': image_info,
})
padded_image_size = image.get_shape().as_list()[:2]
# Pads cropped_gt_masks.
if use_instance_mask:
labels['cropped_gt_masks'] = process_gt_masks_for_training(
instance_masks,
boxes,
gt_mask_size=params['gt_mask_size'],
padded_image_size=padded_image_size,
max_num_instances=MAX_NUM_INSTANCES
)
with tf.xla.experimental.jit_scope(compile_ops=False):
# Assign anchors.
(score_targets, box_targets), input_anchor = process_targets_for_training(
padded_image_size=padded_image_size,
boxes=boxes,
classes=classes,
params=params
)
additional_labels = process_labels_for_training(
image_info, boxes, classes, score_targets, box_targets,
max_num_instances=MAX_NUM_INSTANCES,
min_level=params["min_level"],
max_level=params["max_level"]
)
labels.update(additional_labels)
# labels["input_anchor"] = input_anchor
# Features
# {
# 'source_ids': <tf.Tensor 'parser/StringToNumber:0' shape=() dtype=float32>,
# 'images': <tf.Tensor 'parser/pad_to_bounding_box/Squeeze:0' shape=(1024, 1024, 3) dtype=float32>,
# 'image_info': <tf.Tensor 'parser/stack_1:0' shape=(5,) dtype=float32>
# }
FAKE_FEATURES = False
if FAKE_FEATURES:
labels["source_ids"] = tf.ones(shape=(), dtype=tf.float32)
labels["images"] = tf.ones(shape=(1024, 1024, 3), dtype=tf.float32)
labels["image_info"] = tf.ones(shape=(5,), dtype=tf.float32)
# Labels
# {
# 'cropped_gt_masks': <tf.Tensor 'parser/Reshape_4:0' shape=(100, 116, 116) dtype=float32>,
# 'score_targets_2': <tf.Tensor 'parser/Reshape_9:0' shape=(256, 256, 3) dtype=int32>,
# 'box_targets_2': <tf.Tensor 'parser/Reshape_14:0' shape=(256, 256, 12) dtype=float32>,
# 'score_targets_3': <tf.Tensor 'parser/Reshape_10:0' shape=(128, 128, 3) dtype=int32>,
# 'box_targets_3': <tf.Tensor 'parser/Reshape_15:0' shape=(128, 128, 12) dtype=float32>,
# 'score_targets_4': <tf.Tensor 'parser/Reshape_11:0' shape=(64, 64, 3) dtype=int32>,
# 'box_targets_4': <tf.Tensor 'parser/Reshape_16:0' shape=(64, 64, 12) dtype=float32>,
# 'score_targets_5': <tf.Tensor 'parser/Reshape_12:0' shape=(32, 32, 3) dtype=int32>,
# 'box_targets_5': <tf.Tensor 'parser/Reshape_17:0' shape=(32, 32, 12) dtype=float32>,
# 'score_targets_6': <tf.Tensor 'parser/Reshape_13:0' shape=(16, 16, 3) dtype=int32>,
# 'box_targets_6': <tf.Tensor 'parser/Reshape_18:0' shape=(16, 16, 12) dtype=float32>,
# 'gt_boxes': <tf.Tensor 'parser/Reshape_20:0' shape=(100, 4) dtype=float32>,
# 'gt_classes': <tf.Tensor 'parser/Reshape_22:0' shape=(100, 1) dtype=float32>
# }
FAKE_LABELS = False
if FAKE_LABELS:
labels["cropped_gt_masks"] = tf.ones(shape=(100, 116, 116), dtype=tf.float32)
labels["gt_boxes"] = tf.ones(shape=(100, 4), dtype=tf.float32)
labels["gt_classes"] = tf.ones(shape=(100, 1), dtype=tf.float32)
idx = 1
for dim in [256, 128, 64, 32, 16]:
idx += 1 # Starts at 2
labels["score_targets_%d" % idx] = tf.ones(shape=(dim, dim, 3), dtype=tf.float32)
labels["box_targets_%d" % idx] = tf.ones(shape=(dim, dim, 12), dtype=tf.float32)
return features, labels
###############################################################################################################
# common functions
def preprocess_image(image, boxes, instance_masks, image_size, max_level, augment_input_data=False, seed=None):
image = preprocess_ops.normalize_image(image)
if augment_input_data:
image, boxes, instance_masks = augment_image(image=image, boxes=boxes, instance_masks=instance_masks, seed=seed)
# Scaling and padding.
image, image_info, boxes, instance_masks = preprocess_ops.resize_and_pad(
image=image,
target_size=image_size,
stride=2 ** max_level,
boxes=boxes,
masks=instance_masks
)
return image, image_info, boxes, instance_masks
def process_groundtruth_is_crowd(data):
return tf.cond(
pred=tf.greater(tf.size(input=data['groundtruth_is_crowd']), 0),
true_fn=lambda: data['groundtruth_is_crowd'],
false_fn=lambda: tf.zeros_like(data['groundtruth_classes'], dtype=tf.bool)
)
# def process_source_id(data):
# source_id = tf.where(tf.equal(source_id, tf.constant('')), '-1', source_id)
# source_id = tf.strings.to_number(source_id)
# return source_id
def process_source_id(source_id):
"""Processes source_id to the right format."""
if source_id.dtype == tf.string:
source_id = tf.cast(tf.strings.to_number(source_id), tf.int64)
with tf.control_dependencies([source_id]):
source_id = tf.cond(
tf.equal(tf.size(source_id), 0),
lambda: tf.cast(tf.constant(-1), tf.int64),
lambda: tf.identity(source_id)
)
return source_id
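# Example (follows from the branches above): a string id such as '397133' is
# converted to the int64 scalar 397133, while an empty source_id (size 0) is
# mapped to the sentinel value -1.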
# eval
def prepare_labels_for_eval(
data,
target_num_instances=MAX_NUM_INSTANCES,
target_polygon_list_len=MAX_NUM_POLYGON_LIST_LEN,
use_instance_mask=False
):
"""Create labels dict for infeed from data of tf.Example."""
image = data['image']
height = tf.shape(input=image)[0]
width = tf.shape(input=image)[1]
boxes = data['groundtruth_boxes']
classes = tf.cast(data['groundtruth_classes'], dtype=tf.float32)
num_labels = tf.shape(input=classes)[0]
boxes = preprocess_ops.pad_to_fixed_size(boxes, -1, [target_num_instances, 4])
classes = preprocess_ops.pad_to_fixed_size(classes, -1, [target_num_instances, 1])
is_crowd = tf.cast(data['groundtruth_is_crowd'], dtype=tf.float32)
is_crowd = preprocess_ops.pad_to_fixed_size(is_crowd, 0, [target_num_instances, 1])
labels = dict()
labels['width'] = width
labels['height'] = height
labels['groundtruth_boxes'] = boxes
labels['groundtruth_classes'] = classes
labels['num_groundtruth_labels'] = num_labels
labels['groundtruth_is_crowd'] = is_crowd
if use_instance_mask:
data['groundtruth_polygons'] = preprocess_ops.pad_to_fixed_size(
data=data['groundtruth_polygons'],
pad_value=POLYGON_PAD_VALUE,
output_shape=[target_polygon_list_len, 1]
)
if 'groundtruth_area' in data:
labels['groundtruth_area'] = preprocess_ops.pad_to_fixed_size(
data=data['groundtruth_area'],
pad_value=0,
output_shape=[target_num_instances, 1]
)
return labels
# training
def augment_image(image, boxes, instance_masks, seed):
flipped_results = preprocess_ops.random_horizontal_flip(
image,
boxes=boxes,
masks=instance_masks,
seed=seed
)
if instance_masks is not None:
image, boxes, instance_masks = flipped_results
else:
image, boxes = flipped_results
# image = tf.image.random_brightness(image, max_delta=0.1, seed=seed)
# image = tf.image.random_contrast(image, lower=0.9, upper=1.1, seed=seed)
# image = tf.image.random_saturation(image, lower=0.9, upper=1.1, seed=seed)
# image = tf.image.random_jpeg_quality(image, min_jpeg_quality=80, max_jpeg_quality=100, seed=seed)
return image, boxes, instance_masks
def process_boxes_classes_indices_for_training(data, skip_crowd_during_training, use_category, use_instance_mask):
boxes = data['groundtruth_boxes']
classes = data['groundtruth_classes']
classes = tf.reshape(tf.cast(classes, dtype=tf.float32), [-1, 1])
indices = None
instance_masks = None
if not use_category:
classes = tf.cast(tf.greater(classes, 0), dtype=tf.float32)
if skip_crowd_during_training:
indices = tf.where(tf.logical_not(data['groundtruth_is_crowd']))
classes = tf.gather_nd(classes, indices)
boxes = tf.gather_nd(boxes, indices)
if use_instance_mask:
instance_masks = tf.gather_nd(data['groundtruth_instance_masks'], indices)
return boxes, classes, indices, instance_masks
def process_gt_masks_for_training(instance_masks, boxes, gt_mask_size, padded_image_size, max_num_instances):
cropped_gt_masks = preprocess_ops.crop_gt_masks(
instance_masks=instance_masks,
boxes=boxes,
gt_mask_size=gt_mask_size,
image_size=padded_image_size
)
# cropped_gt_masks = tf.reshape(cropped_gt_masks, [max_num_instances, -1])
cropped_gt_masks = preprocess_ops.pad_to_fixed_size(
data=cropped_gt_masks,
pad_value=-1,
output_shape=[max_num_instances, (gt_mask_size + 4) ** 2]
)
return tf.reshape(cropped_gt_masks, [max_num_instances, gt_mask_size + 4, gt_mask_size + 4])
def process_labels_for_training(
image_info, boxes, classes,
score_targets, box_targets,
max_num_instances, min_level, max_level
):
labels = {}
# Pad groundtruth data.
# boxes *= image_info[2]
boxes = preprocess_ops.pad_to_fixed_size(boxes, -1, [max_num_instances, 4])
classes = preprocess_ops.pad_to_fixed_size(classes, -1, [max_num_instances, 1])
for level in range(min_level, max_level + 1):
labels['score_targets_%d' % level] = score_targets[level]
labels['box_targets_%d' % level] = box_targets[level]
labels['gt_boxes'] = boxes
labels['gt_classes'] = classes
return labels
def process_targets_for_training(padded_image_size, boxes, classes, params):
input_anchors = anchors.Anchors(
params['min_level'],
params['max_level'],
params['num_scales'],
params['aspect_ratios'],
params['anchor_scale'],
padded_image_size
)
anchor_labeler = anchors.AnchorLabeler(
input_anchors,
params['num_classes'],
params['rpn_positive_overlap'],
params['rpn_negative_overlap'],
params['rpn_batch_size_per_im'],
params['rpn_fg_fraction']
)
return anchor_labeler.label_anchors(boxes, classes), input_anchors