diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..acaf13f02c906b45ffc2f49ee5a0ce01d82b4786 --- /dev/null +++ b/GETTING_STARTED.md @@ -0,0 +1,79 @@ +## Getting Started with Detectron2 + +This document provides a brief intro of the usage of builtin command-line tools in detectron2. + +For a tutorial that involves actual coding with the API, +see our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5) +which covers how to run inference with an +existing model, and how to train a builtin model on a custom dataset. + +For more advanced tutorials, refer to our [documentation](https://detectron2.readthedocs.io/tutorials/extend.html). + + +### Inference Demo with Pre-trained Models + +1. Pick a model and its config file from + [model zoo](MODEL_ZOO.md), + for example, `mask_rcnn_R_50_FPN_3x.yaml`. +2. We provide `demo.py` that is able to run builtin standard models. Run it with: +``` +cd demo/ +python demo.py --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \ + --input input1.jpg input2.jpg \ + [--other-options] + --opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl +``` +The configs are made for training, therefore we need to specify `MODEL.WEIGHTS` to a model from model zoo for evaluation. +This command will run the inference and show visualizations in an OpenCV window. + +For details of the command line arguments, see `demo.py -h` or look at its source code +to understand its behavior. Some common arguments are: +* To run __on your webcam__, replace `--input files` with `--webcam`. +* To run __on a video__, replace `--input files` with `--video-input video.mp4`. +* To run __on cpu__, add `MODEL.DEVICE cpu` after `--opts`. +* To save outputs to a directory (for images) or a file (for webcam or video), use `--output`. + + +### Training & Evaluation in Command Line + +We provide a script in "tools/{,plain_}train_net.py", that is made to train +all the configs provided in detectron2. +You may want to use it as a reference to write your own training script. + +To train a model with "train_net.py", first +setup the corresponding datasets following +[datasets/README.md](./datasets/README.md), +then run: +``` +cd tools/ +./train_net.py --num-gpus 8 \ + --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml +``` + +The configs are made for 8-GPU training. +To train on 1 GPU, you may need to [change some parameters](https://arxiv.org/abs/1706.02677), e.g.: +``` +./train_net.py \ + --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \ + --num-gpus 1 SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 +``` + +For most models, CPU training is not supported. + +To evaluate a model's performance, use +``` +./train_net.py \ + --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \ + --eval-only MODEL.WEIGHTS /path/to/checkpoint_file +``` +For more options, see `./train_net.py -h`. + +### Use Detectron2 APIs in Your Code + +See our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5) +to learn how to use detectron2 APIs to: +1. run inference with an existing model +2. train a builtin model on a custom dataset + +See [detectron2/projects](https://github.com/facebookresearch/detectron2/tree/master/projects) +for more ways to build your project on detectron2. 
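As a minimal sketch of the "Use Detectron2 APIs in Your Code" workflow above (assuming detectron2 and OpenCV are installed, an `input1.jpg` is available, and using the `detectron2.model_zoo` helpers described in [MODEL_ZOO.md](MODEL_ZOO.md)):

```python
# Sketch: run a COCO-pretrained Mask R-CNN from the model zoo on a single image.
import cv2
from detectron2 import model_zoo
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor

cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml"))
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")

predictor = DefaultPredictor(cfg)
outputs = predictor(cv2.imread("input1.jpg"))  # DefaultPredictor expects a BGR image
print(outputs["instances"].pred_classes)
print(outputs["instances"].pred_boxes)
```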
diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..3985f8ae4f5ecde26b310b4ab01c49b922f742e9 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,184 @@ +## Installation + +Our [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5) +has step-by-step instructions that install detectron2. +The [Dockerfile](docker) +also installs detectron2 with a few simple commands. + +### Requirements +- Linux or macOS with Python ≥ 3.6 +- PyTorch ≥ 1.4 +- [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. + You can install them together at [pytorch.org](https://pytorch.org) to make sure of this. +- OpenCV, optional, needed by demo and visualization +- pycocotools: `pip install cython; pip install -U 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'` + + +### Build Detectron2 from Source + +gcc & g++ ≥ 5 are required. [ninja](https://ninja-build.org/) is recommended for faster build. +After having them, run: +``` +python -m pip install 'git+https://github.com/facebookresearch/detectron2.git' +# (add --user if you don't have permission) + +# Or, to install it from a local clone: +git clone https://github.com/facebookresearch/detectron2.git +python -m pip install -e detectron2 + +# Or if you are on macOS +# CC=clang CXX=clang++ python -m pip install -e . +``` + +To __rebuild__ detectron2 that's built from a local clone, use `rm -rf build/ **/*.so` to clean the +old build first. You often need to rebuild detectron2 after reinstalling PyTorch. + +### Install Pre-Built Detectron2 (Linux only) +``` +# for CUDA 10.1: +python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/index.html +``` +You can replace cu101 with "cu{100,92}" or "cpu". + +Note that: +1. Such installation has to be used with certain version of official PyTorch release. + See [releases](https://github.com/facebookresearch/detectron2/releases) for requirements. + It will not work with a different version of PyTorch or a non-official build of PyTorch. +2. Such installation is out-of-date w.r.t. master branch of detectron2. It may not be + compatible with the master branch of a research project that uses detectron2 (e.g. those in + [projects](projects) or [meshrcnn](https://github.com/facebookresearch/meshrcnn/)). + +### Common Installation Issues + +If you met issues using the pre-built detectron2, please uninstall it and try building it from source. + +Click each issue for its solutions: + +
+ +Undefined torch/aten/caffe2 symbols, or segmentation fault immediately when running the library. + +
This usually happens when detectron2 or torchvision was not compiled against the version of PyTorch you are running.

A pre-built torchvision or detectron2 must be used with the corresponding official PyTorch release. If the error comes from a pre-built torchvision, uninstall torchvision and pytorch and reinstall them together following [pytorch.org](http://pytorch.org) so that the versions match.

If the error comes from a pre-built detectron2, check the [release notes](https://github.com/facebookresearch/detectron2/releases) for the PyTorch version required by each pre-built detectron2.

If the error comes from detectron2 or torchvision that you built from source, remove the files you built (`build/`, `**/*.so`) and rebuild, so that the build picks up the version of PyTorch currently in your environment.

If you cannot resolve this problem, please include the output of `gdb -ex "r" -ex "bt" -ex "quit" --args python -m detectron2.utils.collect_env`
in your issue.
+ +
+ +Undefined C++ symbols (e.g. `GLIBCXX`) or C++ symbols not found. + +
Usually this is because the library was compiled with a newer C++ compiler but is run with an old C++ runtime.

This often happens with an old anaconda installation. Try `conda update libgcc`, then rebuild detectron2.

The fundamental solution is to run the code with the proper C++ runtime, for example by setting `LD_PRELOAD=/path/to/libstdc++.so`.
+ +
+ +"Not compiled with GPU support" or "Detectron2 CUDA Compiler: not available". + +
CUDA was not found when building detectron2.
Make sure

```
python -c 'import torch; from torch.utils.cpp_extension import CUDA_HOME; print(torch.cuda.is_available(), CUDA_HOME)'
```

prints valid outputs at the time you build detectron2.

Most models can run inference (but not training) without GPU support. To use CPUs, set `MODEL.DEVICE='cpu'` in the config.
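For example (a sketch, assuming a local clone of the repository so the relative config path exists), switching a config to CPU is a single override:

```python
from detectron2.config import get_cfg

cfg = get_cfg()
cfg.merge_from_file("configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
cfg.MODEL.DEVICE = "cpu"  # run inference on CPU when detectron2 was built without CUDA
```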
+ +
+ +"invalid device function" or "no kernel image is available for execution". + +
Two possibilities:

* You built detectron2 with one version of CUDA but run it with a different version.

  To check whether this is the case, use `python -m detectron2.utils.collect_env` to find inconsistent CUDA versions.
  In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA"
  to contain CUDA libraries of the same version.

  When they are inconsistent, you need to either install a different build of PyTorch (or build it yourself)
  to match your local CUDA installation, or install a different version of CUDA to match PyTorch.

* Detectron2 or PyTorch/torchvision is not built for the correct GPU architecture (compute capability).

  The GPU architecture used for PyTorch/detectron2/torchvision is shown in the "architecture flags" of
  `python -m detectron2.utils.collect_env`.

  The GPU architecture flags of detectron2/torchvision by default match the GPU model detected
  during compilation, so the compiled code may not work on a different GPU model.
  To overwrite the GPU architecture for detectron2/torchvision, set the `TORCH_CUDA_ARCH_LIST` environment variable during compilation.

  For example, `export TORCH_CUDA_ARCH_LIST=6.0,7.0` compiles for both P100s and V100s.
  Visit [developer.nvidia.com/cuda-gpus](https://developer.nvidia.com/cuda-gpus) to find the
  correct compute capability for your device (a quick check is also sketched after this list).
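As mentioned above, a quick way to read the compute capability of your local GPU (a sketch, assuming PyTorch can see the device):

```python
import torch

# e.g. (6, 0) for P100 or (7, 0) for V100; this should be covered by the
# "architecture flags" shown by `python -m detectron2.utils.collect_env`.
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_capability(0))
```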
+ +
+ +Undefined CUDA symbols; cannot open libcudart.so; other nvcc failures. + +
+The version of NVCC you use to build detectron2 or torchvision does +not match the version of CUDA you are running with. +This often happens when using anaconda's CUDA runtime. + +Use `python -m detectron2.utils.collect_env` to find out inconsistent CUDA versions. +In the output of this command, you should expect "Detectron2 CUDA Compiler", "CUDA_HOME", "PyTorch built with - CUDA" +to contain cuda libraries of the same version. + +When they are inconsistent, +you need to either install a different build of PyTorch (or build by yourself) +to match your local CUDA installation, or install a different version of CUDA to match PyTorch. +
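To see which CUDA version PyTorch itself was built with (a quick sketch to compare against `nvcc --version` and your `CUDA_HOME`):

```python
import torch
from torch.utils.cpp_extension import CUDA_HOME

# The CUDA version PyTorch was compiled with, and the toolkit that nvcc would come from.
print(torch.version.cuda, CUDA_HOME)
```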
+ + +
+ +"ImportError: cannot import name '_C'". + +
Please build and install detectron2 following the instructions above.

If you are running code from detectron2's root directory, `cd` to a different directory first;
otherwise you may not be importing the detectron2 that you installed.
+ +
+ +ONNX conversion segfault after some "TraceWarning". + +
The ONNX package was compiled with a compiler that is too old.

Please build and install ONNX from its source code, using a compiler
whose version is closer to the one used to build PyTorch (available via `torch.__config__.show()`).
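For reference, that build information can be printed directly (a minimal sketch):

```python
import torch

# Prints PyTorch build information, including the compiler version,
# to help pick a similar compiler when rebuilding ONNX from source.
print(torch.__config__.show())
```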
diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..d4836895578c791dffd78d07d83a72a961e270a4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, +and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by +the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all +other entities that control, are controlled by, or are under common +control with that entity. For the purposes of this definition, +"control" means (i) the power, direct or indirect, to cause the +direction or management of such entity, whether by contract or +otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity +exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, +including but not limited to software source code, documentation +source, and configuration files. + +"Object" form shall mean any form resulting from mechanical +transformation or translation of a Source form, including but +not limited to compiled object code, generated documentation, +and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or +Object form, made available under the License, as indicated by a +copyright notice that is included in or attached to the work +(an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object +form, that is based on (or derived from) the Work and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. For the purposes +of this License, Derivative Works shall not include works that remain +separable from, or merely link (or bind by name) to the interfaces of, +the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including +the original version of the Work and any modifications or additions +to that Work or Derivative Works thereof, that is intentionally +submitted to Licensor for inclusion in the Work by the copyright owner +or by an individual or Legal Entity authorized to submit on behalf of +the copyright owner. For the purposes of this definition, "submitted" +means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, +and issue tracking systems that are managed by, or on behalf of, the +Licensor for the purpose of discussing and improving the Work, but +excluding communication that is conspicuously marked or otherwise +designated in writing by the copyright owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity +on behalf of whom a Contribution has been received by Licensor and +subsequently incorporated within the Work. + +2. Grant of Copyright License. 
Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the +Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of +this License, each Contributor hereby grants to You a perpetual, +worldwide, non-exclusive, no-charge, royalty-free, irrevocable +(except as stated in this section) patent license to make, have made, +use, offer to sell, sell, import, and otherwise transfer the Work, +where such license applies only to those patent claims licensable +by such Contributor that are necessarily infringed by their +Contribution(s) alone or by combination of their Contribution(s) +with the Work to which such Contribution(s) was submitted. If You +institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work +or a Contribution incorporated within the Work constitutes direct +or contributory patent infringement, then any patent licenses +granted to You under this License for that Work shall terminate +as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the +Work or Derivative Works thereof in any medium, with or without +modifications, and in Source or Object form, provided that You +meet the following conditions: + +(a) You must give any other recipients of the Work or +Derivative Works a copy of this License; and + +(b) You must cause any modified files to carry prominent notices +stating that You changed the files; and + +(c) You must retain, in the Source form of any Derivative Works +that You distribute, all copyright, patent, trademark, and +attribution notices from the Source form of the Work, +excluding those notices that do not pertain to any part of +the Derivative Works; and + +(d) If the Work includes a "NOTICE" text file as part of its +distribution, then any Derivative Works that You distribute must +include a readable copy of the attribution notices contained +within such NOTICE file, excluding those notices that do not +pertain to any part of the Derivative Works, in at least one +of the following places: within a NOTICE text file distributed +as part of the Derivative Works; within the Source form or +documentation, if provided along with the Derivative Works; or, +within a display generated by the Derivative Works, if and +wherever such third-party notices normally appear. The contents +of the NOTICE file are for informational purposes only and +do not modify the License. You may add Your own attribution +notices within Derivative Works that You distribute, alongside +or as an addendum to the NOTICE text from the Work, provided +that such additional attribution notices cannot be construed +as modifying the License. + +You may add Your own copyright statement to Your modifications and +may provide additional or different license terms and conditions +for use, reproduction, or distribution of Your modifications, or +for any such Derivative Works as a whole, provided Your use, +reproduction, and distribution of the Work otherwise complies with +the conditions stated in this License. + +5. Submission of Contributions. 
Unless You explicitly state otherwise, +any Contribution intentionally submitted for inclusion in the Work +by You to the Licensor shall be under the terms and conditions of +this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify +the terms of any separate license agreement you may have executed +with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade +names, trademarks, service marks, or product names of the Licensor, +except as required for reasonable and customary use in describing the +origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or +agreed to in writing, Licensor provides the Work (and each +Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +implied, including, without limitation, any warranties or conditions +of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A +PARTICULAR PURPOSE. You are solely responsible for determining the +appropriateness of using or redistributing the Work and assume any +risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, +whether in tort (including negligence), contract, or otherwise, +unless required by applicable law (such as deliberate and grossly +negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, +incidental, or consequential damages of any character arising as a +result of this License or out of the use or inability to use the +Work (including but not limited to damages for loss of goodwill, +work stoppage, computer failure or malfunction, or any and all +other commercial damages or losses), even if such Contributor +has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing +the Work or Derivative Works thereof, You may choose to offer, +and charge a fee for, acceptance of support, warranty, indemnity, +or other liability obligations and/or rights consistent with this +License. However, in accepting such obligations, You may act only +on Your own behalf and on Your sole responsibility, not on behalf +of any other Contributor, and only if You agree to indemnify, +defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason +of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + +To apply the Apache License to your work, attach the following +boilerplate notice, with the fields enclosed by brackets "[]" +replaced with your own identifying information. (Don't include +the brackets!) The text should be enclosed in the appropriate +comment syntax for the file format. We also recommend that a +file or class name and description of purpose be included on the +same "printed page" as the copyright notice for easier +identification within third-party archives. + +Copyright 2019 - present, Facebook, Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md new file mode 100644 index 0000000000000000000000000000000000000000..07b81ffffa37d97b10f8d39f934b9f62bcb51cc1 --- /dev/null +++ b/MODEL_ZOO.md @@ -0,0 +1,903 @@ +# Detectron2 Model Zoo and Baselines + +## Introduction + +This file documents a large collection of baselines trained +with detectron2 in Sep-Oct, 2019. +All numbers were obtained on [Big Basin](https://engineering.fb.com/data-center-engineering/introducing-big-basin-our-next-generation-ai-hardware/) +servers with 8 NVIDIA V100 GPUs & NVLink. The software in use were PyTorch 1.3, CUDA 9.2, cuDNN 7.4.2 or 7.6.3. +You can access these models from code using [detectron2.model_zoo](https://detectron2.readthedocs.io/modules/model_zoo.html) APIs. + +In addition to these official baseline models, you can find more models in [projects/](projects/). + +#### How to Read the Tables +* The "Name" column contains a link to the config file. Running `tools/train_net.py` with this config file + and 8 GPUs will reproduce the model. +* Training speed is averaged across the entire training. + We keep updating the speed with latest version of detectron2/pytorch/etc., + so they might be different from the `metrics` file. + Training speed for multi-machine jobs is not provided. +* Inference speed is measured by `tools/train_net.py --eval-only`, or [inference_on_dataset()](https://detectron2.readthedocs.io/modules/evaluation.html#detectron2.evaluation.inference_on_dataset), + with batch size 1 in detectron2 directly. + Measuring it with your own code will likely introduce other overhead. + Actual deployment in production should in general be faster than the given inference + speed due to more optimizations. +* The *model id* column is provided for ease of reference. + To check downloaded file integrity, any model on this page contains its md5 prefix in its file name. +* Training curves and other statistics can be found in `metrics` for each model. + +#### Common Settings for COCO Models +* All COCO models were trained on `train2017` and evaluated on `val2017`. +* The default settings are __not directly comparable__ with Detectron's standard settings. + For example, our default training data augmentation uses scale jittering in addition to horizontal flipping. + + To make fair comparisons with Detectron's settings, see + [Detectron1-Comparisons](configs/Detectron1-Comparisons/) for accuracy comparison, + and [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html) + for speed comparison. +* For Faster/Mask R-CNN, we provide baselines based on __3 different backbone combinations__: + * __FPN__: Use a ResNet+FPN backbone with standard conv and FC heads for mask and box prediction, + respectively. It obtains the best + speed/accuracy tradeoff, but the other two are still useful for research. + * __C4__: Use a ResNet conv4 backbone with conv5 head. The original baseline in the Faster R-CNN paper. + * __DC5__ (Dilated-C5): Use a ResNet conv5 backbone with dilations in conv5, and standard conv and FC heads + for mask and box prediction, respectively. + This is used by the Deformable ConvNet paper. 
+* Most models are trained with the 3x schedule (~37 COCO epochs). + Although 1x models are heavily under-trained, we provide some ResNet-50 models with the 1x (~12 COCO epochs) + training schedule for comparison when doing quick research iteration. + +#### ImageNet Pretrained Models + +We provide backbone models pretrained on ImageNet-1k dataset. +These models have __different__ format from those provided in Detectron: we do not fuse BatchNorm into an affine layer. +* [R-50.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-50.pkl): converted copy of [MSRA's original ResNet-50](https://github.com/KaimingHe/deep-residual-networks) model. +* [R-101.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/MSRA/R-101.pkl): converted copy of [MSRA's original ResNet-101](https://github.com/KaimingHe/deep-residual-networks) model. +* [X-101-32x8d.pkl](https://dl.fbaipublicfiles.com/detectron2/ImageNetPretrained/FAIR/X-101-32x8d.pkl): ResNeXt-101-32x8d model trained with Caffe2 at FB. + +Pretrained models in Detectron's format can still be used. For example: +* [X-152-32x8d-IN5k.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl): + ResNeXt-152-32x8d model trained on ImageNet-5k with Caffe2 at FB (see ResNeXt paper for details on ImageNet-5k). +* [R-50-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47261647/R-50-GN.pkl): + ResNet-50 with Group Normalization. +* [R-101-GN.pkl](https://dl.fbaipublicfiles.com/detectron/ImageNetPretrained/47592356/R-101-GN.pkl): + ResNet-101 with Group Normalization. + +Torchvision's ResNet models can be used after converted by [this script](tools/convert-torchvision-to-d2.py). + +#### License + +All models available for download through this document are licensed under the +[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/). + +### COCO Object Detection Baselines + +#### Faster R-CNN: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- |
| R50-C4 | 1x | 0.551 | 0.102 | 4.8 | 35.7 | 137257644 | model \| metrics |
| R50-DC5 | 1x | 0.380 | 0.068 | 5.0 | 37.3 | 137847829 | model \| metrics |
| R50-FPN | 1x | 0.210 | 0.038 | 3.0 | 37.9 | 137257794 | model \| metrics |
| R50-C4 | 3x | 0.543 | 0.104 | 4.8 | 38.4 | 137849393 | model \| metrics |
| R50-DC5 | 3x | 0.378 | 0.070 | 5.0 | 39.0 | 137849425 | model \| metrics |
| R50-FPN | 3x | 0.209 | 0.038 | 3.0 | 40.2 | 137849458 | model \| metrics |
| R101-C4 | 3x | 0.619 | 0.139 | 5.9 | 41.1 | 138204752 | model \| metrics |
| R101-DC5 | 3x | 0.452 | 0.086 | 6.1 | 40.6 | 138204841 | model \| metrics |
| R101-FPN | 3x | 0.286 | 0.051 | 4.1 | 42.0 | 137851257 | model \| metrics |
| X101-FPN | 3x | 0.638 | 0.098 | 6.7 | 43.0 | 139173657 | model \| metrics |

#### RetinaNet:
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- |
| R50 | 1x | 0.200 | 0.055 | 3.9 | 36.5 | 137593951 | model \| metrics |
| R50 | 3x | 0.201 | 0.055 | 3.9 | 37.9 | 137849486 | model \| metrics |
| R101 | 3x | 0.280 | 0.068 | 5.1 | 39.9 | 138363263 | model \| metrics |

#### RPN & Fast R-CNN:
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | prop. AR | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| RPN R50-C4 | 1x | 0.130 | 0.034 | 1.5 | | 51.6 | 137258005 | model \| metrics |
| RPN R50-FPN | 1x | 0.186 | 0.032 | 2.7 | | 58.0 | 137258492 | model \| metrics |
| Fast R-CNN R50-FPN | 1x | 0.140 | 0.029 | 2.6 | 37.8 | | 137635226 | model \| metrics |

### COCO Instance Segmentation Baselines with Mask R-CNN
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | mask AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| R50-C4 | 1x | 0.584 | 0.110 | 5.2 | 36.8 | 32.2 | 137259246 | model \| metrics |
| R50-DC5 | 1x | 0.471 | 0.076 | 6.5 | 38.3 | 34.2 | 137260150 | model \| metrics |
| R50-FPN | 1x | 0.261 | 0.043 | 3.4 | 38.6 | 35.2 | 137260431 | model \| metrics |
| R50-C4 | 3x | 0.575 | 0.111 | 5.2 | 39.8 | 34.4 | 137849525 | model \| metrics |
| R50-DC5 | 3x | 0.470 | 0.076 | 6.5 | 40.0 | 35.9 | 137849551 | model \| metrics |
| R50-FPN | 3x | 0.261 | 0.043 | 3.4 | 41.0 | 37.2 | 137849600 | model \| metrics |
| R101-C4 | 3x | 0.652 | 0.145 | 6.3 | 42.6 | 36.7 | 138363239 | model \| metrics |
| R101-DC5 | 3x | 0.545 | 0.092 | 7.6 | 41.9 | 37.3 | 138363294 | model \| metrics |
| R101-FPN | 3x | 0.340 | 0.056 | 4.6 | 42.9 | 38.6 | 138205316 | model \| metrics |
| X101-FPN | 3x | 0.690 | 0.103 | 7.2 | 44.3 | 39.5 | 139653917 | model \| metrics |

### COCO Person Keypoint Detection Baselines with Keypoint R-CNN
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | kp. AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| R50-FPN | 1x | 0.315 | 0.072 | 5.0 | 53.6 | 64.0 | 137261548 | model \| metrics |
| R50-FPN | 3x | 0.316 | 0.066 | 5.0 | 55.4 | 65.5 | 137849621 | model \| metrics |
| R101-FPN | 3x | 0.390 | 0.076 | 6.1 | 56.4 | 66.1 | 138363331 | model \| metrics |
| X101-FPN | 3x | 0.738 | 0.121 | 8.7 | 57.3 | 66.0 | 139686956 | model \| metrics |

### COCO Panoptic Segmentation Baselines with Panoptic FPN
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | mask AP | PQ | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| R50-FPN | 1x | 0.304 | 0.053 | 4.8 | 37.6 | 34.7 | 39.4 | 139514544 | model \| metrics |
| R50-FPN | 3x | 0.302 | 0.053 | 4.8 | 40.0 | 36.5 | 41.5 | 139514569 | model \| metrics |
| R101-FPN | 3x | 0.392 | 0.066 | 6.0 | 42.4 | 38.5 | 43.0 | 139514519 | model \| metrics |


### LVIS Instance Segmentation Baselines with Mask R-CNN

Mask R-CNN baselines on the [LVIS dataset](https://lvisdataset.org), v0.5.
These baselines are described in Table 3(c) of the [LVIS paper](https://arxiv.org/abs/1908.03195).

NOTE: the 1x schedule here has the same number of __iterations__ as the COCO 1x baselines.
This is roughly 24 epochs of LVISv0.5 data.
The final results of these configs have large variance across different runs.
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | mask AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| R50-FPN | 1x | 0.292 | 0.107 | 7.1 | 23.6 | 24.4 | 144219072 | model \| metrics |
| R101-FPN | 1x | 0.371 | 0.114 | 7.8 | 25.6 | 25.9 | 144219035 | model \| metrics |
| X101-FPN | 1x | 0.712 | 0.151 | 10.2 | 26.7 | 27.1 | 144219108 | model \| metrics |


### Cityscapes & Pascal VOC Baselines

Simple baselines for
* Mask R-CNN on Cityscapes instance segmentation (initialized from COCO pre-training, then trained on Cityscapes fine annotations only)
* Faster R-CNN on PASCAL VOC object detection (trained on VOC 2007 train+val + VOC 2012 train+val, tested on VOC 2007 using 11-point interpolated AP)
| Name | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | box AP50 | mask AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| R50-FPN, Cityscapes | 0.240 | 0.078 | 4.4 | | | 36.5 | 142423278 | model \| metrics |
| R50-C4, VOC | 0.537 | 0.081 | 4.8 | 51.9 | 80.3 | | 142202221 | model \| metrics |


### Other Settings

Ablations for Deformable Conv and Cascade R-CNN:
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | mask AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Baseline R50-FPN | 1x | 0.261 | 0.043 | 3.4 | 38.6 | 35.2 | 137260431 | model \| metrics |
| Deformable Conv | 1x | 0.342 | 0.048 | 3.5 | 41.5 | 37.5 | 138602867 | model \| metrics |
| Cascade R-CNN | 1x | 0.317 | 0.052 | 4.0 | 42.1 | 36.4 | 138602847 | model \| metrics |
| Baseline R50-FPN | 3x | 0.261 | 0.043 | 3.4 | 41.0 | 37.2 | 137849600 | model \| metrics |
| Deformable Conv | 3x | 0.349 | 0.047 | 3.5 | 42.7 | 38.5 | 144998336 | model \| metrics |
| Cascade R-CNN | 3x | 0.328 | 0.053 | 4.0 | 44.3 | 38.5 | 144998488 | model \| metrics |


Ablations for normalization methods, and a few models trained from scratch following [Rethinking ImageNet Pre-training](https://arxiv.org/abs/1811.08883).
(Note: The baseline uses `2fc` head while the others use [`4conv1fc` head](https://arxiv.org/abs/1803.08494))
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | mask AP | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Baseline R50-FPN | 3x | 0.261 | 0.043 | 3.4 | 41.0 | 37.2 | 137849600 | model \| metrics |
| GN | 3x | 0.356 | 0.069 | 7.3 | 42.6 | 38.6 | 138602888 | model \| metrics |
| SyncBN | 3x | 0.371 | 0.053 | 5.5 | 41.9 | 37.8 | 169527823 | model \| metrics |
| GN (from scratch) | 3x | 0.400 | 0.069 | 9.8 | 39.9 | 36.6 | 138602908 | model \| metrics |
| GN (from scratch) | 9x | N/A | 0.070 | 9.8 | 43.7 | 39.6 | 183808979 | model \| metrics |
| SyncBN (from scratch) | 9x | N/A | 0.055 | 7.2 | 43.6 | 39.3 | 184226666 | model \| metrics |


A few very large models trained for a long time, for demo purposes. They are trained using multiple machines:
| Name | inference time (s/im) | train mem (GB) | box AP | mask AP | PQ | model id | download |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Panoptic FPN R101 | 0.107 | 11.4 | 47.4 | 41.3 | 46.1 | 139797668 | model \| metrics |
| Mask R-CNN X152 | 0.242 | 15.1 | 50.2 | 44.0 | | 18131413 | model \| metrics |
| above + test-time aug. | | | 51.9 | 45.9 | | | |
diff --git a/README.md b/README.md index ccc9bd67dc5c467859102d53d54c5ce851273bdd..1fbb95b39ce9e9c0eab83079319a9298fca438b1 100644 --- a/README.md +++ b/README.md @@ -1 +1,56 @@ -xx + + +Detectron2 is Facebook AI Research's next generation software system +that implements state-of-the-art object detection algorithms. +It is a ground-up rewrite of the previous version, +[Detectron](https://github.com/facebookresearch/Detectron/), +and it originates from [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark/). + +
+ +
+ +### What's New +* It is powered by the [PyTorch](https://pytorch.org) deep learning framework. +* Includes more features such as panoptic segmentation, densepose, Cascade R-CNN, rotated bounding boxes, etc. +* Can be used as a library to support [different projects](projects/) on top of it. + We'll open source more research projects in this way. +* It [trains much faster](https://detectron2.readthedocs.io/notes/benchmarks.html). + +See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-/) +to see more demos and learn about detectron2. + +## Installation + +See [INSTALL.md](INSTALL.md). + +## Quick Start + +See [GETTING_STARTED.md](GETTING_STARTED.md), +or the [Colab Notebook](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5). + +Learn more at our [documentation](https://detectron2.readthedocs.org). +And see [projects/](projects/) for some projects that are built on top of detectron2. + +## Model Zoo and Baselines + +We provide a large set of baseline results and trained models available for download in the [Detectron2 Model Zoo](MODEL_ZOO.md). + + +## License + +Detectron2 is released under the [Apache 2.0 license](LICENSE). + +## Citing Detectron2 + +If you use Detectron2 in your research or wish to refer to the baseline results published in the [Model Zoo](MODEL_ZOO.md), please use the following BibTeX entry. + +```BibTeX +@misc{wu2019detectron2, + author = {Yuxin Wu and Alexander Kirillov and Francisco Massa and + Wan-Yen Lo and Ross Girshick}, + title = {Detectron2}, + howpublished = {\url{https://github.com/facebookresearch/detectron2}}, + year = {2019} +} +``` diff --git a/configs/Base-RCNN-C4.yaml b/configs/Base-RCNN-C4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fbf34a0ea57a587e09997edd94c4012d69d0b6ad --- /dev/null +++ b/configs/Base-RCNN-C4.yaml @@ -0,0 +1,18 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + RPN: + PRE_NMS_TOPK_TEST: 6000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "Res5ROIHeads" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +VERSION: 2 diff --git a/configs/Base-RCNN-DilatedC5.yaml b/configs/Base-RCNN-DilatedC5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c0d6d16bdaf532f09e4976f0aa240a49e748da27 --- /dev/null +++ b/configs/Base-RCNN-DilatedC5.yaml @@ -0,0 +1,31 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + RESNETS: + OUT_FEATURES: ["res5"] + RES5_DILATION: 2 + RPN: + IN_FEATURES: ["res5"] + PRE_NMS_TOPK_TEST: 6000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "StandardROIHeads" + IN_FEATURES: ["res5"] + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_MASK_HEAD: + NAME: "MaskRCNNConvUpsampleHead" + NUM_CONV: 4 + POOLER_RESOLUTION: 14 +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +VERSION: 2 diff --git a/configs/Base-RCNN-FPN.yaml b/configs/Base-RCNN-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3e020f2e7b2f26765be317f907126a1556621abf --- /dev/null +++ b/configs/Base-RCNN-FPN.yaml @@ -0,0 +1,42 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: 
["res2", "res3", "res4", "res5"] + FPN: + IN_FEATURES: ["res2", "res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + # Detectron1 uses 2000 proposals per-batch, + # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) + # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. + POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "StandardROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_MASK_HEAD: + NAME: "MaskRCNNConvUpsampleHead" + NUM_CONV: 4 + POOLER_RESOLUTION: 14 +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +VERSION: 2 diff --git a/configs/Base-RetinaNet.yaml b/configs/Base-RetinaNet.yaml new file mode 100644 index 0000000000000000000000000000000000000000..12ec9d2fc20cc0438f17bde2c5f6fbee9496c1b0 --- /dev/null +++ b/configs/Base-RetinaNet.yaml @@ -0,0 +1,24 @@ +MODEL: + META_ARCHITECTURE: "RetinaNet" + BACKBONE: + NAME: "build_retinanet_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: ["res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: !!python/object/apply:eval ["[[x, x * 2**(1.0/3), x * 2**(2.0/3) ] for x in [32, 64, 128, 256, 512 ]]"] + FPN: + IN_FEATURES: ["res3", "res4", "res5"] + RETINANET: + IOU_THRESHOLDS: [0.4, 0.5] + IOU_LABELS: [0, -1, 1] +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.01 # Note that RetinaNet uses a different default learning rate + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +VERSION: 2 diff --git a/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml b/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..773ac10e87c626760d00d831bf664ce9ff073c49 --- /dev/null +++ b/configs/COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,17 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + LOAD_PROPOSALS: True + RESNETS: + DEPTH: 50 + PROPOSAL_GENERATOR: + NAME: "PrecomputedProposals" +DATASETS: + TRAIN: ("coco_2017_train",) + PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_train_box_proposals_21bc3a.pkl", ) + TEST: ("coco_2017_val",) + PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) +DATALOADER: + # proposals are part of the dataset_dicts, and take a lot of RAM + NUM_WORKERS: 2 diff --git a/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db142cd671c1841b4f64cf130bee7f7954ecdd28 --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_101_C4_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: False + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git 
a/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bceb6b343618d8cd9a6c414ff9eb86ab31cc230a --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-DilatedC5.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: False + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..57a098f53ee8c54ecfa354cc96efefd890dc1b72 --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: False + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f96130105c3ba6ab393e0932870903875f5cb732 --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_50_C4_1x.yaml @@ -0,0 +1,6 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc51bce390a85ee3529ffdcebde05748e1646be0 --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_50_C4_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0fe96f57febdac5790ea4cec168fa4b97ac4807a --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml @@ -0,0 +1,6 @@ +_BASE_: "../Base-RCNN-DilatedC5.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..33fadeb87d1ef67ab2b55926b9a652ab4ac4a27d --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-DilatedC5.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3262019a1211b910d3b371569199ed1afaacf6a4 --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,6 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml b/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml new file 
mode 100644 index 0000000000000000000000000000000000000000..41395182bf5c9dd8ab1241c4414068817298d554 --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml b/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9c9b5ab77157baa581d90d9847c045c19ed6ffa3 --- /dev/null +++ b/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml @@ -0,0 +1,13 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + MASK_ON: False + WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" + PIXEL_STD: [57.375, 57.120, 58.395] + RESNETS: + STRIDE_IN_1X1: False # this is a C2 model + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml b/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4abb1b9a547957aa6afc0b29129e00f89cf98d59 --- /dev/null +++ b/configs/COCO-Detection/retinanet_R_101_FPN_3x.yaml @@ -0,0 +1,8 @@ +_BASE_: "../Base-RetinaNet.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml b/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a24ce3a9a108a8792e18c8aabfb7b712f0d3725 --- /dev/null +++ b/configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml @@ -0,0 +1,5 @@ +_BASE_: "../Base-RetinaNet.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml b/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3b5412d4a7aef1d6c3f7c1e34f94007de639b833 --- /dev/null +++ b/configs/COCO-Detection/retinanet_R_50_FPN_3x.yaml @@ -0,0 +1,8 @@ +_BASE_: "../Base-RetinaNet.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Detection/rpn_R_50_C4_1x.yaml b/configs/COCO-Detection/rpn_R_50_C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e04821156b0376ba5215d5ce5b7010a36b43e6a1 --- /dev/null +++ b/configs/COCO-Detection/rpn_R_50_C4_1x.yaml @@ -0,0 +1,10 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + META_ARCHITECTURE: "ProposalNetwork" + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 + RPN: + PRE_NMS_TOPK_TEST: 12000 + POST_NMS_TOPK_TEST: 2000 diff --git a/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml b/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc9c95203b1c3c9cd9bb9876bb8d9a5dd9b31d9a --- /dev/null +++ b/configs/COCO-Detection/rpn_R_50_FPN_1x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "ProposalNetwork" + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 + RPN: + POST_NMS_TOPK_TEST: 2000 diff --git 
a/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a94cc45a0f2aaa8c92e14871c553b736545e327 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: True + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..67b70cf4be8c19f5dc735b6f55a8690698f34b69 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-DilatedC5.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: True + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1935a302d2d0fa7f69553b3fd50b5a7082c6c0d1 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: True + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a9aeb4eac38026dbb867e799f9fd3a8d8eb3af80 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml @@ -0,0 +1,6 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..38ed867d897dfec839cbcf11a2e2dc8abb92f07c --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b13eefab2a049c48d94d5051c82ceb6dbde40579 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml @@ -0,0 +1,6 @@ +_BASE_: "../Base-RCNN-DilatedC5.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d401016358f967f6619d88b1c9bd5673a1cdeba8 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-DilatedC5.yaml" +MODEL: + WEIGHTS: 
"detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d50fb866ca7811a87b42555c7213f88e00bf6df1 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,6 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..be7d06b8e0f032ee7fcaabd7c122158518489fd2 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml b/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d14c63f74383bfc308750f51d51344398b02a239 --- /dev/null +++ b/configs/COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml @@ -0,0 +1,13 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + MASK_ON: True + WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" + PIXEL_STD: [57.375, 57.120, 58.395] + RESNETS: + STRIDE_IN_1X1: False # this is a C2 model + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml b/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e03944a42d2e497da5ceca17c8fda797dac3f82 --- /dev/null +++ b/configs/COCO-Keypoints/Base-Keypoint-RCNN-FPN.yaml @@ -0,0 +1,15 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + KEYPOINT_ON: True + ROI_HEADS: + NUM_CLASSES: 1 + ROI_BOX_HEAD: + SMOOTH_L1_BETA: 0.5 # Keypoint AP degrades (though box AP improves) when using plain L1 loss + RPN: + # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2. + # 1000 proposals per-image is found to hurt box AP. + # Therefore we increase it to 1500 per-image. 
+ POST_NMS_TOPK_TRAIN: 1500 +DATASETS: + TRAIN: ("keypoints_coco_2017_train",) + TEST: ("keypoints_coco_2017_val",) diff --git a/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9309535c57a1aa7d23297aac80a9bd78a6c79fcc --- /dev/null +++ b/configs/COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml @@ -0,0 +1,8 @@ +_BASE_: "Base-Keypoint-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7bf85cf745b53b3e7ab28fe94b7f4f9e7fe6e335 --- /dev/null +++ b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,5 @@ +_BASE_: "Base-Keypoint-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a07f243f650a497b9372501e3face75194cf0941 --- /dev/null +++ b/configs/COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml @@ -0,0 +1,8 @@ +_BASE_: "Base-Keypoint-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml b/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d4bfa20a98c0a65c6bd60e93b07e8f4b7d92a867 --- /dev/null +++ b/configs/COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml @@ -0,0 +1,12 @@ +_BASE_: "Base-Keypoint-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" + PIXEL_STD: [57.375, 57.120, 58.395] + RESNETS: + STRIDE_IN_1X1: False # this is a C2 model + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml b/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..755c12018c5db8ca456d5e7fa8cbd18d90f97527 --- /dev/null +++ b/configs/COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "PanopticFPN" + MASK_ON: True + SEM_SEG_HEAD: + LOSS_WEIGHT: 0.5 +DATASETS: + TRAIN: ("coco_2017_train_panoptic_separated",) + TEST: ("coco_2017_val_panoptic_separated",) diff --git a/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..0e01f6fb31e9b00b1857b7de3b5074184d1f4a21 --- /dev/null +++ b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml @@ -0,0 +1,8 @@ +_BASE_: "Base-Panoptic-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..6afa2c1cc92495309ed1553a17359fe5d7d6566e --- /dev/null +++ b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml @@ -0,0 +1,5 @@ +_BASE_: "Base-Panoptic-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 diff --git a/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b956b3f673e78649184fe2c50e2700b3f1f14794 --- /dev/null +++ b/configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml @@ -0,0 +1,8 @@ +_BASE_: "Base-Panoptic-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml b/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1a7aaeb961581ed9492c4cfe5a69a1eb60495b3e --- /dev/null +++ b/configs/Cityscapes/mask_rcnn_R_50_FPN.yaml @@ -0,0 +1,27 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + # WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + # For better, more stable performance initialize from COCO + WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl" + MASK_ON: True + ROI_HEADS: + NUM_CLASSES: 8 +# This is similar to the setting used in Mask R-CNN paper, Appendix A +# But there are some differences, e.g., we did not initialize the output +# layer using the corresponding classes from COCO +INPUT: + MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024) + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 1024 + MAX_SIZE_TRAIN: 2048 + MAX_SIZE_TEST: 2048 +DATASETS: + TRAIN: ("cityscapes_fine_instance_seg_train",) + TEST: ("cityscapes_fine_instance_seg_val",) +SOLVER: + BASE_LR: 0.01 + STEPS: (18000,) + MAX_ITER: 24000 + IMS_PER_BATCH: 8 +TEST: + EVAL_PERIOD: 8000 diff --git a/configs/Detectron1-Comparisons/README.md b/configs/Detectron1-Comparisons/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a90ed9e433a00b8b9f43961d7a2696d5b9013127 --- /dev/null +++ b/configs/Detectron1-Comparisons/README.md @@ -0,0 +1,83 @@ + +Detectron2 model zoo's experimental settings and a few implementation details are different from Detectron. + +The differences in implementation details are shared in +[Compatibility with Other Libraries](../../docs/notes/compatibility.md). + +The differences in model zoo's experimental settings include: +* Use scale augmentation during training. This improves AP with lower training cost. +* Use L1 loss instead of smooth L1 loss for simplicity. This sometimes improves box AP but may + affect other AP. +* Use `POOLER_SAMPLING_RATIO=0` instead of 2. This does not significantly affect AP. +* Use `ROIAlignV2`. This does not significantly affect AP. + +In this directory, we provide a few configs that __do not__ have the above changes. +They mimic Detectron's behavior as close as possible, +and provide a fair comparison of accuracy and speed against Detectron. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | mask AP | kp. AP | model id | download |
| :--- | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| Faster R-CNN | 1x | 0.219 | 0.038 | 3.1 | 36.9 | | | 137781054 | model \| metrics |
| Keypoint R-CNN | 1x | 0.313 | 0.071 | 5.0 | 53.1 | | 64.2 | 137781195 | model \| metrics |
| Mask R-CNN | 1x | 0.273 | 0.043 | 3.4 | 37.8 | 34.9 | | 137781281 | model \| metrics |
+ +## Comparisons: + +* Faster R-CNN: Detectron's AP is 36.7, similar to ours. +* Keypoint R-CNN: Detectron's AP is box 53.6, keypoint 64.2. Fixing a Detectron's + [bug](https://github.com/facebookresearch/Detectron/issues/459) lead to a drop in box AP, and can be + compensated back by some parameter tuning. +* Mask R-CNN: Detectron's AP is box 37.7, mask 33.9. We're 1 AP better in mask AP, due to more correct implementation. + +For speed comparison, see [benchmarks](https://detectron2.readthedocs.io/notes/benchmarks.html). diff --git a/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml b/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6ce77f137fa2c4e5254a62b58c18b8b76096f2aa --- /dev/null +++ b/configs/Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml @@ -0,0 +1,17 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 + # Detectron1 uses smooth L1 loss with some magic beta values. + # The defaults are changed to L1 loss in Detectron2. + RPN: + SMOOTH_L1_BETA: 0.1111 + ROI_BOX_HEAD: + SMOOTH_L1_BETA: 1.0 + POOLER_SAMPLING_RATIO: 2 + POOLER_TYPE: "ROIAlign" +INPUT: + # no scale augmentation + MIN_SIZE_TRAIN: (800, ) diff --git a/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml b/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..aacf868ba5290c752031c130a2081af48afc0808 --- /dev/null +++ b/configs/Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,27 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + KEYPOINT_ON: True + RESNETS: + DEPTH: 50 + ROI_HEADS: + NUM_CLASSES: 1 + ROI_KEYPOINT_HEAD: + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + POOLER_TYPE: "ROIAlign" + # Detectron1 uses smooth L1 loss with some magic beta values. + # The defaults are changed to L1 loss in Detectron2. + ROI_BOX_HEAD: + SMOOTH_L1_BETA: 1.0 + POOLER_SAMPLING_RATIO: 2 + POOLER_TYPE: "ROIAlign" + RPN: + SMOOTH_L1_BETA: 0.1111 + # Detectron1 uses 2000 proposals per-batch, but this option is per-image in detectron2 + # 1000 proposals per-image is found to hurt box AP. + # Therefore we increase it to 1500 per-image. + POST_NMS_TOPK_TRAIN: 1500 +DATASETS: + TRAIN: ("keypoints_coco_2017_train",) + TEST: ("keypoints_coco_2017_val",) diff --git a/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml b/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ea86a8d8e2cd3e51cbc7311b0d00710c07d01f6 --- /dev/null +++ b/configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml @@ -0,0 +1,20 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + # Detectron1 uses smooth L1 loss with some magic beta values. + # The defaults are changed to L1 loss in Detectron2. 
+ RPN: + SMOOTH_L1_BETA: 0.1111 + ROI_BOX_HEAD: + SMOOTH_L1_BETA: 1.0 + POOLER_SAMPLING_RATIO: 2 + POOLER_TYPE: "ROIAlign" + ROI_MASK_HEAD: + POOLER_SAMPLING_RATIO: 2 + POOLER_TYPE: "ROIAlign" +INPUT: + # no scale augmentation + MIN_SIZE_TRAIN: (800, ) diff --git a/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0c3a1bbc0a09e1384de522f30c443ba1e36fafa --- /dev/null +++ b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml @@ -0,0 +1,19 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: True + RESNETS: + DEPTH: 101 + ROI_HEADS: + NUM_CLASSES: 1230 + SCORE_THRESH_TEST: 0.0001 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +DATASETS: + TRAIN: ("lvis_v0.5_train",) + TEST: ("lvis_v0.5_val",) +TEST: + DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 +DATALOADER: + SAMPLER_TRAIN: "RepeatFactorTrainingSampler" + REPEAT_THRESHOLD: 0.001 diff --git a/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..64b4caa4ef2b284782367ea702e1ae6653472630 --- /dev/null +++ b/configs/LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,19 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + ROI_HEADS: + NUM_CLASSES: 1230 + SCORE_THRESH_TEST: 0.0001 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +DATASETS: + TRAIN: ("lvis_v0.5_train",) + TEST: ("lvis_v0.5_val",) +TEST: + DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 +DATALOADER: + SAMPLER_TRAIN: "RepeatFactorTrainingSampler" + REPEAT_THRESHOLD: 0.001 diff --git a/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml b/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c8b822c6c006ba642f4caf9b55e7983f6797427a --- /dev/null +++ b/configs/LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml @@ -0,0 +1,23 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/FAIR/X-101-32x8d.pkl" + PIXEL_STD: [57.375, 57.120, 58.395] + MASK_ON: True + RESNETS: + STRIDE_IN_1X1: False # this is a C2 model + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + DEPTH: 101 + ROI_HEADS: + NUM_CLASSES: 1230 + SCORE_THRESH_TEST: 0.0001 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +DATASETS: + TRAIN: ("lvis_v0.5_train",) + TEST: ("lvis_v0.5_val",) +TEST: + DETECTIONS_PER_IMAGE: 300 # LVIS allows up to 300 +DATALOADER: + SAMPLER_TRAIN: "RepeatFactorTrainingSampler" + REPEAT_THRESHOLD: 0.001 diff --git a/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml b/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abb33b618932e94b66239945ac892f4c84a6e8f8 --- /dev/null +++ b/configs/Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml @@ -0,0 +1,12 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + ROI_HEADS: + NAME: CascadeROIHeads + ROI_BOX_HEAD: + CLS_AGNOSTIC_BBOX_REG: True + RPN: + POST_NMS_TOPK_TRAIN: 2000 diff --git a/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml b/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..e2201ad5c46ded91ccfa47b7698a521625c5e447 --- /dev/null +++ b/configs/Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml @@ -0,0 +1,15 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + ROI_HEADS: + NAME: CascadeROIHeads + ROI_BOX_HEAD: + CLS_AGNOSTIC_BBOX_REG: True + RPN: + POST_NMS_TOPK_TRAIN: 2000 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml b/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc117f6b5e3e51558ec2f01b73c5365622e5ce25 --- /dev/null +++ b/configs/Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml @@ -0,0 +1,36 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + MASK_ON: True + WEIGHTS: "catalog://ImageNetPretrained/FAIR/X-152-32x8d-IN5k" + RESNETS: + STRIDE_IN_1X1: False # this is a C2 model + NUM_GROUPS: 32 + WIDTH_PER_GROUP: 8 + DEPTH: 152 + DEFORM_ON_PER_STAGE: [False, True, True, True] + ROI_HEADS: + NAME: "CascadeROIHeads" + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_CONV: 4 + NUM_FC: 1 + NORM: "GN" + CLS_AGNOSTIC_BBOX_REG: True + ROI_MASK_HEAD: + NUM_CONV: 8 + NORM: "GN" + RPN: + POST_NMS_TOPK_TRAIN: 2000 +SOLVER: + IMS_PER_BATCH: 128 + STEPS: (35000, 45000) + MAX_ITER: 50000 + BASE_LR: 0.16 +INPUT: + MIN_SIZE_TRAIN: (640, 864) + MIN_SIZE_TRAIN_SAMPLING: "range" + MAX_SIZE_TRAIN: 1440 + CROP: + ENABLED: True +TEST: + EVAL_PERIOD: 2500 diff --git a/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml b/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4c3b767ff473bbab7225cc8a4a92608543d78246 --- /dev/null +++ b/configs/Misc/mask_rcnn_R_50_FPN_1x_cls_agnostic.yaml @@ -0,0 +1,10 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + ROI_BOX_HEAD: + CLS_AGNOSTIC_BBOX_REG: True + ROI_MASK_HEAD: + CLS_AGNOSTIC_MASK: True diff --git a/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml b/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..04ff988d073ef9169ee4ca2cbce0d6f030c15232 --- /dev/null +++ b/configs/Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml @@ -0,0 +1,8 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 + DEFORM_MODULATED: False diff --git a/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml b/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml new file mode 100644 index 0000000000000000000000000000000000000000..68c0ca58d7df97ca728c339da0ca9828fe6be318 --- /dev/null +++ b/configs/Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml @@ -0,0 +1,11 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + DEFORM_ON_PER_STAGE: [False, True, True, True] # on Res3,Res4,Res5 + DEFORM_MODULATED: False +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml b/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..74d274e5a529b5a8afe186940868f9d48c6112b3 --- /dev/null +++ 
b/configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml @@ -0,0 +1,21 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-50-GN" + MASK_ON: True + RESNETS: + DEPTH: 50 + NORM: "GN" + STRIDE_IN_1X1: False + FPN: + NORM: "GN" + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_CONV: 4 + NUM_FC: 1 + NORM: "GN" + ROI_MASK_HEAD: + NORM: "GN" +SOLVER: + # 3x schedule + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml b/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..11ebb076ba529f26c71a0d972e96ca4c2d6a830b --- /dev/null +++ b/configs/Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml @@ -0,0 +1,24 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + NORM: "SyncBN" + STRIDE_IN_1X1: True + FPN: + NORM: "SyncBN" + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_CONV: 4 + NUM_FC: 1 + NORM: "SyncBN" + ROI_MASK_HEAD: + NORM: "SyncBN" +SOLVER: + # 3x schedule + STEPS: (210000, 250000) + MAX_ITER: 270000 +TEST: + PRECISE_BN: + ENABLED: True diff --git a/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml b/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..34016cea3ca9d7fb69ef4fe01d6b47ee8690a13b --- /dev/null +++ b/configs/Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml @@ -0,0 +1,26 @@ +# A large PanopticFPN for demo purposes. +# Use GN on backbone to support semantic seg. +# Use Cascade + Deform Conv to improve localization. +_BASE_: "../COCO-PanopticSegmentation/Base-Panoptic-FPN.yaml" +MODEL: + WEIGHTS: "catalog://ImageNetPretrained/FAIR/R-101-GN" + RESNETS: + DEPTH: 101 + NORM: "GN" + DEFORM_ON_PER_STAGE: [False, True, True, True] + STRIDE_IN_1X1: False + FPN: + NORM: "GN" + ROI_HEADS: + NAME: CascadeROIHeads + ROI_BOX_HEAD: + CLS_AGNOSTIC_BBOX_REG: True + ROI_MASK_HEAD: + NORM: "GN" + RPN: + POST_NMS_TOPK_TRAIN: 2000 +SOLVER: + STEPS: (105000, 125000) + MAX_ITER: 135000 + IMS_PER_BATCH: 32 + BASE_LR: 0.04 diff --git a/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml b/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3400288cde242fcf66eef7f63b5a9165ca663c5 --- /dev/null +++ b/configs/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml @@ -0,0 +1,13 @@ +_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml" +MODEL: + # Train from random initialization. + WEIGHTS: "" + # It makes sense to divide by STD when training from scratch + # But it seems to make no difference on the results and C2's models didn't do this. + # So we keep things consistent with C2. + # PIXEL_STD: [57.375, 57.12, 58.395] + MASK_ON: True + BACKBONE: + FREEZE_AT: 0 +# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 +# to learn what you need for training from scratch. 
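The Misc configs above are thin overrides: each names a `_BASE_` file and changes only a handful of keys, and the merged result is what the training and inference scripts actually consume. As a rough illustration (not part of this diff), the sketch below loads one of these configs with the same `get_cfg()`/`merge_from_file()` calls that `demo.py` uses later in this change; the relative path assumes it is run from the repository root.

```python
# Sketch: inspect a merged Misc config (assumes detectron2 is installed and the
# working directory is the repository root, so the relative path resolves).
from detectron2.config import get_cfg

cfg = get_cfg()  # start from detectron2's defaults
# merge_from_file resolves the _BASE_ chain (here ../Base-RCNN-FPN.yaml) before
# applying the overrides shown in the YAML above.
cfg.merge_from_file("configs/Misc/mask_rcnn_R_50_FPN_3x_gn.yaml")
print(cfg.MODEL.RESNETS.NORM)   # "GN", set by this config
print(cfg.SOLVER.MAX_ITER)      # 270000, the 3x schedule
```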
diff --git a/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml b/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d90c9ff0ef4573252ee165b4c958ec5f74178176 --- /dev/null +++ b/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_gn.yaml @@ -0,0 +1,19 @@ +_BASE_: "mask_rcnn_R_50_FPN_3x_gn.yaml" +MODEL: + PIXEL_STD: [57.375, 57.12, 58.395] + WEIGHTS: "" + MASK_ON: True + RESNETS: + STRIDE_IN_1X1: False + BACKBONE: + FREEZE_AT: 0 +SOLVER: + # 9x schedule + IMS_PER_BATCH: 64 # 4x the standard + STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k + MAX_ITER: 202500 # 90k * 9 / 4 + BASE_LR: 0.08 +TEST: + EVAL_PERIOD: 2500 +# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 +# to learn what you need for training from scratch. diff --git a/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml b/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml new file mode 100644 index 0000000000000000000000000000000000000000..60d4e42330e396a1901437df8e17b262d5ad547a --- /dev/null +++ b/configs/Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn.yaml @@ -0,0 +1,19 @@ +_BASE_: "mask_rcnn_R_50_FPN_3x_syncbn.yaml" +MODEL: + PIXEL_STD: [57.375, 57.12, 58.395] + WEIGHTS: "" + MASK_ON: True + RESNETS: + STRIDE_IN_1X1: False + BACKBONE: + FREEZE_AT: 0 +SOLVER: + # 9x schedule + IMS_PER_BATCH: 64 # 4x the standard + STEPS: (187500, 197500) # last 60/4==15k and last 20/4==5k + MAX_ITER: 202500 # 90k * 9 / 4 + BASE_LR: 0.08 +TEST: + EVAL_PERIOD: 2500 +# NOTE: Please refer to Rethinking ImageNet Pre-training https://arxiv.org/abs/1811.08883 +# to learn what you need for training from scratch. diff --git a/configs/Misc/semantic_R_50_FPN_1x.yaml b/configs/Misc/semantic_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ac256e1372770ab3d9ae522c962de0fd0dbceeb5 --- /dev/null +++ b/configs/Misc/semantic_R_50_FPN_1x.yaml @@ -0,0 +1,11 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "SemanticSegmentor" + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +DATASETS: + TRAIN: ("coco_2017_train_panoptic_stuffonly",) + TEST: ("coco_2017_val_panoptic_stuffonly",) +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) diff --git a/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml b/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea2a6baaebd1a186db18f2904430ffb25901898e --- /dev/null +++ b/configs/PascalVOC-Detection/faster_rcnn_R_50_C4.yaml @@ -0,0 +1,18 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 + ROI_HEADS: + NUM_CLASSES: 20 +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + MIN_SIZE_TEST: 800 +DATASETS: + TRAIN: ('voc_2007_trainval', 'voc_2012_trainval') + TEST: ('voc_2007_test',) +SOLVER: + STEPS: (12000, 16000) + MAX_ITER: 18000 # 17.4 epochs + WARMUP_ITERS: 100 diff --git a/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml b/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e554cab18a358a27b630c1ab0c2359666b0e1514 --- /dev/null +++ b/configs/PascalVOC-Detection/faster_rcnn_R_50_FPN.yaml @@ -0,0 +1,18 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 + ROI_HEADS: + NUM_CLASSES: 
20 +INPUT: + MIN_SIZE_TRAIN: (480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800) + MIN_SIZE_TEST: 800 +DATASETS: + TRAIN: ('voc_2007_trainval', 'voc_2012_trainval') + TEST: ('voc_2007_test',) +SOLVER: + STEPS: (12000, 16000) + MAX_ITER: 18000 # 17.4 epochs + WARMUP_ITERS: 100 diff --git a/configs/quick_schedules/README.md b/configs/quick_schedules/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a278199b8557a1e2fb341fe6757786a6cecb82b3 --- /dev/null +++ b/configs/quick_schedules/README.md @@ -0,0 +1 @@ +These are quick configs for performance or accuracy regression tracking purposes. diff --git a/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fc5a4116cb096278823049c1f823e99f8e16e97e --- /dev/null +++ b/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml" +MODEL: + WEIGHTS: "detectron2://Misc/cascade_mask_rcnn_R_50_FPN_3x/144998488/model_final_480dd8.pkl" +DATASETS: + TEST: ("coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 50.18, 0.02], ["segm", "AP", 43.87, 0.02]] diff --git a/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e41a0fe7ffe9c3531741df49e546aa45cfe4fdee --- /dev/null +++ b/configs/quick_schedules/cascade_mask_rcnn_R_50_FPN_instant_test.yaml @@ -0,0 +1,11 @@ +_BASE_: "../Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml" +DATASETS: + TRAIN: ("coco_2017_val_100",) + TEST: ("coco_2017_val_100",) +SOLVER: + BASE_LR: 0.005 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2f37e5e2cc2a9e195e13703e9930e67e0f9a896 --- /dev/null +++ b/configs/quick_schedules/fast_rcnn_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-Detection/fast_rcnn_R_50_FPN_1x/137635226/model_final_e5f7ce.pkl" +DATASETS: + TEST: ("coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 45.70, 0.02]] diff --git a/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..52fc0ec03c8b87ab2be1dda97bec1e8c93e6bb5c --- /dev/null +++ b/configs/quick_schedules/fast_rcnn_R_50_FPN_instant_test.yaml @@ -0,0 +1,15 @@ +_BASE_: "../COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" +DATASETS: + TRAIN: ("coco_2017_val_100",) + PROPOSAL_FILES_TRAIN: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) + TEST: ("coco_2017_val_100",) + PROPOSAL_FILES_TEST: ("detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/coco_2017_val_box_proposals_ee0dad.pkl", ) +SOLVER: + BASE_LR: 0.005 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml 
b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14cf2aa82aec52ad44e28ead0665dad811d55457 --- /dev/null +++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x/137849621/model_final_a6e10b.pkl" +DATASETS: + TEST: ("keypoints_coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 52.47, 0.02], ["keypoints", "AP", 67.36, 0.02]] diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..dc09034bdd3db9d3e0dc62a017a3883dbe79c649 --- /dev/null +++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_instant_test.yaml @@ -0,0 +1,14 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + KEYPOINT_ON: True +DATASETS: + TRAIN: ("keypoints_coco_2017_val_100",) + TEST: ("keypoints_coco_2017_val_100",) +SOLVER: + BASE_LR: 0.005 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4b92392f1c4457033ae4c87a521e339fe9e184ce --- /dev/null +++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_normalized_training_acc_test.yaml @@ -0,0 +1,30 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + KEYPOINT_ON: True + RESNETS: + DEPTH: 50 + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 256 + NUM_CLASSES: 1 + ROI_KEYPOINT_HEAD: + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: False + LOSS_WEIGHT: 4.0 + ROI_BOX_HEAD: + SMOOTH_L1_BETA: 1.0 # Keypoint AP degrades when using plain L1 loss + RPN: + SMOOTH_L1_BETA: 0.2 # Keypoint AP degrades when using plain L1 loss +DATASETS: + TRAIN: ("keypoints_coco_2017_val",) + TEST: ("keypoints_coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +SOLVER: + WARMUP_FACTOR: 0.33333333 + WARMUP_ITERS: 100 + STEPS: (5500, 5800) + MAX_ITER: 6000 +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 55.35, 1.0], ["keypoints", "AP", 76.91, 1.0]] diff --git a/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9bd962878fea64035887c48981beeb8d41bfdbd0 --- /dev/null +++ b/configs/quick_schedules/keypoint_rcnn_R_50_FPN_training_acc_test.yaml @@ -0,0 +1,28 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + KEYPOINT_ON: True + RESNETS: + DEPTH: 50 + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 256 + NUM_CLASSES: 1 + ROI_KEYPOINT_HEAD: + POOLER_RESOLUTION: 14 + POOLER_SAMPLING_RATIO: 2 + ROI_BOX_HEAD: + SMOOTH_L1_BETA: 1.0 # Keypoint AP degrades when using plain L1 loss + RPN: + SMOOTH_L1_BETA: 0.2 # Keypoint AP degrades when using plain L1 loss +DATASETS: + TRAIN: ("keypoints_coco_2017_val",) + TEST: ("keypoints_coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +SOLVER: + WARMUP_FACTOR: 0.33333333 + WARMUP_ITERS: 100 + STEPS: (5500, 5800) + MAX_ITER: 
6000 +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 53.5, 1.0], ["keypoints", "AP", 72.4, 1.0]] diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_GCV_instant_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_GCV_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ab6e69812b94ea7e071f29d9a6937d5c70805b5b --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_C4_GCV_instant_test.yaml @@ -0,0 +1,18 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_val_100",) + TEST: ("coco_2017_val_100",) +SOLVER: + BASE_LR: 0.001 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "value" + CLIP_VALUE: 1.0 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b2d5b7ff87e069f8c774a230bdfd47b8c12d18a3 --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_C4_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x/137849525/model_final_4ce675.pkl" +DATASETS: + TEST: ("coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 47.37, 0.02], ["segm", "AP", 40.99, 0.02]] diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c4f1214efa520944fd941daec082ad45c164a23 --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_C4_instant_test.yaml @@ -0,0 +1,14 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_val_100",) + TEST: ("coco_2017_val_100",) +SOLVER: + BASE_LR: 0.001 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f68dd8f96c7896b5fc95d694a399f2ce417c1deb --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_C4_training_acc_test.yaml @@ -0,0 +1,22 @@ +_BASE_: "../Base-RCNN-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 256 + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_val",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (600,) + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +SOLVER: + IMS_PER_BATCH: 8 # base uses 16 + WARMUP_FACTOR: 0.33333 + WARMUP_ITERS: 100 + STEPS: (11000, 11600) + MAX_ITER: 12000 +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 41.88, 0.7], ["segm", "AP", 33.79, 0.5]] diff --git a/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e3ce6cf922ae07fba5b5e01edbac19bf58a8e9dd --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_DC5_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x/137849551/model_final_84107b.pkl" 
+DATASETS: + TEST: ("coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 47.44, 0.02], ["segm", "AP", 42.94, 0.02]] diff --git a/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5454bfd95cc37749c50aec7866f32d9a80ca2b7 --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,10 @@ +_BASE_: "../COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl" +DATASETS: + TEST: ("coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 47.34, 0.02], ["segm", "AP", 42.67, 0.02], ["bbox_TTA", "AP", 49.11, 0.02], ["segm_TTA", "AP", 45.04, 0.02]] + AUG: + ENABLED: True + MIN_SIZES: (700, 800) # to save some time diff --git a/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6dbfcde0bf837990634d419a6dda1e2909c3cd7f --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_FPN_instant_test.yaml @@ -0,0 +1,14 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_val_100",) + TEST: ("coco_2017_val_100",) +SOLVER: + BASE_LR: 0.005 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml b/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ffca550461035967a565dca39bca039658a68eed --- /dev/null +++ b/configs/quick_schedules/mask_rcnn_R_50_FPN_training_acc_test.yaml @@ -0,0 +1,21 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + ROI_HEADS: + BATCH_SIZE_PER_IMAGE: 256 + MASK_ON: True +DATASETS: + TRAIN: ("coco_2017_val",) + TEST: ("coco_2017_val",) +INPUT: + MIN_SIZE_TRAIN: (600,) + MAX_SIZE_TRAIN: 1000 + MIN_SIZE_TEST: 800 + MAX_SIZE_TEST: 1000 +SOLVER: + WARMUP_FACTOR: 0.3333333 + WARMUP_ITERS: 100 + STEPS: (5500, 5800) + MAX_ITER: 6000 +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 42.0, 1.6], ["segm", "AP", 35.4, 1.25]] diff --git a/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml b/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..70874e3a92c9034d75cbbebb145b61084ba15e42 --- /dev/null +++ b/configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-PanopticSegmentation/panoptic_fpn_R_50_3x/139514569/model_final_c10459.pkl" +DATASETS: + TEST: ("coco_2017_val_100_panoptic_separated",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 46.47, 0.02], ["segm", "AP", 43.39, 0.02], ["sem_seg", "mIoU", 42.55, 0.02], ["panoptic_seg", "PQ", 38.99, 0.02]] diff --git a/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml b/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7cdee7bfcf6dc75dda52602a0d9177ad0a9cc6ed --- /dev/null +++ b/configs/quick_schedules/panoptic_fpn_R_50_instant_test.yaml @@ -0,0 +1,19 @@ +_BASE_: 
"../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "PanopticFPN" + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + SEM_SEG_HEAD: + LOSS_WEIGHT: 0.5 +DATASETS: + TRAIN: ("coco_2017_val_100_panoptic_separated",) + TEST: ("coco_2017_val_100_panoptic_separated",) +SOLVER: + BASE_LR: 0.005 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 1 diff --git a/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml b/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..05816316f851690e60ee54b852b6f49ede73c886 --- /dev/null +++ b/configs/quick_schedules/panoptic_fpn_R_50_training_acc_test.yaml @@ -0,0 +1,20 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "PanopticFPN" + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: True + RESNETS: + DEPTH: 50 + SEM_SEG_HEAD: + LOSS_WEIGHT: 0.5 +DATASETS: + TRAIN: ("coco_2017_val_panoptic_separated",) + TEST: ("coco_2017_val_panoptic_separated",) +SOLVER: + BASE_LR: 0.01 + WARMUP_FACTOR: 0.001 + WARMUP_ITERS: 500 + STEPS: (5500,) + MAX_ITER: 7000 +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 46.70, 1.1], ["segm", "AP", 38.73, 0.7], ["sem_seg", "mIoU", 64.73, 1.2], ["panoptic_seg", "PQ", 48.13, 0.8]] diff --git a/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..36b998833bac04c830d5ab9f44d5773b0437ac0b --- /dev/null +++ b/configs/quick_schedules/retinanet_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../COCO-Detection/retinanet_R_50_FPN_3x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-Detection/retinanet_R_50_FPN_3x/137849486/model_final_4cafe0.pkl" +DATASETS: + TEST: ("coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 44.36, 0.02]] diff --git a/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml b/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8d95c1f614296716374686b22055a587ccd052b9 --- /dev/null +++ b/configs/quick_schedules/retinanet_R_50_FPN_instant_test.yaml @@ -0,0 +1,13 @@ +_BASE_: "../COCO-Detection/retinanet_R_50_FPN_1x.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" +DATASETS: + TRAIN: ("coco_2017_val_100",) + TEST: ("coco_2017_val_100",) +SOLVER: + BASE_LR: 0.005 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c7c3f908a9e80e98b2d25b6d384a60acaba9d4f8 --- /dev/null +++ b/configs/quick_schedules/rpn_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,7 @@ +_BASE_: "../COCO-Detection/rpn_R_50_FPN_1x.yaml" +MODEL: + WEIGHTS: "detectron2://COCO-Detection/rpn_R_50_FPN_1x/137258492/model_final_02ce48.pkl" +DATASETS: + TEST: ("coco_2017_val_100",) +TEST: + EXPECTED_RESULTS: [["box_proposals", "AR@1000", 58.16, 0.02]] diff --git a/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml b/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..402d432477507dc36f04c4a9777cb80fe06b2809 --- /dev/null +++ b/configs/quick_schedules/rpn_R_50_FPN_instant_test.yaml @@ -0,0 +1,13 @@ +_BASE_: 
"../COCO-Detection/rpn_R_50_FPN_1x.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" +DATASETS: + TRAIN: ("coco_2017_val_100",) + TEST: ("coco_2017_val_100",) +SOLVER: + STEPS: (30,) + MAX_ITER: 40 + BASE_LR: 0.005 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml b/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bca74987d5218736983617883e0fe37f79d219b7 --- /dev/null +++ b/configs/quick_schedules/semantic_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,10 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "SemanticSegmentor" + WEIGHTS: "detectron2://semantic_R_50_FPN_1x/111802073/model_final_c18079783c55a94968edc28b7101c5f0.pkl" + RESNETS: + DEPTH: 50 +DATASETS: + TEST: ("coco_2017_val_100_panoptic_stuffonly",) +TEST: + EXPECTED_RESULTS: [["sem_seg", "mIoU", 39.53, 0.02], ["sem_seg", "mACC", 51.50, 0.02]] diff --git a/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml b/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..14ab606f219b462fe37fcc7d5fbdbe65cb5c2642 --- /dev/null +++ b/configs/quick_schedules/semantic_R_50_FPN_instant_test.yaml @@ -0,0 +1,18 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "SemanticSegmentor" + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +DATASETS: + TRAIN: ("coco_2017_val_100_panoptic_stuffonly",) + TEST: ("coco_2017_val_100_panoptic_stuffonly",) +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +SOLVER: + BASE_LR: 0.005 + STEPS: (30,) + MAX_ITER: 40 + IMS_PER_BATCH: 4 +DATALOADER: + NUM_WORKERS: 2 diff --git a/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml b/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f78d775889b11e9e76743de5ddb8139198edf61 --- /dev/null +++ b/configs/quick_schedules/semantic_R_50_FPN_training_acc_test.yaml @@ -0,0 +1,20 @@ +_BASE_: "../Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "SemanticSegmentor" + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +DATASETS: + TRAIN: ("coco_2017_val_panoptic_stuffonly",) + TEST: ("coco_2017_val_panoptic_stuffonly",) +SOLVER: + BASE_LR: 0.01 + WARMUP_FACTOR: 0.001 + WARMUP_ITERS: 300 + STEPS: (5500,) + MAX_ITER: 7000 +TEST: + EXPECTED_RESULTS: [["sem_seg", "mIoU", 76.51, 1.0], ["sem_seg", "mACC", 83.25, 1.0]] +INPUT: + # no scale augmentation + MIN_SIZE_TRAIN: (800, ) diff --git a/datasets/README.md b/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1a2633f95e6f6a5e54c8beca102a490036478587 --- /dev/null +++ b/datasets/README.md @@ -0,0 +1,99 @@ +# Setup Builtin Datasets + +Detectron2 has builtin support for a few datasets. +The datasets are assumed to exist in a directory specified by the environment variable +`DETECTRON2_DATASETS`. +Under this directory, detectron2 expects to find datasets in the structure described below. + +You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. +If left unset, the default is `./datasets` relative to your current working directory. + +The [model zoo](https://github.com/facebookresearch/detectron2/blob/master/MODEL_ZOO.md) +contains configs and models that use these builtin datasets. 
+ +## Expected dataset structure for COCO instance/keypoint detection: + +``` +coco/ + annotations/ + instances_{train,val}2017.json + person_keypoints_{train,val}2017.json + {train,val}2017/ + # image files that are mentioned in the corresponding json +``` + +You can use the 2014 version of the dataset as well. + +Some of the builtin tests (`dev/run_*_tests.sh`) uses a tiny version of the COCO dataset, +which you can download with `./prepare_for_tests.sh`. + +## Expected dataset structure for PanopticFPN: + +``` +coco/ + annotations/ + panoptic_{train,val}2017.json + panoptic_{train,val}2017/ # png annotations + panoptic_stuff_{train,val}2017/ # generated by the script mentioned below +``` + +Install panopticapi by: +``` +pip install git+https://github.com/cocodataset/panopticapi.git +``` +Then, run `python prepare_panoptic_fpn.py`, to extract semantic annotations from panoptic annotations. + +## Expected dataset structure for LVIS instance segmentation: +``` +coco/ + {train,val,test}2017/ +lvis/ + lvis_v0.5_{train,val}.json + lvis_v0.5_image_info_test.json +``` + +Install lvis-api by: +``` +pip install git+https://github.com/lvis-dataset/lvis-api.git +``` + +Run `python prepare_cocofied_lvis.py` to prepare "cocofied" LVIS annotations for evaluation of models trained on the COCO dataset. + +## Expected dataset structure for cityscapes: +``` +cityscapes/ + gtFine/ + train/ + aachen/ + color.png, instanceIds.png, labelIds.png, polygons.json, + labelTrainIds.png + ... + val/ + test/ + leftImg8bit/ + train/ + val/ + test/ +``` +Install cityscapes scripts by: +``` +pip install git+https://github.com/mcordts/cityscapesScripts.git +``` + +Note: labelTrainIds.png are created using cityscapesescript with: +``` +CITYSCAPES_DATASET=$DETECTRON2_DATASETS/cityscapes python cityscapesscripts/preparation/createTrainIdLabelImgs.py +``` +They are not needed for instance segmentation. + +## Expected dataset structure for Pascal VOC: +``` +VOC20{07,12}/ + Annotations/ + ImageSets/ + Main/ + trainval.txt + test.txt + # train.txt or val.txt, if you use these splits + JPEGImages/ +``` diff --git a/datasets/prepare_cocofied_lvis.py b/datasets/prepare_cocofied_lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..a6aff7aaa952e06bc478e8de5563b24051fcadf7 --- /dev/null +++ b/datasets/prepare_cocofied_lvis.py @@ -0,0 +1,175 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import copy +import json +import os +from collections import defaultdict + +# This mapping is extracted from the official LVIS mapping: +# https://github.com/lvis-dataset/lvis-api/blob/master/data/coco_to_synset.json +COCO_SYNSET_CATEGORIES = [ + {"synset": "person.n.01", "coco_cat_id": 1}, + {"synset": "bicycle.n.01", "coco_cat_id": 2}, + {"synset": "car.n.01", "coco_cat_id": 3}, + {"synset": "motorcycle.n.01", "coco_cat_id": 4}, + {"synset": "airplane.n.01", "coco_cat_id": 5}, + {"synset": "bus.n.01", "coco_cat_id": 6}, + {"synset": "train.n.01", "coco_cat_id": 7}, + {"synset": "truck.n.01", "coco_cat_id": 8}, + {"synset": "boat.n.01", "coco_cat_id": 9}, + {"synset": "traffic_light.n.01", "coco_cat_id": 10}, + {"synset": "fireplug.n.01", "coco_cat_id": 11}, + {"synset": "stop_sign.n.01", "coco_cat_id": 13}, + {"synset": "parking_meter.n.01", "coco_cat_id": 14}, + {"synset": "bench.n.01", "coco_cat_id": 15}, + {"synset": "bird.n.01", "coco_cat_id": 16}, + {"synset": "cat.n.01", "coco_cat_id": 17}, + {"synset": "dog.n.01", "coco_cat_id": 18}, + {"synset": "horse.n.01", "coco_cat_id": 19}, + {"synset": "sheep.n.01", "coco_cat_id": 20}, + {"synset": "beef.n.01", "coco_cat_id": 21}, + {"synset": "elephant.n.01", "coco_cat_id": 22}, + {"synset": "bear.n.01", "coco_cat_id": 23}, + {"synset": "zebra.n.01", "coco_cat_id": 24}, + {"synset": "giraffe.n.01", "coco_cat_id": 25}, + {"synset": "backpack.n.01", "coco_cat_id": 27}, + {"synset": "umbrella.n.01", "coco_cat_id": 28}, + {"synset": "bag.n.04", "coco_cat_id": 31}, + {"synset": "necktie.n.01", "coco_cat_id": 32}, + {"synset": "bag.n.06", "coco_cat_id": 33}, + {"synset": "frisbee.n.01", "coco_cat_id": 34}, + {"synset": "ski.n.01", "coco_cat_id": 35}, + {"synset": "snowboard.n.01", "coco_cat_id": 36}, + {"synset": "ball.n.06", "coco_cat_id": 37}, + {"synset": "kite.n.03", "coco_cat_id": 38}, + {"synset": "baseball_bat.n.01", "coco_cat_id": 39}, + {"synset": "baseball_glove.n.01", "coco_cat_id": 40}, + {"synset": "skateboard.n.01", "coco_cat_id": 41}, + {"synset": "surfboard.n.01", "coco_cat_id": 42}, + {"synset": "tennis_racket.n.01", "coco_cat_id": 43}, + {"synset": "bottle.n.01", "coco_cat_id": 44}, + {"synset": "wineglass.n.01", "coco_cat_id": 46}, + {"synset": "cup.n.01", "coco_cat_id": 47}, + {"synset": "fork.n.01", "coco_cat_id": 48}, + {"synset": "knife.n.01", "coco_cat_id": 49}, + {"synset": "spoon.n.01", "coco_cat_id": 50}, + {"synset": "bowl.n.03", "coco_cat_id": 51}, + {"synset": "banana.n.02", "coco_cat_id": 52}, + {"synset": "apple.n.01", "coco_cat_id": 53}, + {"synset": "sandwich.n.01", "coco_cat_id": 54}, + {"synset": "orange.n.01", "coco_cat_id": 55}, + {"synset": "broccoli.n.01", "coco_cat_id": 56}, + {"synset": "carrot.n.01", "coco_cat_id": 57}, + {"synset": "frank.n.02", "coco_cat_id": 58}, + {"synset": "pizza.n.01", "coco_cat_id": 59}, + {"synset": "doughnut.n.02", "coco_cat_id": 60}, + {"synset": "cake.n.03", "coco_cat_id": 61}, + {"synset": "chair.n.01", "coco_cat_id": 62}, + {"synset": "sofa.n.01", "coco_cat_id": 63}, + {"synset": "pot.n.04", "coco_cat_id": 64}, + {"synset": "bed.n.01", "coco_cat_id": 65}, + {"synset": "dining_table.n.01", "coco_cat_id": 67}, + {"synset": "toilet.n.02", "coco_cat_id": 70}, + {"synset": "television_receiver.n.01", "coco_cat_id": 72}, + {"synset": "laptop.n.01", "coco_cat_id": 73}, + {"synset": "mouse.n.04", "coco_cat_id": 74}, + {"synset": "remote_control.n.01", "coco_cat_id": 75}, + {"synset": "computer_keyboard.n.01", "coco_cat_id": 76}, + {"synset": 
"cellular_telephone.n.01", "coco_cat_id": 77}, + {"synset": "microwave.n.02", "coco_cat_id": 78}, + {"synset": "oven.n.01", "coco_cat_id": 79}, + {"synset": "toaster.n.02", "coco_cat_id": 80}, + {"synset": "sink.n.01", "coco_cat_id": 81}, + {"synset": "electric_refrigerator.n.01", "coco_cat_id": 82}, + {"synset": "book.n.01", "coco_cat_id": 84}, + {"synset": "clock.n.01", "coco_cat_id": 85}, + {"synset": "vase.n.01", "coco_cat_id": 86}, + {"synset": "scissors.n.01", "coco_cat_id": 87}, + {"synset": "teddy.n.01", "coco_cat_id": 88}, + {"synset": "hand_blower.n.01", "coco_cat_id": 89}, + {"synset": "toothbrush.n.01", "coco_cat_id": 90}, +] + + +def cocofy_lvis(input_filename, output_filename): + """ + Filter LVIS instance segmentation annotations to remove all categories that are not included in + COCO. The new json files can be used to evaluate COCO AP using `lvis-api`. The category ids in + the output json are the incontiguous COCO dataset ids. + + Args: + input_filename (str): path to the LVIS json file. + output_filename (str): path to the COCOfied json file. + """ + + with open(input_filename, "r") as f: + lvis_json = json.load(f) + + lvis_annos = lvis_json.pop("annotations") + cocofied_lvis = copy.deepcopy(lvis_json) + lvis_json["annotations"] = lvis_annos + + # Mapping from lvis cat id to coco cat id via synset + lvis_cat_id_to_synset = {cat["id"]: cat["synset"] for cat in lvis_json["categories"]} + synset_to_coco_cat_id = {x["synset"]: x["coco_cat_id"] for x in COCO_SYNSET_CATEGORIES} + # Synsets that we will keep in the dataset + synsets_to_keep = set(synset_to_coco_cat_id.keys()) + coco_cat_id_with_instances = defaultdict(int) + + new_annos = [] + ann_id = 1 + for ann in lvis_annos: + lvis_cat_id = ann["category_id"] + synset = lvis_cat_id_to_synset[lvis_cat_id] + if synset not in synsets_to_keep: + continue + coco_cat_id = synset_to_coco_cat_id[synset] + new_ann = copy.deepcopy(ann) + new_ann["category_id"] = coco_cat_id + new_ann["id"] = ann_id + ann_id += 1 + new_annos.append(new_ann) + coco_cat_id_with_instances[coco_cat_id] += 1 + cocofied_lvis["annotations"] = new_annos + + for image in cocofied_lvis["images"]: + for key in ["not_exhaustive_category_ids", "neg_category_ids"]: + new_category_list = [] + for lvis_cat_id in image[key]: + synset = lvis_cat_id_to_synset[lvis_cat_id] + if synset not in synsets_to_keep: + continue + coco_cat_id = synset_to_coco_cat_id[synset] + new_category_list.append(coco_cat_id) + coco_cat_id_with_instances[coco_cat_id] += 1 + image[key] = new_category_list + + coco_cat_id_with_instances = set(coco_cat_id_with_instances.keys()) + + new_categories = [] + for cat in lvis_json["categories"]: + synset = cat["synset"] + if synset not in synsets_to_keep: + continue + coco_cat_id = synset_to_coco_cat_id[synset] + if coco_cat_id not in coco_cat_id_with_instances: + continue + new_cat = copy.deepcopy(cat) + new_cat["id"] = coco_cat_id + new_categories.append(new_cat) + cocofied_lvis["categories"] = new_categories + + with open(output_filename, "w") as f: + json.dump(cocofied_lvis, f) + print("{} is COCOfied and stored in {}.".format(input_filename, output_filename)) + + +if __name__ == "__main__": + dataset_dir = os.path.join(os.path.dirname(__file__), "lvis") + for s in ["lvis_v0.5_train", "lvis_v0.5_val"]: + print("Start COCOfing {}.".format(s)) + cocofy_lvis( + os.path.join(dataset_dir, "{}.json".format(s)), + os.path.join(dataset_dir, "{}_cocofied.json".format(s)), + ) diff --git a/datasets/prepare_for_tests.sh b/datasets/prepare_for_tests.sh new file 
mode 100755 index 0000000000000000000000000000000000000000..d59b5643c95095921863dddd2f1e4d9be28e06ee --- /dev/null +++ b/datasets/prepare_for_tests.sh @@ -0,0 +1,22 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# Download some files needed for running tests. + +cd "${0%/*}" + +BASE=https://dl.fbaipublicfiles.com/detectron2 +mkdir -p coco/annotations + +for anno in instances_val2017_100 \ + person_keypoints_val2017_100 \ + instances_minival2014_100 \ + person_keypoints_minival2014_100; do + + dest=coco/annotations/$anno.json + [[ -s $dest ]] && { + echo "$dest exists. Skipping ..." + } || { + wget $BASE/annotations/coco/$anno.json -O $dest + } +done diff --git a/datasets/prepare_panoptic_fpn.py b/datasets/prepare_panoptic_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..ab641184da9fc1bc0f0b48fc232c13251d72b5c8 --- /dev/null +++ b/datasets/prepare_panoptic_fpn.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import functools +import json +import multiprocessing as mp +import numpy as np +import os +import time +from fvcore.common.download import download +from PIL import Image + +from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES + +from panopticapi.utils import rgb2id + + +def _process_panoptic_to_semantic(input_panoptic, output_semantic, segments, id_map): + panoptic = np.asarray(Image.open(input_panoptic), dtype=np.uint32) + panoptic = rgb2id(panoptic) + output = np.zeros_like(panoptic, dtype=np.uint8) + 255 + for seg in segments: + cat_id = seg["category_id"] + new_cat_id = id_map[cat_id] + output[panoptic == seg["id"]] = new_cat_id + Image.fromarray(output).save(output_semantic) + + +def separate_coco_semantic_from_panoptic(panoptic_json, panoptic_root, sem_seg_root, categories): + """ + Create semantic segmentation annotations from panoptic segmentation + annotations, to be used by PanopticFPN. + + It maps all thing categories to class 0, and maps all unlabeled pixels to class 255. + It maps all stuff categories to contiguous ids starting from 1. + + Args: + panoptic_json (str): path to the panoptic json file, in COCO's format. + panoptic_root (str): a directory with panoptic annotation files, in COCO's format. + sem_seg_root (str): a directory to output semantic annotation files + categories (list[dict]): category metadata. 
Each dict needs to have: + "id": corresponds to the "category_id" in the json annotations + "isthing": 0 or 1 + """ + os.makedirs(sem_seg_root, exist_ok=True) + + stuff_ids = [k["id"] for k in categories if k["isthing"] == 0] + thing_ids = [k["id"] for k in categories if k["isthing"] == 1] + id_map = {} # map from category id to id in the output semantic annotation + assert len(stuff_ids) <= 254 + for i, stuff_id in enumerate(stuff_ids): + id_map[stuff_id] = i + 1 + for thing_id in thing_ids: + id_map[thing_id] = 0 + id_map[0] = 255 + + with open(panoptic_json) as f: + obj = json.load(f) + + pool = mp.Pool(processes=max(mp.cpu_count() // 2, 4)) + + def iter_annotations(): + for anno in obj["annotations"]: + file_name = anno["file_name"] + segments = anno["segments_info"] + input = os.path.join(panoptic_root, file_name) + output = os.path.join(sem_seg_root, file_name) + yield input, output, segments + + print("Start writing to {} ...".format(sem_seg_root)) + start = time.time() + pool.starmap( + functools.partial(_process_panoptic_to_semantic, id_map=id_map), + iter_annotations(), + chunksize=100, + ) + print("Finished. time: {:.2f}s".format(time.time() - start)) + + +if __name__ == "__main__": + dataset_dir = os.path.join(os.path.dirname(__file__), "coco") + for s in ["val2017", "train2017"]: + separate_coco_semantic_from_panoptic( + os.path.join(dataset_dir, "annotations/panoptic_{}.json".format(s)), + os.path.join(dataset_dir, "panoptic_{}".format(s)), + os.path.join(dataset_dir, "panoptic_stuff_{}".format(s)), + COCO_CATEGORIES, + ) + + # Prepare val2017_100 for quick testing: + + dest_dir = os.path.join(dataset_dir, "annotations/") + URL_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" + download(URL_PREFIX + "annotations/coco/panoptic_val2017_100.json", dest_dir) + with open(os.path.join(dest_dir, "panoptic_val2017_100.json")) as f: + obj = json.load(f) + + def link_val100(dir_full, dir_100): + print("Creating " + dir_100 + " ...") + os.makedirs(dir_100, exist_ok=True) + for img in obj["images"]: + basename = os.path.splitext(img["file_name"])[0] + src = os.path.join(dir_full, basename + ".png") + dst = os.path.join(dir_100, basename + ".png") + src = os.path.relpath(src, start=dir_100) + os.symlink(src, dst) + + link_val100( + os.path.join(dataset_dir, "panoptic_val2017"), + os.path.join(dataset_dir, "panoptic_val2017_100"), + ) + + link_val100( + os.path.join(dataset_dir, "panoptic_stuff_val2017"), + os.path.join(dataset_dir, "panoptic_stuff_val2017_100"), + ) diff --git a/demo/README.md b/demo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..caa755f6f0f472a04a419deec4a6acfdb949023b --- /dev/null +++ b/demo/README.md @@ -0,0 +1,8 @@ + +## Detectron2 Demo + +We provide a command line tool to run a simple demo of builtin models. +The usage is explained in [GETTING_STARTED.md](../GETTING_STARTED.md). + +See our [blog post](https://ai.facebook.com/blog/-detectron2-a-pytorch-based-modular-object-detection-library-) +for a high-quality demo generated with this tool. diff --git a/demo/demo.py b/demo/demo.py new file mode 100755 index 0000000000000000000000000000000000000000..1fd8df8f539cfe4a4f003fb820f49ffad0f54f80 --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,161 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import argparse +import glob +import multiprocessing as mp +import os +import time +import cv2 +import tqdm + +from detectron2.config import get_cfg +from detectron2.data.detection_utils import read_image +from detectron2.utils.logger import setup_logger + +from predictor import VisualizationDemo + +# constants +WINDOW_NAME = "COCO detections" + + +def setup_cfg(args): + # load config from file and command-line arguments + cfg = get_cfg() + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + # Set score_threshold for builtin models + cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold + cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold + cfg.freeze() + return cfg + + +def get_parser(): + parser = argparse.ArgumentParser(description="Detectron2 demo for builtin models") + parser.add_argument( + "--config-file", + default="configs/quick_schedules/mask_rcnn_R_50_FPN_inference_acc_test.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.") + parser.add_argument("--video-input", help="Path to video file.") + parser.add_argument( + "--input", + nargs="+", + help="A list of space separated input images; " + "or a single glob pattern such as 'directory/*.jpg'", + ) + parser.add_argument( + "--output", + help="A file or directory to save output visualizations. " + "If not given, will show output in an OpenCV window.", + ) + + parser.add_argument( + "--confidence-threshold", + type=float, + default=0.5, + help="Minimum score for instance predictions to be shown", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + return parser + + +if __name__ == "__main__": + mp.set_start_method("spawn", force=True) + args = get_parser().parse_args() + setup_logger(name="fvcore") + logger = setup_logger() + logger.info("Arguments: " + str(args)) + + cfg = setup_cfg(args) + + demo = VisualizationDemo(cfg) + + if args.input: + if len(args.input) == 1: + args.input = glob.glob(os.path.expanduser(args.input[0])) + assert args.input, "The input path(s) was not found" + for path in tqdm.tqdm(args.input, disable=not args.output): + # use PIL, to be consistent with evaluation + img = read_image(path, format="BGR") + start_time = time.time() + predictions, visualized_output = demo.run_on_image(img) + logger.info( + "{}: {} in {:.2f}s".format( + path, + "detected {} instances".format(len(predictions["instances"])) + if "instances" in predictions + else "finished", + time.time() - start_time, + ) + ) + + if args.output: + if os.path.isdir(args.output): + assert os.path.isdir(args.output), args.output + out_filename = os.path.join(args.output, os.path.basename(path)) + else: + assert len(args.input) == 1, "Please specify a directory with args.output" + out_filename = args.output + visualized_output.save(out_filename) + else: + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1]) + if cv2.waitKey(0) == 27: + break # esc to quit + elif args.webcam: + assert args.input is None, "Cannot have both --input and --webcam!" + assert args.output is None, "output not yet supported with --webcam!" 
+ cam = cv2.VideoCapture(0) + for vis in tqdm.tqdm(demo.run_on_video(cam)): + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, vis) + if cv2.waitKey(1) == 27: + break # esc to quit + cam.release() + cv2.destroyAllWindows() + elif args.video_input: + video = cv2.VideoCapture(args.video_input) + width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) + frames_per_second = video.get(cv2.CAP_PROP_FPS) + num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT)) + basename = os.path.basename(args.video_input) + + if args.output: + if os.path.isdir(args.output): + output_fname = os.path.join(args.output, basename) + output_fname = os.path.splitext(output_fname)[0] + ".mkv" + else: + output_fname = args.output + assert not os.path.isfile(output_fname), output_fname + output_file = cv2.VideoWriter( + filename=output_fname, + # some installation of opencv may not support x264 (due to its license), + # you can try other format (e.g. MPEG) + fourcc=cv2.VideoWriter_fourcc(*"x264"), + fps=float(frames_per_second), + frameSize=(width, height), + isColor=True, + ) + assert os.path.isfile(args.video_input) + for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames): + if args.output: + output_file.write(vis_frame) + else: + cv2.namedWindow(basename, cv2.WINDOW_NORMAL) + cv2.imshow(basename, vis_frame) + if cv2.waitKey(1) == 27: + break # esc to quit + video.release() + if args.output: + output_file.release() + else: + cv2.destroyAllWindows() diff --git a/demo/predictor.py b/demo/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..689fa85436d928858e652df665f5e7460a1f3154 --- /dev/null +++ b/demo/predictor.py @@ -0,0 +1,220 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import atexit +import bisect +import multiprocessing as mp +from collections import deque +import cv2 +import torch + +from detectron2.data import MetadataCatalog +from detectron2.engine.defaults import DefaultPredictor +from detectron2.utils.video_visualizer import VideoVisualizer +from detectron2.utils.visualizer import ColorMode, Visualizer + + +class VisualizationDemo(object): + def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): + """ + Args: + cfg (CfgNode): + instance_mode (ColorMode): + parallel (bool): whether to run the model in different processes from visualization. + Useful since the visualization logic can be slow. + """ + self.metadata = MetadataCatalog.get( + cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" + ) + self.cpu_device = torch.device("cpu") + self.instance_mode = instance_mode + + self.parallel = parallel + if parallel: + num_gpu = torch.cuda.device_count() + self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) + else: + self.predictor = DefaultPredictor(cfg) + + def run_on_image(self, image): + """ + Args: + image (np.ndarray): an image of shape (H, W, C) (in BGR order). + This is the format used by OpenCV. + + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. + """ + vis_output = None + predictions = self.predictor(image) + # Convert image from OpenCV BGR format to Matplotlib RGB format. 
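+ # (reversing the channel axis with [:, :, ::-1] converts BGR to the RGB order
+ # expected by the Visualizer)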
+ image = image[:, :, ::-1] + visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode) + if "panoptic_seg" in predictions: + panoptic_seg, segments_info = predictions["panoptic_seg"] + vis_output = visualizer.draw_panoptic_seg_predictions( + panoptic_seg.to(self.cpu_device), segments_info + ) + else: + if "sem_seg" in predictions: + vis_output = visualizer.draw_sem_seg( + predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) + ) + if "instances" in predictions: + instances = predictions["instances"].to(self.cpu_device) + vis_output = visualizer.draw_instance_predictions(predictions=instances) + + return predictions, vis_output + + def _frame_from_video(self, video): + while video.isOpened(): + success, frame = video.read() + if success: + yield frame + else: + break + + def run_on_video(self, video): + """ + Visualizes predictions on frames of the input video. + + Args: + video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be + either a webcam or a video file. + + Yields: + ndarray: BGR visualizations of each video frame. + """ + video_visualizer = VideoVisualizer(self.metadata, self.instance_mode) + + def process_predictions(frame, predictions): + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + if "panoptic_seg" in predictions: + panoptic_seg, segments_info = predictions["panoptic_seg"] + vis_frame = video_visualizer.draw_panoptic_seg_predictions( + frame, panoptic_seg.to(self.cpu_device), segments_info + ) + elif "instances" in predictions: + predictions = predictions["instances"].to(self.cpu_device) + vis_frame = video_visualizer.draw_instance_predictions(frame, predictions) + elif "sem_seg" in predictions: + vis_frame = video_visualizer.draw_sem_seg( + frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device) + ) + + # Converts Matplotlib RGB format to OpenCV BGR format + vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR) + return vis_frame + + frame_gen = self._frame_from_video(video) + if self.parallel: + buffer_size = self.predictor.default_buffer_size + + frame_data = deque() + + for cnt, frame in enumerate(frame_gen): + frame_data.append(frame) + self.predictor.put(frame) + + if cnt >= buffer_size: + frame = frame_data.popleft() + predictions = self.predictor.get() + yield process_predictions(frame, predictions) + + while len(frame_data): + frame = frame_data.popleft() + predictions = self.predictor.get() + yield process_predictions(frame, predictions) + else: + for frame in frame_gen: + yield process_predictions(frame, self.predictor(frame)) + + +class AsyncPredictor: + """ + A predictor that runs the model asynchronously, possibly on >1 GPUs. + Because rendering the visualization takes considerably amount of time, + this helps improve throughput when rendering videos. 
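+ Frames are submitted with put() and retrieved with get(); results are buffered
+ and re-ordered internally so that they come back in submission order.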
+ """ + + class _StopToken: + pass + + class _PredictWorker(mp.Process): + def __init__(self, cfg, task_queue, result_queue): + self.cfg = cfg + self.task_queue = task_queue + self.result_queue = result_queue + super().__init__() + + def run(self): + predictor = DefaultPredictor(self.cfg) + + while True: + task = self.task_queue.get() + if isinstance(task, AsyncPredictor._StopToken): + break + idx, data = task + result = predictor(data) + self.result_queue.put((idx, result)) + + def __init__(self, cfg, num_gpus: int = 1): + """ + Args: + cfg (CfgNode): + num_gpus (int): if 0, will run on CPU + """ + num_workers = max(num_gpus, 1) + self.task_queue = mp.Queue(maxsize=num_workers * 3) + self.result_queue = mp.Queue(maxsize=num_workers * 3) + self.procs = [] + for gpuid in range(max(num_gpus, 1)): + cfg = cfg.clone() + cfg.defrost() + cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" + self.procs.append( + AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) + ) + + self.put_idx = 0 + self.get_idx = 0 + self.result_rank = [] + self.result_data = [] + + for p in self.procs: + p.start() + atexit.register(self.shutdown) + + def put(self, image): + self.put_idx += 1 + self.task_queue.put((self.put_idx, image)) + + def get(self): + self.get_idx += 1 # the index needed for this request + if len(self.result_rank) and self.result_rank[0] == self.get_idx: + res = self.result_data[0] + del self.result_data[0], self.result_rank[0] + return res + + while True: + # make sure the results are returned in the correct order + idx, res = self.result_queue.get() + if idx == self.get_idx: + return res + insert = bisect.bisect(self.result_rank, idx) + self.result_rank.insert(insert, idx) + self.result_data.insert(insert, res) + + def __len__(self): + return self.put_idx - self.get_idx + + def __call__(self, image): + self.put(image) + return self.get() + + def shutdown(self): + for _ in self.procs: + self.task_queue.put(AsyncPredictor._StopToken()) + + @property + def default_buffer_size(self): + return len(self.procs) * 5 diff --git a/detectron2/__init__.py b/detectron2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..41816af2e8e538fa2ef4dc7b34f5667e0e823b90 --- /dev/null +++ b/detectron2/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +from .utils.env import setup_environment + +setup_environment() + + +# This line will be programatically read/write by setup.py. +# Leave them at the bottom of this file and don't touch them. +__version__ = "0.1.3" diff --git a/detectron2/checkpoint/__init__.py b/detectron2/checkpoint/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e17a9df03d886b379ffbb1c4ec41e03c5025410f --- /dev/null +++ b/detectron2/checkpoint/__init__.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# File: + + +from . 
import catalog as _UNUSED # register the handler +from .detection_checkpoint import DetectionCheckpointer +from fvcore.common.checkpoint import Checkpointer, PeriodicCheckpointer + +__all__ = ["Checkpointer", "PeriodicCheckpointer", "DetectionCheckpointer"] diff --git a/detectron2/checkpoint/c2_model_loading.py b/detectron2/checkpoint/c2_model_loading.py new file mode 100644 index 0000000000000000000000000000000000000000..e27ba8463c744438d44f04f23fd4975525eba667 --- /dev/null +++ b/detectron2/checkpoint/c2_model_loading.py @@ -0,0 +1,313 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import logging +import re +import torch +from fvcore.common.checkpoint import ( + get_missing_parameters_message, + get_unexpected_parameters_message, +) + + +def convert_basic_c2_names(original_keys): + """ + Apply some basic name conversion to names in C2 weights. + It only deals with typical backbone models. + + Args: + original_keys (list[str]): + Returns: + list[str]: The same number of strings matching those in original_keys. + """ + layer_keys = copy.deepcopy(original_keys) + layer_keys = [ + {"pred_b": "linear_b", "pred_w": "linear_w"}.get(k, k) for k in layer_keys + ] # some hard-coded mappings + + layer_keys = [k.replace("_", ".") for k in layer_keys] + layer_keys = [re.sub("\\.b$", ".bias", k) for k in layer_keys] + layer_keys = [re.sub("\\.w$", ".weight", k) for k in layer_keys] + # Uniform both bn and gn names to "norm" + layer_keys = [re.sub("bn\\.s$", "norm.weight", k) for k in layer_keys] + layer_keys = [re.sub("bn\\.bias$", "norm.bias", k) for k in layer_keys] + layer_keys = [re.sub("bn\\.rm", "norm.running_mean", k) for k in layer_keys] + layer_keys = [re.sub("bn\\.running.mean$", "norm.running_mean", k) for k in layer_keys] + layer_keys = [re.sub("bn\\.riv$", "norm.running_var", k) for k in layer_keys] + layer_keys = [re.sub("bn\\.running.var$", "norm.running_var", k) for k in layer_keys] + layer_keys = [re.sub("bn\\.gamma$", "norm.weight", k) for k in layer_keys] + layer_keys = [re.sub("bn\\.beta$", "norm.bias", k) for k in layer_keys] + layer_keys = [re.sub("gn\\.s$", "norm.weight", k) for k in layer_keys] + layer_keys = [re.sub("gn\\.bias$", "norm.bias", k) for k in layer_keys] + + # stem + layer_keys = [re.sub("^res\\.conv1\\.norm\\.", "conv1.norm.", k) for k in layer_keys] + # to avoid mis-matching with "conv1" in other components (e.g. 
detection head) + layer_keys = [re.sub("^conv1\\.", "stem.conv1.", k) for k in layer_keys] + + # layer1-4 is used by torchvision, however we follow the C2 naming strategy (res2-5) + # layer_keys = [re.sub("^res2.", "layer1.", k) for k in layer_keys] + # layer_keys = [re.sub("^res3.", "layer2.", k) for k in layer_keys] + # layer_keys = [re.sub("^res4.", "layer3.", k) for k in layer_keys] + # layer_keys = [re.sub("^res5.", "layer4.", k) for k in layer_keys] + + # blocks + layer_keys = [k.replace(".branch1.", ".shortcut.") for k in layer_keys] + layer_keys = [k.replace(".branch2a.", ".conv1.") for k in layer_keys] + layer_keys = [k.replace(".branch2b.", ".conv2.") for k in layer_keys] + layer_keys = [k.replace(".branch2c.", ".conv3.") for k in layer_keys] + + # DensePose substitutions + layer_keys = [re.sub("^body.conv.fcn", "body_conv_fcn", k) for k in layer_keys] + layer_keys = [k.replace("AnnIndex.lowres", "ann_index_lowres") for k in layer_keys] + layer_keys = [k.replace("Index.UV.lowres", "index_uv_lowres") for k in layer_keys] + layer_keys = [k.replace("U.lowres", "u_lowres") for k in layer_keys] + layer_keys = [k.replace("V.lowres", "v_lowres") for k in layer_keys] + return layer_keys + + +def convert_c2_detectron_names(weights): + """ + Map Caffe2 Detectron weight names to Detectron2 names. + + Args: + weights (dict): name -> tensor + + Returns: + dict: detectron2 names -> tensor + dict: detectron2 names -> C2 names + """ + logger = logging.getLogger(__name__) + logger.info("Remapping C2 weights ......") + original_keys = sorted(weights.keys()) + layer_keys = copy.deepcopy(original_keys) + + layer_keys = convert_basic_c2_names(layer_keys) + + # -------------------------------------------------------------------------- + # RPN hidden representation conv + # -------------------------------------------------------------------------- + # FPN case + # In the C2 model, the RPN hidden layer conv is defined for FPN level 2 and then + # shared for all other levels, hence the appearance of "fpn2" + layer_keys = [ + k.replace("conv.rpn.fpn2", "proposal_generator.rpn_head.conv") for k in layer_keys + ] + # Non-FPN case + layer_keys = [k.replace("conv.rpn", "proposal_generator.rpn_head.conv") for k in layer_keys] + + # -------------------------------------------------------------------------- + # RPN box transformation conv + # -------------------------------------------------------------------------- + # FPN case (see note above about "fpn2") + layer_keys = [ + k.replace("rpn.bbox.pred.fpn2", "proposal_generator.rpn_head.anchor_deltas") + for k in layer_keys + ] + layer_keys = [ + k.replace("rpn.cls.logits.fpn2", "proposal_generator.rpn_head.objectness_logits") + for k in layer_keys + ] + # Non-FPN case + layer_keys = [ + k.replace("rpn.bbox.pred", "proposal_generator.rpn_head.anchor_deltas") for k in layer_keys + ] + layer_keys = [ + k.replace("rpn.cls.logits", "proposal_generator.rpn_head.objectness_logits") + for k in layer_keys + ] + + # -------------------------------------------------------------------------- + # Fast R-CNN box head + # -------------------------------------------------------------------------- + layer_keys = [re.sub("^bbox\\.pred", "bbox_pred", k) for k in layer_keys] + layer_keys = [re.sub("^cls\\.score", "cls_score", k) for k in layer_keys] + layer_keys = [re.sub("^fc6\\.", "box_head.fc1.", k) for k in layer_keys] + layer_keys = [re.sub("^fc7\\.", "box_head.fc2.", k) for k in layer_keys] + # 4conv1fc head tensor names: head_conv1_w, head_conv1_gn_s + layer_keys = 
[re.sub("^head\\.conv", "box_head.conv", k) for k in layer_keys] + + # -------------------------------------------------------------------------- + # FPN lateral and output convolutions + # -------------------------------------------------------------------------- + def fpn_map(name): + """ + Look for keys with the following patterns: + 1) Starts with "fpn.inner." + Example: "fpn.inner.res2.2.sum.lateral.weight" + Meaning: These are lateral pathway convolutions + 2) Starts with "fpn.res" + Example: "fpn.res2.2.sum.weight" + Meaning: These are FPN output convolutions + """ + splits = name.split(".") + norm = ".norm" if "norm" in splits else "" + if name.startswith("fpn.inner."): + # splits example: ['fpn', 'inner', 'res2', '2', 'sum', 'lateral', 'weight'] + stage = int(splits[2][len("res") :]) + return "fpn_lateral{}{}.{}".format(stage, norm, splits[-1]) + elif name.startswith("fpn.res"): + # splits example: ['fpn', 'res2', '2', 'sum', 'weight'] + stage = int(splits[1][len("res") :]) + return "fpn_output{}{}.{}".format(stage, norm, splits[-1]) + return name + + layer_keys = [fpn_map(k) for k in layer_keys] + + # -------------------------------------------------------------------------- + # Mask R-CNN mask head + # -------------------------------------------------------------------------- + # roi_heads.StandardROIHeads case + layer_keys = [k.replace(".[mask].fcn", "mask_head.mask_fcn") for k in layer_keys] + layer_keys = [re.sub("^\\.mask\\.fcn", "mask_head.mask_fcn", k) for k in layer_keys] + layer_keys = [k.replace("mask.fcn.logits", "mask_head.predictor") for k in layer_keys] + # roi_heads.Res5ROIHeads case + layer_keys = [k.replace("conv5.mask", "mask_head.deconv") for k in layer_keys] + + # -------------------------------------------------------------------------- + # Keypoint R-CNN head + # -------------------------------------------------------------------------- + # interestingly, the keypoint head convs have blob names that are simply "conv_fcnX" + layer_keys = [k.replace("conv.fcn", "roi_heads.keypoint_head.conv_fcn") for k in layer_keys] + layer_keys = [ + k.replace("kps.score.lowres", "roi_heads.keypoint_head.score_lowres") for k in layer_keys + ] + layer_keys = [k.replace("kps.score.", "roi_heads.keypoint_head.score.") for k in layer_keys] + + # -------------------------------------------------------------------------- + # Done with replacements + # -------------------------------------------------------------------------- + assert len(set(layer_keys)) == len(layer_keys) + assert len(original_keys) == len(layer_keys) + + new_weights = {} + new_keys_to_original_keys = {} + for orig, renamed in zip(original_keys, layer_keys): + new_keys_to_original_keys[renamed] = orig + if renamed.startswith("bbox_pred.") or renamed.startswith("mask_head.predictor."): + # remove the meaningless prediction weight for background class + new_start_idx = 4 if renamed.startswith("bbox_pred.") else 1 + new_weights[renamed] = weights[orig][new_start_idx:] + logger.info( + "Remove prediction weight for background class in {}. 
The shape changes from " + "{} to {}.".format( + renamed, tuple(weights[orig].shape), tuple(new_weights[renamed].shape) + ) + ) + elif renamed.startswith("cls_score."): + # move weights of bg class from original index 0 to last index + logger.info( + "Move classification weights for background class in {} from index 0 to " + "index {}.".format(renamed, weights[orig].shape[0] - 1) + ) + new_weights[renamed] = torch.cat([weights[orig][1:], weights[orig][:1]]) + else: + new_weights[renamed] = weights[orig] + + return new_weights, new_keys_to_original_keys + + +# Note the current matching is not symmetric. +# it assumes model_state_dict will have longer names. +def align_and_update_state_dicts(model_state_dict, ckpt_state_dict, c2_conversion=True): + """ + Match names between the two state-dict, and update the values of model_state_dict in-place with + copies of the matched tensor in ckpt_state_dict. + If `c2_conversion==True`, `ckpt_state_dict` is assumed to be a Caffe2 + model and will be renamed at first. + + Strategy: suppose that the models that we will create will have prefixes appended + to each of its keys, for example due to an extra level of nesting that the original + pre-trained weights from ImageNet won't contain. For example, model.state_dict() + might return backbone[0].body.res2.conv1.weight, while the pre-trained model contains + res2.conv1.weight. We thus want to match both parameters together. + For that, we look for each model weight, look among all loaded keys if there is one + that is a suffix of the current weight name, and use it if that's the case. + If multiple matches exist, take the one with longest size + of the corresponding name. For example, for the same model as before, the pretrained + weight file can contain both res2.conv1.weight, as well as conv1.weight. In this case, + we want to match backbone[0].body.conv1.weight to conv1.weight, and + backbone[0].body.res2.conv1.weight to res2.conv1.weight. + """ + model_keys = sorted(model_state_dict.keys()) + if c2_conversion: + ckpt_state_dict, original_keys = convert_c2_detectron_names(ckpt_state_dict) + # original_keys: the name in the original dict (before renaming) + else: + original_keys = {x: x for x in ckpt_state_dict.keys()} + ckpt_keys = sorted(ckpt_state_dict.keys()) + + def match(a, b): + # Matched ckpt_key should be a complete (starts with '.') suffix. + # For example, roi_heads.mesh_head.whatever_conv1 does not match conv1, + # but matches whatever_conv1 or mesh_head.whatever_conv1. + return a == b or a.endswith("." 
+ b) + + # get a matrix of string matches, where each (i, j) entry correspond to the size of the + # ckpt_key string, if it matches + match_matrix = [len(j) if match(i, j) else 0 for i in model_keys for j in ckpt_keys] + match_matrix = torch.as_tensor(match_matrix).view(len(model_keys), len(ckpt_keys)) + # use the matched one with longest size in case of multiple matches + max_match_size, idxs = match_matrix.max(1) + # remove indices that correspond to no-match + idxs[max_match_size == 0] = -1 + + # used for logging + max_len_model = max(len(key) for key in model_keys) if model_keys else 1 + max_len_ckpt = max(len(key) for key in ckpt_keys) if ckpt_keys else 1 + log_str_template = "{: <{}} loaded from {: <{}} of shape {}" + logger = logging.getLogger(__name__) + # matched_pairs (matched checkpoint key --> matched model key) + matched_keys = {} + for idx_model, idx_ckpt in enumerate(idxs.tolist()): + if idx_ckpt == -1: + continue + key_model = model_keys[idx_model] + key_ckpt = ckpt_keys[idx_ckpt] + value_ckpt = ckpt_state_dict[key_ckpt] + shape_in_model = model_state_dict[key_model].shape + + if shape_in_model != value_ckpt.shape: + logger.warning( + "Shape of {} in checkpoint is {}, while shape of {} in model is {}.".format( + key_ckpt, value_ckpt.shape, key_model, shape_in_model + ) + ) + logger.warning( + "{} will not be loaded. Please double check and see if this is desired.".format( + key_ckpt + ) + ) + continue + + model_state_dict[key_model] = value_ckpt.clone() + if key_ckpt in matched_keys: # already added to matched_keys + logger.error( + "Ambiguity found for {} in checkpoint!" + "It matches at least two keys in the model ({} and {}).".format( + key_ckpt, key_model, matched_keys[key_ckpt] + ) + ) + raise ValueError("Cannot match one checkpoint key to multiple keys in the model.") + + matched_keys[key_ckpt] = key_model + logger.info( + log_str_template.format( + key_model, + max_len_model, + original_keys[key_ckpt], + max_len_ckpt, + tuple(shape_in_model), + ) + ) + matched_model_keys = matched_keys.values() + matched_ckpt_keys = matched_keys.keys() + # print warnings about unmatched keys on both side + unmatched_model_keys = [k for k in model_keys if k not in matched_model_keys] + if len(unmatched_model_keys): + logger.info(get_missing_parameters_message(unmatched_model_keys)) + + unmatched_ckpt_keys = [k for k in ckpt_keys if k not in matched_ckpt_keys] + if len(unmatched_ckpt_keys): + logger.info( + get_unexpected_parameters_message(original_keys[x] for x in unmatched_ckpt_keys) + ) diff --git a/detectron2/checkpoint/catalog.py b/detectron2/checkpoint/catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..62f81f3c1531e2726400cba4c97b60d744670da5 --- /dev/null +++ b/detectron2/checkpoint/catalog.py @@ -0,0 +1,134 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +from fvcore.common.file_io import PathHandler, PathManager + + +class ModelCatalog(object): + """ + Store mappings from names to third-party models. + """ + + S3_C2_DETECTRON_PREFIX = "https://dl.fbaipublicfiles.com/detectron" + + # MSRA models have STRIDE_IN_1X1=True. False otherwise. + # NOTE: all BN models here have fused BN into an affine layer. + # As a result, you should only load them to a model with "FrozenBN". + # Loading them to a model with regular BN or SyncBN is wrong. + # Even when loaded to FrozenBN, it is still different from affine by an epsilon, + # which should be negligible for training. 
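+ # ("FrozenBN" refers to frozen batch normalization, i.e. detectron2's
+ # FrozenBatchNorm2d layer, whose statistics and affine parameters are fixed.)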
+ # NOTE: all models here uses PIXEL_STD=[1,1,1] + # NOTE: Most of the BN models here are no longer used. We use the + # re-converted pre-trained models under detectron2 model zoo instead. + C2_IMAGENET_MODELS = { + "MSRA/R-50": "ImageNetPretrained/MSRA/R-50.pkl", + "MSRA/R-101": "ImageNetPretrained/MSRA/R-101.pkl", + "FAIR/R-50-GN": "ImageNetPretrained/47261647/R-50-GN.pkl", + "FAIR/R-101-GN": "ImageNetPretrained/47592356/R-101-GN.pkl", + "FAIR/X-101-32x8d": "ImageNetPretrained/20171220/X-101-32x8d.pkl", + "FAIR/X-101-64x4d": "ImageNetPretrained/FBResNeXt/X-101-64x4d.pkl", + "FAIR/X-152-32x8d-IN5k": "ImageNetPretrained/25093814/X-152-32x8d-IN5k.pkl", + } + + C2_DETECTRON_PATH_FORMAT = ( + "{prefix}/{url}/output/train/{dataset}/{type}/model_final.pkl" # noqa B950 + ) + + C2_DATASET_COCO = "coco_2014_train%3Acoco_2014_valminusminival" + C2_DATASET_COCO_KEYPOINTS = "keypoints_coco_2014_train%3Akeypoints_coco_2014_valminusminival" + + # format: {model_name} -> part of the url + C2_DETECTRON_MODELS = { + "35857197/e2e_faster_rcnn_R-50-C4_1x": "35857197/12_2017_baselines/e2e_faster_rcnn_R-50-C4_1x.yaml.01_33_49.iAX0mXvW", # noqa B950 + "35857345/e2e_faster_rcnn_R-50-FPN_1x": "35857345/12_2017_baselines/e2e_faster_rcnn_R-50-FPN_1x.yaml.01_36_30.cUF7QR7I", # noqa B950 + "35857890/e2e_faster_rcnn_R-101-FPN_1x": "35857890/12_2017_baselines/e2e_faster_rcnn_R-101-FPN_1x.yaml.01_38_50.sNxI7sX7", # noqa B950 + "36761737/e2e_faster_rcnn_X-101-32x8d-FPN_1x": "36761737/12_2017_baselines/e2e_faster_rcnn_X-101-32x8d-FPN_1x.yaml.06_31_39.5MIHi1fZ", # noqa B950 + "35858791/e2e_mask_rcnn_R-50-C4_1x": "35858791/12_2017_baselines/e2e_mask_rcnn_R-50-C4_1x.yaml.01_45_57.ZgkA7hPB", # noqa B950 + "35858933/e2e_mask_rcnn_R-50-FPN_1x": "35858933/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml.01_48_14.DzEQe4wC", # noqa B950 + "35861795/e2e_mask_rcnn_R-101-FPN_1x": "35861795/12_2017_baselines/e2e_mask_rcnn_R-101-FPN_1x.yaml.02_31_37.KqyEK4tT", # noqa B950 + "36761843/e2e_mask_rcnn_X-101-32x8d-FPN_1x": "36761843/12_2017_baselines/e2e_mask_rcnn_X-101-32x8d-FPN_1x.yaml.06_35_59.RZotkLKI", # noqa B950 + "48616381/e2e_mask_rcnn_R-50-FPN_2x_gn": "GN/48616381/04_2018_gn_baselines/e2e_mask_rcnn_R-50-FPN_2x_gn_0416.13_23_38.bTlTI97Q", # noqa B950 + "37697547/e2e_keypoint_rcnn_R-50-FPN_1x": "37697547/12_2017_baselines/e2e_keypoint_rcnn_R-50-FPN_1x.yaml.08_42_54.kdzV35ao", # noqa B950 + "35998355/rpn_R-50-C4_1x": "35998355/12_2017_baselines/rpn_R-50-C4_1x.yaml.08_00_43.njH5oD9L", # noqa B950 + "35998814/rpn_R-50-FPN_1x": "35998814/12_2017_baselines/rpn_R-50-FPN_1x.yaml.08_06_03.Axg0r179", # noqa B950 + "36225147/fast_R-50-FPN_1x": "36225147/12_2017_baselines/fast_rcnn_R-50-FPN_1x.yaml.08_39_09.L3obSdQ2", # noqa B950 + } + + @staticmethod + def get(name): + if name.startswith("Caffe2Detectron/COCO"): + return ModelCatalog._get_c2_detectron_baseline(name) + if name.startswith("ImageNetPretrained/"): + return ModelCatalog._get_c2_imagenet_pretrained(name) + raise RuntimeError("model not present in the catalog: {}".format(name)) + + @staticmethod + def _get_c2_imagenet_pretrained(name): + prefix = ModelCatalog.S3_C2_DETECTRON_PREFIX + name = name[len("ImageNetPretrained/") :] + name = ModelCatalog.C2_IMAGENET_MODELS[name] + url = "/".join([prefix, name]) + return url + + @staticmethod + def _get_c2_detectron_baseline(name): + name = name[len("Caffe2Detectron/COCO/") :] + url = ModelCatalog.C2_DETECTRON_MODELS[name] + if "keypoint_rcnn" in name: + dataset = ModelCatalog.C2_DATASET_COCO_KEYPOINTS + else: + dataset = 
ModelCatalog.C2_DATASET_COCO + + if "35998355/rpn_R-50-C4_1x" in name: + # this one model is somehow different from others .. + type = "rpn" + else: + type = "generalized_rcnn" + + # Detectron C2 models are stored in the structure defined in `C2_DETECTRON_PATH_FORMAT`. + url = ModelCatalog.C2_DETECTRON_PATH_FORMAT.format( + prefix=ModelCatalog.S3_C2_DETECTRON_PREFIX, url=url, type=type, dataset=dataset + ) + return url + + +class ModelCatalogHandler(PathHandler): + """ + Resolve URL like catalog://. + """ + + PREFIX = "catalog://" + + def _get_supported_prefixes(self): + return [self.PREFIX] + + def _get_local_path(self, path): + logger = logging.getLogger(__name__) + catalog_path = ModelCatalog.get(path[len(self.PREFIX) :]) + logger.info("Catalog entry {} points to {}".format(path, catalog_path)) + return PathManager.get_local_path(catalog_path) + + def _open(self, path, mode="r", **kwargs): + return PathManager.open(self._get_local_path(path), mode, **kwargs) + + +class Detectron2Handler(PathHandler): + """ + Resolve anything that's in Detectron2 model zoo. + """ + + PREFIX = "detectron2://" + S3_DETECTRON2_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" + + def _get_supported_prefixes(self): + return [self.PREFIX] + + def _get_local_path(self, path): + name = path[len(self.PREFIX) :] + return PathManager.get_local_path(self.S3_DETECTRON2_PREFIX + name) + + def _open(self, path, mode="r", **kwargs): + return PathManager.open(self._get_local_path(path), mode, **kwargs) + + +PathManager.register_handler(ModelCatalogHandler()) +PathManager.register_handler(Detectron2Handler()) diff --git a/detectron2/checkpoint/detection_checkpoint.py b/detectron2/checkpoint/detection_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..06e6739f7b2070cf3e2d34099188e5ea1f7cf622 --- /dev/null +++ b/detectron2/checkpoint/detection_checkpoint.py @@ -0,0 +1,73 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import pickle +from fvcore.common.checkpoint import Checkpointer +from fvcore.common.file_io import PathManager + +import detectron2.utils.comm as comm + +from .c2_model_loading import align_and_update_state_dicts + + +class DetectionCheckpointer(Checkpointer): + """ + Same as :class:`Checkpointer`, but is able to handle models in detectron & detectron2 + model zoo, and apply conversions for legacy models. 
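+ Files ending in ".pkl" are treated as Detectron/Caffe2 or Detectron2 model zoo
+ pickles and converted if necessary; other files are loaded as standard
+ PyTorch checkpoints.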
+ """ + + def __init__(self, model, save_dir="", *, save_to_disk=None, **checkpointables): + is_main_process = comm.is_main_process() + super().__init__( + model, + save_dir, + save_to_disk=is_main_process if save_to_disk is None else save_to_disk, + **checkpointables, + ) + + def _load_file(self, filename): + if filename.endswith(".pkl"): + with PathManager.open(filename, "rb") as f: + data = pickle.load(f, encoding="latin1") + if "model" in data and "__author__" in data: + # file is in Detectron2 model zoo format + self.logger.info("Reading a file from '{}'".format(data["__author__"])) + return data + else: + # assume file is from Caffe2 / Detectron1 model zoo + if "blobs" in data: + # Detection models have "blobs", but ImageNet models don't + data = data["blobs"] + data = {k: v for k, v in data.items() if not k.endswith("_momentum")} + return {"model": data, "__author__": "Caffe2", "matching_heuristics": True} + + loaded = super()._load_file(filename) # load native pth checkpoint + if "model" not in loaded: + loaded = {"model": loaded} + return loaded + + def _load_model(self, checkpoint): + if checkpoint.get("matching_heuristics", False): + self._convert_ndarray_to_tensor(checkpoint["model"]) + # convert weights by name-matching heuristics + model_state_dict = self.model.state_dict() + align_and_update_state_dicts( + model_state_dict, + checkpoint["model"], + c2_conversion=checkpoint.get("__author__", None) == "Caffe2", + ) + checkpoint["model"] = model_state_dict + # for non-caffe2 models, use standard ways to load it + incompatible = super()._load_model(checkpoint) + if incompatible is None: # support older versions of fvcore + return None + + model_buffers = dict(self.model.named_buffers(recurse=False)) + for k in ["pixel_mean", "pixel_std"]: + # Ignore missing key message about pixel_mean/std. + # Though they may be missing in old checkpoints, they will be correctly + # initialized from config anyway. + if k in model_buffers: + try: + incompatible.missing_keys.remove(k) + except ValueError: + pass + return incompatible diff --git a/detectron2/config/__init__.py b/detectron2/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f996ecd74947c504f86e3e6854a45bd74ad32c1c --- /dev/null +++ b/detectron2/config/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .compat import downgrade_config, upgrade_config +from .config import CfgNode, get_cfg, global_cfg, set_global_cfg, configurable + +__all__ = [ + "CfgNode", + "get_cfg", + "global_cfg", + "set_global_cfg", + "downgrade_config", + "upgrade_config", + "configurable", +] diff --git a/detectron2/config/compat.py b/detectron2/config/compat.py new file mode 100644 index 0000000000000000000000000000000000000000..41fe3a00ca05885abf28106808fe7f8d862b5036 --- /dev/null +++ b/detectron2/config/compat.py @@ -0,0 +1,229 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Backward compatibility of configs. + +Instructions to bump version: ++ It's not needed to bump version if new keys are added. + It's only needed when backward-incompatible changes happen + (i.e., some existing keys disappear, or the meaning of a key changes) ++ To bump version, do the following: + 1. Increment _C.VERSION in defaults.py + 2. Add a converter in this file. 
+ + Each ConverterVX has a function "upgrade" which in-place upgrades config from X-1 to X, + and a function "downgrade" which in-place downgrades config from X to X-1 + + In each function, VERSION is left unchanged. + + Each converter assumes that its input has the relevant keys + (i.e., the input is not a partial config). + 3. Run the tests (test_config.py) to make sure the upgrade & downgrade + functions are consistent. +""" + +import logging +from typing import List, Optional, Tuple + +from .config import CfgNode as CN +from .defaults import _C + +__all__ = ["upgrade_config", "downgrade_config"] + + +def upgrade_config(cfg: CN, to_version: Optional[int] = None) -> CN: + """ + Upgrade a config from its current version to a newer version. + + Args: + cfg (CfgNode): + to_version (int): defaults to the latest version. + """ + cfg = cfg.clone() + if to_version is None: + to_version = _C.VERSION + + assert cfg.VERSION <= to_version, "Cannot upgrade from v{} to v{}!".format( + cfg.VERSION, to_version + ) + for k in range(cfg.VERSION, to_version): + converter = globals()["ConverterV" + str(k + 1)] + converter.upgrade(cfg) + cfg.VERSION = k + 1 + return cfg + + +def downgrade_config(cfg: CN, to_version: int) -> CN: + """ + Downgrade a config from its current version to an older version. + + Args: + cfg (CfgNode): + to_version (int): + + Note: + A general downgrade of arbitrary configs is not always possible due to the + different functionalities in different versions. + The purpose of downgrade is only to recover the defaults in old versions, + allowing it to load an old partial yaml config. + Therefore, the implementation only needs to fill in the default values + in the old version when a general downgrade is not possible. + """ + cfg = cfg.clone() + assert cfg.VERSION >= to_version, "Cannot downgrade from v{} to v{}!".format( + cfg.VERSION, to_version + ) + for k in range(cfg.VERSION, to_version, -1): + converter = globals()["ConverterV" + str(k)] + converter.downgrade(cfg) + cfg.VERSION = k - 1 + return cfg + + +def guess_version(cfg: CN, filename: str) -> int: + """ + Guess the version of a partial config where the VERSION field is not specified. + Returns the version, or the latest if cannot make a guess. + + This makes it easier for users to migrate. + """ + logger = logging.getLogger(__name__) + + def _has(name: str) -> bool: + cur = cfg + for n in name.split("."): + if n not in cur: + return False + cur = cur[n] + return True + + # Most users' partial configs have "MODEL.WEIGHT", so guess on it + ret = None + if _has("MODEL.WEIGHT") or _has("TEST.AUG_ON"): + ret = 1 + + if ret is not None: + logger.warning("Config '{}' has no VERSION. Assuming it to be v{}.".format(filename, ret)) + else: + ret = _C.VERSION + logger.warning( + "Config '{}' has no VERSION. 
Assuming it to be compatible with latest v{}.".format( + filename, ret + ) + ) + return ret + + +def _rename(cfg: CN, old: str, new: str) -> None: + old_keys = old.split(".") + new_keys = new.split(".") + + def _set(key_seq: List[str], val: str) -> None: + cur = cfg + for k in key_seq[:-1]: + if k not in cur: + cur[k] = CN() + cur = cur[k] + cur[key_seq[-1]] = val + + def _get(key_seq: List[str]) -> CN: + cur = cfg + for k in key_seq: + cur = cur[k] + return cur + + def _del(key_seq: List[str]) -> None: + cur = cfg + for k in key_seq[:-1]: + cur = cur[k] + del cur[key_seq[-1]] + if len(cur) == 0 and len(key_seq) > 1: + _del(key_seq[:-1]) + + _set(new_keys, _get(old_keys)) + _del(old_keys) + + +class _RenameConverter: + """ + A converter that handles simple rename. + """ + + RENAME: List[Tuple[str, str]] = [] # list of tuples of (old name, new name) + + @classmethod + def upgrade(cls, cfg: CN) -> None: + for old, new in cls.RENAME: + _rename(cfg, old, new) + + @classmethod + def downgrade(cls, cfg: CN) -> None: + for old, new in cls.RENAME[::-1]: + _rename(cfg, new, old) + + +class ConverterV1(_RenameConverter): + RENAME = [("MODEL.RPN_HEAD.NAME", "MODEL.RPN.HEAD_NAME")] + + +class ConverterV2(_RenameConverter): + """ + A large bulk of rename, before public release. + """ + + RENAME = [ + ("MODEL.WEIGHT", "MODEL.WEIGHTS"), + ("MODEL.PANOPTIC_FPN.SEMANTIC_LOSS_SCALE", "MODEL.SEM_SEG_HEAD.LOSS_WEIGHT"), + ("MODEL.PANOPTIC_FPN.RPN_LOSS_SCALE", "MODEL.RPN.LOSS_WEIGHT"), + ("MODEL.PANOPTIC_FPN.INSTANCE_LOSS_SCALE", "MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT"), + ("MODEL.PANOPTIC_FPN.COMBINE_ON", "MODEL.PANOPTIC_FPN.COMBINE.ENABLED"), + ( + "MODEL.PANOPTIC_FPN.COMBINE_OVERLAP_THRESHOLD", + "MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH", + ), + ( + "MODEL.PANOPTIC_FPN.COMBINE_STUFF_AREA_LIMIT", + "MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT", + ), + ( + "MODEL.PANOPTIC_FPN.COMBINE_INSTANCES_CONFIDENCE_THRESHOLD", + "MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH", + ), + ("MODEL.ROI_HEADS.SCORE_THRESH", "MODEL.ROI_HEADS.SCORE_THRESH_TEST"), + ("MODEL.ROI_HEADS.NMS", "MODEL.ROI_HEADS.NMS_THRESH_TEST"), + ("MODEL.RETINANET.INFERENCE_SCORE_THRESHOLD", "MODEL.RETINANET.SCORE_THRESH_TEST"), + ("MODEL.RETINANET.INFERENCE_TOPK_CANDIDATES", "MODEL.RETINANET.TOPK_CANDIDATES_TEST"), + ("MODEL.RETINANET.INFERENCE_NMS_THRESHOLD", "MODEL.RETINANET.NMS_THRESH_TEST"), + ("TEST.DETECTIONS_PER_IMG", "TEST.DETECTIONS_PER_IMAGE"), + ("TEST.AUG_ON", "TEST.AUG.ENABLED"), + ("TEST.AUG_MIN_SIZES", "TEST.AUG.MIN_SIZES"), + ("TEST.AUG_MAX_SIZE", "TEST.AUG.MAX_SIZE"), + ("TEST.AUG_FLIP", "TEST.AUG.FLIP"), + ] + + @classmethod + def upgrade(cls, cfg: CN) -> None: + super().upgrade(cfg) + + if cfg.MODEL.META_ARCHITECTURE == "RetinaNet": + _rename( + cfg, "MODEL.RETINANET.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS" + ) + _rename(cfg, "MODEL.RETINANET.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES") + del cfg["MODEL"]["RPN"]["ANCHOR_SIZES"] + del cfg["MODEL"]["RPN"]["ANCHOR_ASPECT_RATIOS"] + else: + _rename(cfg, "MODEL.RPN.ANCHOR_ASPECT_RATIOS", "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS") + _rename(cfg, "MODEL.RPN.ANCHOR_SIZES", "MODEL.ANCHOR_GENERATOR.SIZES") + del cfg["MODEL"]["RETINANET"]["ANCHOR_SIZES"] + del cfg["MODEL"]["RETINANET"]["ANCHOR_ASPECT_RATIOS"] + del cfg["MODEL"]["RETINANET"]["ANCHOR_STRIDES"] + + @classmethod + def downgrade(cls, cfg: CN) -> None: + super().downgrade(cfg) + + _rename(cfg, "MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS", "MODEL.RPN.ANCHOR_ASPECT_RATIOS") + _rename(cfg, 
"MODEL.ANCHOR_GENERATOR.SIZES", "MODEL.RPN.ANCHOR_SIZES") + cfg.MODEL.RETINANET.ANCHOR_ASPECT_RATIOS = cfg.MODEL.RPN.ANCHOR_ASPECT_RATIOS + cfg.MODEL.RETINANET.ANCHOR_SIZES = cfg.MODEL.RPN.ANCHOR_SIZES + cfg.MODEL.RETINANET.ANCHOR_STRIDES = [] # this is not used anywhere in any version diff --git a/detectron2/config/config.py b/detectron2/config/config.py new file mode 100644 index 0000000000000000000000000000000000000000..14ad524f00e706ddba567a62f805481c2f185a8e --- /dev/null +++ b/detectron2/config/config.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import functools +import inspect +import logging +from fvcore.common.config import CfgNode as _CfgNode +from fvcore.common.file_io import PathManager + + +class CfgNode(_CfgNode): + """ + The same as `fvcore.common.config.CfgNode`, but different in: + + 1. Use unsafe yaml loading by default. + Note that this may lead to arbitrary code execution: you must not + load a config file from untrusted sources before manually inspecting + the content of the file. + 2. Support config versioning. + When attempting to merge an old config, it will convert the old config automatically. + """ + + # Note that the default value of allow_unsafe is changed to True + def merge_from_file(self, cfg_filename: str, allow_unsafe: bool = True) -> None: + assert PathManager.isfile(cfg_filename), f"Config file '{cfg_filename}' does not exist!" + loaded_cfg = _CfgNode.load_yaml_with_base(cfg_filename, allow_unsafe=allow_unsafe) + loaded_cfg = type(self)(loaded_cfg) + + # defaults.py needs to import CfgNode + from .defaults import _C + + latest_ver = _C.VERSION + assert ( + latest_ver == self.VERSION + ), "CfgNode.merge_from_file is only allowed on a config object of latest version!" + + logger = logging.getLogger(__name__) + + loaded_ver = loaded_cfg.get("VERSION", None) + if loaded_ver is None: + from .compat import guess_version + + loaded_ver = guess_version(loaded_cfg, cfg_filename) + assert loaded_ver <= self.VERSION, "Cannot merge a v{} config into a v{} config.".format( + loaded_ver, self.VERSION + ) + + if loaded_ver == self.VERSION: + self.merge_from_other_cfg(loaded_cfg) + else: + # compat.py needs to import CfgNode + from .compat import upgrade_config, downgrade_config + + logger.warning( + "Loading an old v{} config file '{}' by automatically upgrading to v{}. " + "See docs/CHANGELOG.md for instructions to update your files.".format( + loaded_ver, cfg_filename, self.VERSION + ) + ) + # To convert, first obtain a full config at an old version + old_self = downgrade_config(self, to_version=loaded_ver) + old_self.merge_from_other_cfg(loaded_cfg) + new_config = upgrade_config(old_self) + self.clear() + self.update(new_config) + + def dump(self, *args, **kwargs): + """ + Returns: + str: a yaml string representation of the config + """ + # to make it show up in docs + return super().dump(*args, **kwargs) + + +global_cfg = CfgNode() + + +def get_cfg() -> CfgNode: + """ + Get a copy of the default config. + + Returns: + a detectron2 CfgNode instance. + """ + from .defaults import _C + + return _C.clone() + + +def set_global_cfg(cfg: CfgNode) -> None: + """ + Let the global config point to the given cfg. + + Assume that the given "cfg" has the key "KEY", after calling + `set_global_cfg(cfg)`, the key can be accessed by: + + .. 
code-block:: python + + from detectron2.config import global_cfg + print(global_cfg.KEY) + + By using a hacky global config, you can access these configs anywhere, + without having to pass the config object or the values deep into the code. + This is a hacky feature introduced for quick prototyping / research exploration. + """ + global global_cfg + global_cfg.clear() + global_cfg.update(cfg) + + +def configurable(init_func): + """ + Decorate a class's __init__ method so that it can be called with a CfgNode + object using the class's from_config classmethod. + + Examples: + + .. code-block:: python + + class A: + @configurable + def __init__(self, a, b=2, c=3): + pass + + @classmethod + def from_config(cls, cfg): + # Returns kwargs to be passed to __init__ + return {"a": cfg.A, "b": cfg.B} + + a1 = A(a=1, b=2) # regular construction + a2 = A(cfg) # construct with a cfg + a3 = A(cfg, b=3, c=4) # construct with extra overwrite + """ + assert init_func.__name__ == "__init__", "@configurable should only be used for __init__!" + if init_func.__module__.startswith("detectron2."): + assert ( + init_func.__doc__ is not None and "experimental" in init_func.__doc__ + ), f"configurable {init_func} should be marked experimental" + + @functools.wraps(init_func) + def wrapped(self, *args, **kwargs): + try: + from_config_func = type(self).from_config + except AttributeError: + raise AttributeError("Class with @configurable must have a 'from_config' classmethod.") + if not inspect.ismethod(from_config_func): + raise TypeError("Class with @configurable must have a 'from_config' classmethod.") + + if _called_with_cfg(*args, **kwargs): + explicit_args = _get_args_from_config(from_config_func, *args, **kwargs) + init_func(self, **explicit_args) + else: + init_func(self, *args, **kwargs) + + return wrapped + + +def _get_args_from_config(from_config_func, *args, **kwargs): + """ + Use `from_config` to obtain explicit arguments. + + Returns: + dict: arguments to be used for cls.__init__ + """ + signature = inspect.signature(from_config_func) + if list(signature.parameters.keys())[0] != "cfg": + raise TypeError( + f"{from_config_func.__self__}.from_config must take 'cfg' as the first argument!" + ) + support_var_arg = any( + param.kind in [param.VAR_POSITIONAL, param.VAR_KEYWORD] + for param in signature.parameters.values() + ) + if support_var_arg: # forward all arguments to from_config, if from_config accepts them + ret = from_config_func(*args, **kwargs) + else: + # forward supported arguments to from_config + supported_arg_names = set(signature.parameters.keys()) + extra_kwargs = {} + for name in list(kwargs.keys()): + if name not in supported_arg_names: + extra_kwargs[name] = kwargs.pop(name) + ret = from_config_func(*args, **kwargs) + # forward the other arguments to __init__ + ret.update(extra_kwargs) + return ret + + +def _called_with_cfg(*args, **kwargs): + """ + Returns: + bool: whether the arguments contain CfgNode and should be considered + forwarded to from_config. + """ + if len(args) and isinstance(args[0], _CfgNode): + return True + if isinstance(kwargs.pop("cfg", None), _CfgNode): + return True + # `from_config`'s first argument is forced to be "cfg". + # So the above check covers all cases. + return False diff --git a/detectron2/config/defaults.py b/detectron2/config/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..88bb89ed49f5e618df56719d1640011a3df2cb0f --- /dev/null +++ b/detectron2/config/defaults.py @@ -0,0 +1,598 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved +from .config import CfgNode as CN + +# ----------------------------------------------------------------------------- +# Convention about Training / Test specific parameters +# ----------------------------------------------------------------------------- +# Whenever an argument can be either used for training or for testing, the +# corresponding name will be post-fixed by a _TRAIN for a training parameter, +# or _TEST for a test-specific parameter. +# For example, the number of images during training will be +# IMAGES_PER_BATCH_TRAIN, while the number of images for testing will be +# IMAGES_PER_BATCH_TEST + +# ----------------------------------------------------------------------------- +# Config definition +# ----------------------------------------------------------------------------- + +_C = CN() + +# The version number, to upgrade from old configs to new ones if any +# changes happen. It's recommended to keep a VERSION in your config file. +_C.VERSION = 2 + +_C.MODEL = CN() +_C.MODEL.LOAD_PROPOSALS = False +_C.MODEL.MASK_ON = False +_C.MODEL.KEYPOINT_ON = False +_C.MODEL.DEVICE = "cuda" +_C.MODEL.META_ARCHITECTURE = "GeneralizedRCNN" + +# Path (possibly with schema like catalog:// or detectron2://) to a checkpoint file +# to be loaded to the model. You can find available models in the model zoo. +_C.MODEL.WEIGHTS = "" + +# Values to be used for image normalization (BGR order, since INPUT.FORMAT defaults to BGR). +# To train on images of different number of channels, just set different mean & std. +# Default values are the mean pixel value from ImageNet: [103.53, 116.28, 123.675] +_C.MODEL.PIXEL_MEAN = [103.530, 116.280, 123.675] +# When using pre-trained models in Detectron1 or any MSRA models, +# std has been absorbed into its conv1 weights, so the std needs to be set 1. +# Otherwise, you can use [57.375, 57.120, 58.395] (ImageNet std) +_C.MODEL.PIXEL_STD = [1.0, 1.0, 1.0] + + +# ----------------------------------------------------------------------------- +# INPUT +# ----------------------------------------------------------------------------- +_C.INPUT = CN() +# Size of the smallest side of the image during training +_C.INPUT.MIN_SIZE_TRAIN = (800,) +# Sample size of smallest side by choice or random selection from range give by +# INPUT.MIN_SIZE_TRAIN +_C.INPUT.MIN_SIZE_TRAIN_SAMPLING = "choice" +# Maximum size of the side of the image during training +_C.INPUT.MAX_SIZE_TRAIN = 1333 +# Size of the smallest side of the image during testing. Set to zero to disable resize in testing. +_C.INPUT.MIN_SIZE_TEST = 800 +# Maximum size of the side of the image during testing +_C.INPUT.MAX_SIZE_TEST = 1333 + +# `True` if cropping is used for data augmentation during training +_C.INPUT.CROP = CN({"ENABLED": False}) +# Cropping type: +# - "relative" crop (H * CROP.SIZE[0], W * CROP.SIZE[1]) part of an input of size (H, W) +# - "relative_range" uniformly sample relative crop size from between [CROP.SIZE[0], [CROP.SIZE[1]]. +# and [1, 1] and use it as in "relative" scenario. +# - "absolute" crop part of an input with absolute size: (CROP.SIZE[0], CROP.SIZE[1]). +_C.INPUT.CROP.TYPE = "relative_range" +# Size of crop in range (0, 1] if CROP.TYPE is "relative" or "relative_range" and in number of +# pixels if CROP.TYPE is "absolute" +_C.INPUT.CROP.SIZE = [0.9, 0.9] + + +# Whether the model needs RGB, YUV, HSV etc. 
+# Should be one of the modes defined here, as we use PIL to read the image: +# https://pillow.readthedocs.io/en/stable/handbook/concepts.html#concept-modes +# with BGR being the one exception. One can set image format to BGR, we will +# internally use RGB for conversion and flip the channels over +_C.INPUT.FORMAT = "BGR" +# The ground truth mask format that the model will use. +# Mask R-CNN supports either "polygon" or "bitmask" as ground truth. +_C.INPUT.MASK_FORMAT = "polygon" # alternative: "bitmask" + + +# ----------------------------------------------------------------------------- +# Dataset +# ----------------------------------------------------------------------------- +_C.DATASETS = CN() +# List of the dataset names for training. Must be registered in DatasetCatalog +_C.DATASETS.TRAIN = () +# List of the pre-computed proposal files for training, which must be consistent +# with datasets listed in DATASETS.TRAIN. +_C.DATASETS.PROPOSAL_FILES_TRAIN = () +# Number of top scoring precomputed proposals to keep for training +_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN = 2000 +# List of the dataset names for testing. Must be registered in DatasetCatalog +_C.DATASETS.TEST = () +# List of the pre-computed proposal files for test, which must be consistent +# with datasets listed in DATASETS.TEST. +_C.DATASETS.PROPOSAL_FILES_TEST = () +# Number of top scoring precomputed proposals to keep for test +_C.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST = 1000 + +# ----------------------------------------------------------------------------- +# DataLoader +# ----------------------------------------------------------------------------- +_C.DATALOADER = CN() +# Number of data loading threads +_C.DATALOADER.NUM_WORKERS = 4 +# If True, each batch should contain only images for which the aspect ratio +# is compatible. This groups portrait images together, and landscape images +# are not batched with portrait images. +_C.DATALOADER.ASPECT_RATIO_GROUPING = True +# Options: TrainingSampler, RepeatFactorTrainingSampler +_C.DATALOADER.SAMPLER_TRAIN = "TrainingSampler" +# Repeat threshold for RepeatFactorTrainingSampler +_C.DATALOADER.REPEAT_THRESHOLD = 0.0 +# if True, the dataloader will filter out images that have no associated +# annotations at train time. +_C.DATALOADER.FILTER_EMPTY_ANNOTATIONS = True + +# ---------------------------------------------------------------------------- # +# Backbone options +# ---------------------------------------------------------------------------- # +_C.MODEL.BACKBONE = CN() + +_C.MODEL.BACKBONE.NAME = "build_resnet_backbone" +# Freeze the first several stages so they are not trained. +# There are 5 stages in ResNet. The first is a convolution, and the following +# stages are each group of residual blocks. +_C.MODEL.BACKBONE.FREEZE_AT = 2 + + +# ---------------------------------------------------------------------------- # +# FPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.FPN = CN() +# Names of the input feature maps to be used by FPN +# They must have contiguous power of 2 strides +# e.g., ["res2", "res3", "res4", "res5"] +_C.MODEL.FPN.IN_FEATURES = [] +_C.MODEL.FPN.OUT_CHANNELS = 256 + +# Options: "" (no norm), "GN" +_C.MODEL.FPN.NORM = "" + +# Types for fusing the FPN top-down and lateral features. 
Can be either "sum" or "avg" +_C.MODEL.FPN.FUSE_TYPE = "sum" + + +# ---------------------------------------------------------------------------- # +# Proposal generator options +# ---------------------------------------------------------------------------- # +_C.MODEL.PROPOSAL_GENERATOR = CN() +# Current proposal generators include "RPN", "RRPN" and "PrecomputedProposals" +_C.MODEL.PROPOSAL_GENERATOR.NAME = "RPN" +# Proposal height and width both need to be greater than MIN_SIZE +# (a the scale used during training or inference) +_C.MODEL.PROPOSAL_GENERATOR.MIN_SIZE = 0 + + +# ---------------------------------------------------------------------------- # +# Anchor generator options +# ---------------------------------------------------------------------------- # +_C.MODEL.ANCHOR_GENERATOR = CN() +# The generator can be any name in the ANCHOR_GENERATOR registry +_C.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator" +# Anchor sizes (i.e. sqrt of area) in absolute pixels w.r.t. the network input. +# Format: list[list[float]]. SIZES[i] specifies the list of sizes +# to use for IN_FEATURES[i]; len(SIZES) == len(IN_FEATURES) must be true, +# or len(SIZES) == 1 is true and size list SIZES[0] is used for all +# IN_FEATURES. +_C.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64, 128, 256, 512]] +# Anchor aspect ratios. For each area given in `SIZES`, anchors with different aspect +# ratios are generated by an anchor generator. +# Format: list[list[float]]. ASPECT_RATIOS[i] specifies the list of aspect ratios (H/W) +# to use for IN_FEATURES[i]; len(ASPECT_RATIOS) == len(IN_FEATURES) must be true, +# or len(ASPECT_RATIOS) == 1 is true and aspect ratio list ASPECT_RATIOS[0] is used +# for all IN_FEATURES. +_C.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.5, 1.0, 2.0]] +# Anchor angles. +# list[list[float]], the angle in degrees, for each input feature map. +# ANGLES[i] specifies the list of angles for IN_FEATURES[i]. +_C.MODEL.ANCHOR_GENERATOR.ANGLES = [[-90, 0, 90]] +# Relative offset between the center of the first anchor and the top-left corner of the image +# Value has to be in [0, 1). Recommend to use 0.5, which means half stride. +# The value is not expected to affect model accuracy. +_C.MODEL.ANCHOR_GENERATOR.OFFSET = 0.0 + +# ---------------------------------------------------------------------------- # +# RPN options +# ---------------------------------------------------------------------------- # +_C.MODEL.RPN = CN() +_C.MODEL.RPN.HEAD_NAME = "StandardRPNHead" # used by RPN_HEAD_REGISTRY + +# Names of the input feature maps to be used by RPN +# e.g., ["p2", "p3", "p4", "p5", "p6"] for FPN +_C.MODEL.RPN.IN_FEATURES = ["res4"] +# Remove RPN anchors that go outside the image by BOUNDARY_THRESH pixels +# Set to -1 or a large value, e.g. 
100000, to disable pruning anchors +_C.MODEL.RPN.BOUNDARY_THRESH = -1 +# IOU overlap ratios [BG_IOU_THRESHOLD, FG_IOU_THRESHOLD] +# Minimum overlap required between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a positive example (IoU >= FG_IOU_THRESHOLD +# ==> positive RPN example: 1) +# Maximum overlap allowed between an anchor and ground-truth box for the +# (anchor, gt box) pair to be a negative examples (IoU < BG_IOU_THRESHOLD +# ==> negative RPN example: 0) +# Anchors with overlap in between (BG_IOU_THRESHOLD <= IoU < FG_IOU_THRESHOLD) +# are ignored (-1) +_C.MODEL.RPN.IOU_THRESHOLDS = [0.3, 0.7] +_C.MODEL.RPN.IOU_LABELS = [0, -1, 1] +# Total number of RPN examples per image +_C.MODEL.RPN.BATCH_SIZE_PER_IMAGE = 256 +# Target fraction of foreground (positive) examples per RPN minibatch +_C.MODEL.RPN.POSITIVE_FRACTION = 0.5 +# Weights on (dx, dy, dw, dh) for normalizing RPN anchor regression targets +_C.MODEL.RPN.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) +# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. +_C.MODEL.RPN.SMOOTH_L1_BETA = 0.0 +_C.MODEL.RPN.LOSS_WEIGHT = 1.0 +# Number of top scoring RPN proposals to keep before applying NMS +# When FPN is used, this is *per FPN level* (not total) +_C.MODEL.RPN.PRE_NMS_TOPK_TRAIN = 12000 +_C.MODEL.RPN.PRE_NMS_TOPK_TEST = 6000 +# Number of top scoring RPN proposals to keep after applying NMS +# When FPN is used, this limit is applied per level and then again to the union +# of proposals from all levels +# NOTE: When FPN is used, the meaning of this config is different from Detectron1. +# It means per-batch topk in Detectron1, but per-image topk here. +# See "modeling/rpn/rpn_outputs.py" for details. +_C.MODEL.RPN.POST_NMS_TOPK_TRAIN = 2000 +_C.MODEL.RPN.POST_NMS_TOPK_TEST = 1000 +# NMS threshold used on RPN proposals +_C.MODEL.RPN.NMS_THRESH = 0.7 + +# ---------------------------------------------------------------------------- # +# ROI HEADS options +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_HEADS = CN() +_C.MODEL.ROI_HEADS.NAME = "Res5ROIHeads" +# Number of foreground classes +_C.MODEL.ROI_HEADS.NUM_CLASSES = 80 +# Names of the input feature maps to be used by ROI heads +# Currently all heads (box, mask, ...) use the same input feature map list +# e.g., ["p2", "p3", "p4", "p5"] is commonly used for FPN +_C.MODEL.ROI_HEADS.IN_FEATURES = ["res4"] +# IOU overlap ratios [IOU_THRESHOLD] +# Overlap threshold for an RoI to be considered background (if < IOU_THRESHOLD) +# Overlap threshold for an RoI to be considered foreground (if >= IOU_THRESHOLD) +_C.MODEL.ROI_HEADS.IOU_THRESHOLDS = [0.5] +_C.MODEL.ROI_HEADS.IOU_LABELS = [0, 1] +# RoI minibatch size *per image* (number of regions of interest [ROIs]) +# Total number of RoIs per training minibatch = +# ROI_HEADS.BATCH_SIZE_PER_IMAGE * SOLVER.IMS_PER_BATCH +# E.g., a common configuration is: 512 * 16 = 8192 +_C.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 512 +# Target fraction of RoI minibatch that is labeled foreground (i.e. class > 0) +_C.MODEL.ROI_HEADS.POSITIVE_FRACTION = 0.25 + +# Only used on test mode + +# Minimum score threshold (assuming scores in a [0, 1] range); a value chosen to +# balance obtaining high recall with not having too many low precision +# detections that will slow down inference post processing steps (like NMS) +# A default threshold of 0.0 increases AP by ~0.2-0.3 but significantly slows down +# inference. 
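+# (Command-line tools such as demo.py override this value at runtime, e.g. via
+# the --confidence-threshold flag.)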
+_C.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.05 +# Overlap threshold used for non-maximum suppression (suppress boxes with +# IoU >= this threshold) +_C.MODEL.ROI_HEADS.NMS_THRESH_TEST = 0.5 +# If True, augment proposals with ground-truth boxes before sampling proposals to +# train ROI heads. +_C.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT = True + +# ---------------------------------------------------------------------------- # +# Box Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_BOX_HEAD = CN() +# C4 don't use head name option +# Options for non-C4 models: FastRCNNConvFCHead, +_C.MODEL.ROI_BOX_HEAD.NAME = "" +# Default weights on (dx, dy, dw, dh) for normalizing bbox regression targets +# These are empirically chosen to approximately lead to unit variance targets +_C.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0) +# The transition point from L1 to L2 loss. Set to 0.0 to make the loss simply L1. +_C.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA = 0.0 +_C.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO = 0 +# Type of pooling operation applied to the incoming feature map for each RoI +_C.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" + +_C.MODEL.ROI_BOX_HEAD.NUM_FC = 0 +# Hidden layer dimension for FC layers in the RoI box head +_C.MODEL.ROI_BOX_HEAD.FC_DIM = 1024 +_C.MODEL.ROI_BOX_HEAD.NUM_CONV = 0 +# Channel dimension for Conv layers in the RoI box head +_C.MODEL.ROI_BOX_HEAD.CONV_DIM = 256 +# Normalization method for the convolution layers. +# Options: "" (no norm), "GN", "SyncBN". +_C.MODEL.ROI_BOX_HEAD.NORM = "" +# Whether to use class agnostic for bbox regression +_C.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG = False +# If true, RoI heads use bounding boxes predicted by the box head rather than proposal boxes. +_C.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES = False + +# ---------------------------------------------------------------------------- # +# Cascaded Box Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_BOX_CASCADE_HEAD = CN() +# The number of cascade stages is implicitly defined by the length of the following two configs. +_C.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS = ( + (10.0, 10.0, 5.0, 5.0), + (20.0, 20.0, 10.0, 10.0), + (30.0, 30.0, 15.0, 15.0), +) +_C.MODEL.ROI_BOX_CASCADE_HEAD.IOUS = (0.5, 0.6, 0.7) + + +# ---------------------------------------------------------------------------- # +# Mask Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_MASK_HEAD = CN() +_C.MODEL.ROI_MASK_HEAD.NAME = "MaskRCNNConvUpsampleHead" +_C.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_MASK_HEAD.NUM_CONV = 0 # The number of convs in the mask head +_C.MODEL.ROI_MASK_HEAD.CONV_DIM = 256 +# Normalization method for the convolution layers. +# Options: "" (no norm), "GN", "SyncBN". 
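+# For example, the GroupNorm baselines in the model zoo set this (and
+# ROI_BOX_HEAD.NORM) to "GN"; the default "" keeps the head free of normalization.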
+_C.MODEL.ROI_MASK_HEAD.NORM = "" +# Whether to use class agnostic for mask prediction +_C.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK = False +# Type of pooling operation applied to the incoming feature map for each RoI +_C.MODEL.ROI_MASK_HEAD.POOLER_TYPE = "ROIAlignV2" + + +# ---------------------------------------------------------------------------- # +# Keypoint Head +# ---------------------------------------------------------------------------- # +_C.MODEL.ROI_KEYPOINT_HEAD = CN() +_C.MODEL.ROI_KEYPOINT_HEAD.NAME = "KRCNNConvDeconvUpsampleHead" +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION = 14 +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO = 0 +_C.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS = tuple(512 for _ in range(8)) +_C.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS = 17 # 17 is the number of keypoints in COCO. + +# Images with too few (or no) keypoints are excluded from training. +_C.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE = 1 +# Normalize by the total number of visible keypoints in the minibatch if True. +# Otherwise, normalize by the total number of keypoints that could ever exist +# in the minibatch. +# The keypoint softmax loss is only calculated on visible keypoints. +# Since the number of visible keypoints can vary significantly between +# minibatches, this has the effect of up-weighting the importance of +# minibatches with few visible keypoints. (Imagine the extreme case of +# only one visible keypoint versus N: in the case of N, each one +# contributes 1/N to the gradient compared to the single keypoint +# determining the gradient direction). Instead, we can normalize the +# loss by the total number of keypoints, if it were the case that all +# keypoints were visible in a full minibatch. (Returning to the example, +# this means that the one visible keypoint contributes as much as each +# of the N keypoints.) +_C.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS = True +# Multi-task loss weight to use for keypoints +# Recommended values: +# - use 1.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is True +# - use 4.0 if NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS is False +_C.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT = 1.0 +# Type of pooling operation applied to the incoming feature map for each RoI +_C.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE = "ROIAlignV2" + +# ---------------------------------------------------------------------------- # +# Semantic Segmentation Head +# ---------------------------------------------------------------------------- # +_C.MODEL.SEM_SEG_HEAD = CN() +_C.MODEL.SEM_SEG_HEAD.NAME = "SemSegFPNHead" +_C.MODEL.SEM_SEG_HEAD.IN_FEATURES = ["p2", "p3", "p4", "p5"] +# Label in the semantic segmentation ground truth that is ignored, i.e., no loss is calculated for +# the correposnding pixel. +_C.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255 +# Number of classes in the semantic segmentation head +_C.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 54 +# Number of channels in the 3x3 convs inside semantic-FPN heads. +_C.MODEL.SEM_SEG_HEAD.CONVS_DIM = 128 +# Outputs from semantic-FPN heads are up-scaled to the COMMON_STRIDE stride. +_C.MODEL.SEM_SEG_HEAD.COMMON_STRIDE = 4 +# Normalization method for the convolution layers. Options: "" (no norm), "GN". +_C.MODEL.SEM_SEG_HEAD.NORM = "GN" +_C.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT = 1.0 + +_C.MODEL.PANOPTIC_FPN = CN() +# Scaling of all losses from instance detection / segmentation head. 
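+# This factor multiplies the instance (detection / mask) head losses before they
+# are summed with the semantic segmentation loss, which carries its own
+# SEM_SEG_HEAD.LOSS_WEIGHT above.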
+_C.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT = 1.0 + +# options when combining instance & semantic segmentation outputs +_C.MODEL.PANOPTIC_FPN.COMBINE = CN({"ENABLED": True}) +_C.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH = 0.5 +_C.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT = 4096 +_C.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = 0.5 + + +# ---------------------------------------------------------------------------- # +# RetinaNet Head +# ---------------------------------------------------------------------------- # +_C.MODEL.RETINANET = CN() + +# This is the number of foreground classes. +_C.MODEL.RETINANET.NUM_CLASSES = 80 + +_C.MODEL.RETINANET.IN_FEATURES = ["p3", "p4", "p5", "p6", "p7"] + +# Convolutions to use in the cls and bbox tower +# NOTE: this doesn't include the last conv for logits +_C.MODEL.RETINANET.NUM_CONVS = 4 + +# IoU overlap ratio [bg, fg] for labeling anchors. +# Anchors with < bg are labeled negative (0) +# Anchors with >= bg and < fg are ignored (-1) +# Anchors with >= fg are labeled positive (1) +_C.MODEL.RETINANET.IOU_THRESHOLDS = [0.4, 0.5] +_C.MODEL.RETINANET.IOU_LABELS = [0, -1, 1] + +# Prior prob for rare case (i.e. foreground) at the beginning of training. +# This is used to set the bias for the logits layer of the classifier subnet. +# This improves training stability in the case of heavy class imbalance. +_C.MODEL.RETINANET.PRIOR_PROB = 0.01 + +# Inference cls score threshold, only anchors with score > INFERENCE_TH are +# considered for inference (to improve speed) +_C.MODEL.RETINANET.SCORE_THRESH_TEST = 0.05 +_C.MODEL.RETINANET.TOPK_CANDIDATES_TEST = 1000 +_C.MODEL.RETINANET.NMS_THRESH_TEST = 0.5 + +# Weights on (dx, dy, dw, dh) for normalizing Retinanet anchor regression targets +_C.MODEL.RETINANET.BBOX_REG_WEIGHTS = (1.0, 1.0, 1.0, 1.0) + +# Loss parameters +_C.MODEL.RETINANET.FOCAL_LOSS_GAMMA = 2.0 +_C.MODEL.RETINANET.FOCAL_LOSS_ALPHA = 0.25 +_C.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA = 0.1 + + +# ---------------------------------------------------------------------------- # +# ResNe[X]t options (ResNets = {ResNet, ResNeXt} +# Note that parts of a resnet may be used for both the backbone and the head +# These options apply to both +# ---------------------------------------------------------------------------- # +_C.MODEL.RESNETS = CN() + +_C.MODEL.RESNETS.DEPTH = 50 +_C.MODEL.RESNETS.OUT_FEATURES = ["res4"] # res4 for C4 backbone, res2..5 for FPN backbone + +# Number of groups to use; 1 ==> ResNet; > 1 ==> ResNeXt +_C.MODEL.RESNETS.NUM_GROUPS = 1 + +# Options: FrozenBN, GN, "SyncBN", "BN" +_C.MODEL.RESNETS.NORM = "FrozenBN" + +# Baseline width of each group. +# Scaling this parameters will scale the width of all bottleneck layers. +_C.MODEL.RESNETS.WIDTH_PER_GROUP = 64 + +# Place the stride 2 conv on the 1x1 filter +# Use True only for the original MSRA ResNet; use False for C2 and Torch models +_C.MODEL.RESNETS.STRIDE_IN_1X1 = True + +# Apply dilation in stage "res5" +_C.MODEL.RESNETS.RES5_DILATION = 1 + +# Output width of res2. Scaling this parameters will scale the width of all 1x1 convs in ResNet +# For R18 and R34, this needs to be set to 64 +_C.MODEL.RESNETS.RES2_OUT_CHANNELS = 256 +_C.MODEL.RESNETS.STEM_OUT_CHANNELS = 64 + +# Apply Deformable Convolution in stages +# Specify if apply deform_conv on Res2, Res3, Res4, Res5 +_C.MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, False, False, False] +# Use True to use modulated deform_conv (DeformableV2, https://arxiv.org/abs/1811.11168); +# Use False for DeformableV1. 
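+# For example, the deformable-conv configs in the model zoo enable it on res3-res5:
+#   MODEL.RESNETS.DEFORM_ON_PER_STAGE = [False, True, True, True]
+# (the list order is res2, res3, res4, res5).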
+_C.MODEL.RESNETS.DEFORM_MODULATED = False +# Number of groups in deformable conv. +_C.MODEL.RESNETS.DEFORM_NUM_GROUPS = 1 + + +# ---------------------------------------------------------------------------- # +# Solver +# ---------------------------------------------------------------------------- # +_C.SOLVER = CN() + +# See detectron2/solver/build.py for LR scheduler options +_C.SOLVER.LR_SCHEDULER_NAME = "WarmupMultiStepLR" + +_C.SOLVER.MAX_ITER = 40000 + +_C.SOLVER.BASE_LR = 0.001 + +_C.SOLVER.MOMENTUM = 0.9 + +_C.SOLVER.NESTEROV = False + +_C.SOLVER.WEIGHT_DECAY = 0.0001 +# The weight decay that's applied to parameters of normalization layers +# (typically the affine transformation) +_C.SOLVER.WEIGHT_DECAY_NORM = 0.0 + +_C.SOLVER.GAMMA = 0.1 +# The iteration number to decrease learning rate by GAMMA. +_C.SOLVER.STEPS = (30000,) + +_C.SOLVER.WARMUP_FACTOR = 1.0 / 1000 +_C.SOLVER.WARMUP_ITERS = 1000 +_C.SOLVER.WARMUP_METHOD = "linear" + +# Save a checkpoint after every this number of iterations +_C.SOLVER.CHECKPOINT_PERIOD = 5000 + +# Number of images per batch across all machines. +# If we have 16 GPUs and IMS_PER_BATCH = 32, +# each GPU will see 2 images per batch. +_C.SOLVER.IMS_PER_BATCH = 16 + +# Detectron v1 (and previous detection code) used a 2x higher LR and 0 WD for +# biases. This is not useful (at least for recent models). You should avoid +# changing these and they exist only to reproduce Detectron v1 training if +# desired. +_C.SOLVER.BIAS_LR_FACTOR = 1.0 +_C.SOLVER.WEIGHT_DECAY_BIAS = _C.SOLVER.WEIGHT_DECAY + +# Gradient clipping +_C.SOLVER.CLIP_GRADIENTS = CN({"ENABLED": False}) +# Type of gradient clipping, currently 2 values are supported: +# - "value": the absolute values of elements of each gradients are clipped +# - "norm": the norm of the gradient for each parameter is clipped thus +# affecting all elements in the parameter +_C.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "value" +# Maximum absolute value used for clipping gradients +_C.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 1.0 +# Floating point number p for L-p norm to be used with the "norm" +# gradient clipping type; for L-inf, please specify .inf +_C.SOLVER.CLIP_GRADIENTS.NORM_TYPE = 2.0 + +# ---------------------------------------------------------------------------- # +# Specific test options +# ---------------------------------------------------------------------------- # +_C.TEST = CN() +# For end-to-end tests to verify the expected accuracy. +# Each item is [task, metric, value, tolerance] +# e.g.: [['bbox', 'AP', 38.5, 0.2]] +_C.TEST.EXPECTED_RESULTS = [] +# The period (in terms of steps) to evaluate the model during training. +# Set to 0 to disable. +_C.TEST.EVAL_PERIOD = 0 +# The sigmas used to calculate keypoint OKS. See http://cocodataset.org/#keypoints-eval +# When empty it will use the defaults in COCO. +# Otherwise it should have the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. +_C.TEST.KEYPOINT_OKS_SIGMAS = [] +# Maximum number of detections to return per image during inference (100 is +# based on the limit established for the COCO dataset). 
+_C.TEST.DETECTIONS_PER_IMAGE = 100 + +_C.TEST.AUG = CN({"ENABLED": False}) +_C.TEST.AUG.MIN_SIZES = (400, 500, 600, 700, 800, 900, 1000, 1100, 1200) +_C.TEST.AUG.MAX_SIZE = 4000 +_C.TEST.AUG.FLIP = True + +_C.TEST.PRECISE_BN = CN({"ENABLED": False}) +_C.TEST.PRECISE_BN.NUM_ITER = 200 + +# ---------------------------------------------------------------------------- # +# Misc options +# ---------------------------------------------------------------------------- # +# Directory where output files are written +_C.OUTPUT_DIR = "./output" +# Set seed to negative to fully randomize everything. +# Set seed to positive to use a fixed seed. Note that a fixed seed increases +# reproducibility but does not guarantee fully deterministic behavior. +# Disabling all parallelism further increases reproducibility. +_C.SEED = -1 +# Benchmark different cudnn algorithms. +# If input images have very different sizes, this option will have large overhead +# for about 10k iterations. It usually hurts total time, but can benefit for certain models. +# If input images have the same or similar sizes, benchmark is often helpful. +_C.CUDNN_BENCHMARK = False +# The period (in terms of steps) for minibatch visualization at train time. +# Set to 0 to disable. +_C.VIS_PERIOD = 0 + +# global config is for quick hack purposes. +# You can set them in command line or config files, +# and access it with: +# +# from detectron2.config import global_cfg +# print(global_cfg.HACK) +# +# Do not commit any configs into it. +_C.GLOBAL = CN() +_C.GLOBAL.HACK = 1.0 diff --git a/detectron2/data/__init__.py b/detectron2/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b04d1dc275abef1a090a5f51c5d5a32d5541704d --- /dev/null +++ b/detectron2/data/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from . import transforms # isort:skip + +from .build import ( + build_detection_test_loader, + build_detection_train_loader, + get_detection_dataset_dicts, + load_proposals_into_dataset, + print_instances_class_histogram, +) +from .catalog import DatasetCatalog, MetadataCatalog +from .common import DatasetFromList, MapDataset +from .dataset_mapper import DatasetMapper + +# ensure the builtin datasets are registered +from . import datasets, samplers # isort:skip + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/detectron2/data/build.py b/detectron2/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7e85789d75daf4ee206449ce0d3254e948db16 --- /dev/null +++ b/detectron2/data/build.py @@ -0,0 +1,397 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import bisect +import copy +import itertools +import logging +import numpy as np +import operator +import pickle +import torch.utils.data +from fvcore.common.file_io import PathManager +from tabulate import tabulate +from termcolor import colored + +from detectron2.structures import BoxMode +from detectron2.utils.comm import get_world_size +from detectron2.utils.env import seed_all_rng +from detectron2.utils.logger import log_first_n + +from . import samplers +from .catalog import DatasetCatalog, MetadataCatalog +from .common import AspectRatioGroupedDataset, DatasetFromList, MapDataset +from .dataset_mapper import DatasetMapper +from .detection_utils import check_metadata_consistency + +""" +This file contains the default logic to build a dataloader for training or testing. 
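+
+A minimal usage sketch (assumes the datasets named in the config are registered):
+
+    from detectron2.config import get_cfg
+
+    cfg = get_cfg()
+    train_loader = build_detection_train_loader(cfg)                      # infinite iterator
+    test_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0])  # one pass, batch size 1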
+""" + +__all__ = [ + "build_detection_train_loader", + "build_detection_test_loader", + "get_detection_dataset_dicts", + "load_proposals_into_dataset", + "print_instances_class_histogram", +] + + +def filter_images_with_only_crowd_annotations(dataset_dicts): + """ + Filter out images with none annotations or only crowd annotations + (i.e., images without non-crowd annotations). + A common training-time preprocessing on COCO dataset. + + Args: + dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. + + Returns: + list[dict]: the same format, but filtered. + """ + num_before = len(dataset_dicts) + + def valid(anns): + for ann in anns: + if ann.get("iscrowd", 0) == 0: + return True + return False + + dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])] + num_after = len(dataset_dicts) + logger = logging.getLogger(__name__) + logger.info( + "Removed {} images with no usable annotations. {} images left.".format( + num_before - num_after, num_after + ) + ) + return dataset_dicts + + +def filter_images_with_few_keypoints(dataset_dicts, min_keypoints_per_image): + """ + Filter out images with too few number of keypoints. + + Args: + dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. + + Returns: + list[dict]: the same format as dataset_dicts, but filtered. + """ + num_before = len(dataset_dicts) + + def visible_keypoints_in_image(dic): + # Each keypoints field has the format [x1, y1, v1, ...], where v is visibility + annotations = dic["annotations"] + return sum( + (np.array(ann["keypoints"][2::3]) > 0).sum() + for ann in annotations + if "keypoints" in ann + ) + + dataset_dicts = [ + x for x in dataset_dicts if visible_keypoints_in_image(x) >= min_keypoints_per_image + ] + num_after = len(dataset_dicts) + logger = logging.getLogger(__name__) + logger.info( + "Removed {} images with fewer than {} keypoints.".format( + num_before - num_after, min_keypoints_per_image + ) + ) + return dataset_dicts + + +def load_proposals_into_dataset(dataset_dicts, proposal_file): + """ + Load precomputed object proposals into the dataset. + + The proposal file should be a pickled dict with the following keys: + + - "ids": list[int] or list[str], the image ids + - "boxes": list[np.ndarray], each is an Nx4 array of boxes corresponding to the image id + - "objectness_logits": list[np.ndarray], each is an N sized array of objectness scores + corresponding to the boxes. + - "bbox_mode": the BoxMode of the boxes array. Defaults to ``BoxMode.XYXY_ABS``. + + Args: + dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. + proposal_file (str): file path of pre-computed proposals, in pkl format. + + Returns: + list[dict]: the same format as dataset_dicts, but added proposal field. + """ + logger = logging.getLogger(__name__) + logger.info("Loading proposals from: {}".format(proposal_file)) + + with PathManager.open(proposal_file, "rb") as f: + proposals = pickle.load(f, encoding="latin1") + + # Rename the key names in D1 proposal files + rename_keys = {"indexes": "ids", "scores": "objectness_logits"} + for key in rename_keys: + if key in proposals: + proposals[rename_keys[key]] = proposals.pop(key) + + # Fetch the indexes of all proposals that are in the dataset + # Convert image_id to str since they could be int. 
+ img_ids = set({str(record["image_id"]) for record in dataset_dicts}) + id_to_index = {str(id): i for i, id in enumerate(proposals["ids"]) if str(id) in img_ids} + + # Assuming default bbox_mode of precomputed proposals are 'XYXY_ABS' + bbox_mode = BoxMode(proposals["bbox_mode"]) if "bbox_mode" in proposals else BoxMode.XYXY_ABS + + for record in dataset_dicts: + # Get the index of the proposal + i = id_to_index[str(record["image_id"])] + + boxes = proposals["boxes"][i] + objectness_logits = proposals["objectness_logits"][i] + # Sort the proposals in descending order of the scores + inds = objectness_logits.argsort()[::-1] + record["proposal_boxes"] = boxes[inds] + record["proposal_objectness_logits"] = objectness_logits[inds] + record["proposal_bbox_mode"] = bbox_mode + + return dataset_dicts + + +def _quantize(x, bin_edges): + bin_edges = copy.copy(bin_edges) + bin_edges = sorted(bin_edges) + quantized = list(map(lambda y: bisect.bisect_right(bin_edges, y), x)) + return quantized + + +def print_instances_class_histogram(dataset_dicts, class_names): + """ + Args: + dataset_dicts (list[dict]): list of dataset dicts. + class_names (list[str]): list of class names (zero-indexed). + """ + num_classes = len(class_names) + hist_bins = np.arange(num_classes + 1) + histogram = np.zeros((num_classes,), dtype=np.int) + for entry in dataset_dicts: + annos = entry["annotations"] + classes = [x["category_id"] for x in annos if not x.get("iscrowd", 0)] + histogram += np.histogram(classes, bins=hist_bins)[0] + + N_COLS = min(6, len(class_names) * 2) + + def short_name(x): + # make long class names shorter. useful for lvis + if len(x) > 13: + return x[:11] + ".." + return x + + data = list( + itertools.chain(*[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)]) + ) + total_num_instances = sum(data[1::2]) + data.extend([None] * (N_COLS - (len(data) % N_COLS))) + if num_classes > 1: + data.extend(["total", total_num_instances]) + data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) + table = tabulate( + data, + headers=["category", "#instances"] * (N_COLS // 2), + tablefmt="pipe", + numalign="left", + stralign="center", + ) + log_first_n( + logging.INFO, + "Distribution of instances among all {} categories:\n".format(num_classes) + + colored(table, "cyan"), + key="message", + ) + + +def get_detection_dataset_dicts( + dataset_names, filter_empty=True, min_keypoints=0, proposal_files=None +): + """ + Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. + + Args: + dataset_names (list[str]): a list of dataset names + filter_empty (bool): whether to filter out images without instance annotations + min_keypoints (int): filter out images with fewer keypoints than + `min_keypoints`. Set to 0 to do nothing. + proposal_files (list[str]): if given, a list of object proposal files + that match each dataset in `dataset_names`. 
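+
+    Returns:
+        list[dict]: dataset dicts from all datasets in `dataset_names`, merged into a
+            single list and filtered as requested above.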
+ """ + assert len(dataset_names) + dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names] + for dataset_name, dicts in zip(dataset_names, dataset_dicts): + assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) + + if proposal_files is not None: + assert len(dataset_names) == len(proposal_files) + # load precomputed proposals from proposal files + dataset_dicts = [ + load_proposals_into_dataset(dataset_i_dicts, proposal_file) + for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) + ] + + dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) + + has_instances = "annotations" in dataset_dicts[0] + # Keep images without instance-level GT if the dataset has semantic labels. + if filter_empty and has_instances and "sem_seg_file_name" not in dataset_dicts[0]: + dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) + + if min_keypoints > 0 and has_instances: + dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints) + + if has_instances: + try: + class_names = MetadataCatalog.get(dataset_names[0]).thing_classes + check_metadata_consistency("thing_classes", dataset_names) + print_instances_class_histogram(dataset_dicts, class_names) + except AttributeError: # class names are not available for this dataset + pass + return dataset_dicts + + +def build_detection_train_loader(cfg, mapper=None): + """ + A data loader is created by the following steps: + + 1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts. + 2. Coordinate a random shuffle order shared among all processes (all GPUs) + 3. Each process spawn another few workers to process the dicts. Each worker will: + * Map each metadata dict into another format to be consumed by the model. + * Batch them by simply putting dicts into a list. + + The batched ``list[mapped_dict]`` is what this dataloader will yield. + + Args: + cfg (CfgNode): the config + mapper (callable): a callable which takes a sample (dict) from dataset and + returns the format to be consumed by the model. + By default it will be `DatasetMapper(cfg, True)`. 
+ + Returns: + an infinite iterator of training data + """ + num_workers = get_world_size() + images_per_batch = cfg.SOLVER.IMS_PER_BATCH + assert ( + images_per_batch % num_workers == 0 + ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( + images_per_batch, num_workers + ) + assert ( + images_per_batch >= num_workers + ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( + images_per_batch, num_workers + ) + images_per_worker = images_per_batch // num_workers + + dataset_dicts = get_detection_dataset_dicts( + cfg.DATASETS.TRAIN, + filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, + proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, + ) + dataset = DatasetFromList(dataset_dicts, copy=False) + + if mapper is None: + mapper = DatasetMapper(cfg, True) + dataset = MapDataset(dataset, mapper) + + sampler_name = cfg.DATALOADER.SAMPLER_TRAIN + logger = logging.getLogger(__name__) + logger.info("Using training sampler {}".format(sampler_name)) + if sampler_name == "TrainingSampler": + sampler = samplers.TrainingSampler(len(dataset)) + elif sampler_name == "RepeatFactorTrainingSampler": + sampler = samplers.RepeatFactorTrainingSampler( + dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD + ) + else: + raise ValueError("Unknown training sampler: {}".format(sampler_name)) + + if cfg.DATALOADER.ASPECT_RATIO_GROUPING: + data_loader = torch.utils.data.DataLoader( + dataset, + sampler=sampler, + num_workers=cfg.DATALOADER.NUM_WORKERS, + batch_sampler=None, + collate_fn=operator.itemgetter(0), # don't batch, but yield individual elements + worker_init_fn=worker_init_reset_seed, + ) # yield individual mapped dict + data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker) + else: + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, images_per_worker, drop_last=True + ) + # drop_last so the batch always have the same size + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.DATALOADER.NUM_WORKERS, + batch_sampler=batch_sampler, + collate_fn=trivial_batch_collator, + worker_init_fn=worker_init_reset_seed, + ) + + return data_loader + + +def build_detection_test_loader(cfg, dataset_name, mapper=None): + """ + Similar to `build_detection_train_loader`. + But this function uses the given `dataset_name` argument (instead of the names in cfg), + and uses batch size 1. + + Args: + cfg: a detectron2 CfgNode + dataset_name (str): a name of the dataset that's available in the DatasetCatalog + mapper (callable): a callable which takes a sample (dict) from dataset + and returns the format to be consumed by the model. + By default it will be `DatasetMapper(cfg, False)`. + + Returns: + DataLoader: a torch DataLoader, that loads the given detection + dataset, with test-time transformation and batching. + """ + dataset_dicts = get_detection_dataset_dicts( + [dataset_name], + filter_empty=False, + proposal_files=[ + cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)] + ] + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + + dataset = DatasetFromList(dataset_dicts) + if mapper is None: + mapper = DatasetMapper(cfg, False) + dataset = MapDataset(dataset, mapper) + + sampler = samplers.InferenceSampler(len(dataset)) + # Always use 1 image per worker during inference since this is the + # standard when reporting inference time in papers. 
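+    # drop_last=False below ensures every image in the dataset is evaluated exactly once.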
+ batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) + + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.DATALOADER.NUM_WORKERS, + batch_sampler=batch_sampler, + collate_fn=trivial_batch_collator, + ) + return data_loader + + +def trivial_batch_collator(batch): + """ + A batch collator that does nothing. + """ + return batch + + +def worker_init_reset_seed(worker_id): + seed_all_rng(np.random.randint(2 ** 31) + worker_id) diff --git a/detectron2/data/catalog.py b/detectron2/data/catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..beb4756024286fb53801a0b5ec2a2b3a91824eb0 --- /dev/null +++ b/detectron2/data/catalog.py @@ -0,0 +1,221 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import logging +import types +from typing import List + +from detectron2.utils.logger import log_first_n + +__all__ = ["DatasetCatalog", "MetadataCatalog"] + + +class DatasetCatalog(object): + """ + A catalog that stores information about the datasets and how to obtain them. + + It contains a mapping from strings + (which are names that identify a dataset, e.g. "coco_2014_train") + to a function which parses the dataset and returns the samples in the + format of `list[dict]`. + + The returned dicts should be in Detectron2 Dataset format (See DATASETS.md for details) + if used with the data loader functionalities in `data/build.py,data/detection_transform.py`. + + The purpose of having this catalog is to make it easy to choose + different datasets, by just using the strings in the config. + """ + + _REGISTERED = {} + + @staticmethod + def register(name, func): + """ + Args: + name (str): the name that identifies a dataset, e.g. "coco_2014_train". + func (callable): a callable which takes no arguments and returns a list of dicts. + """ + assert callable(func), "You must register a function with `DatasetCatalog.register`!" + assert name not in DatasetCatalog._REGISTERED, "Dataset '{}' is already registered!".format( + name + ) + DatasetCatalog._REGISTERED[name] = func + + @staticmethod + def get(name): + """ + Call the registered function and return its results. + + Args: + name (str): the name that identifies a dataset, e.g. "coco_2014_train". + + Returns: + list[dict]: dataset annotations.0 + """ + try: + f = DatasetCatalog._REGISTERED[name] + except KeyError: + raise KeyError( + "Dataset '{}' is not registered! Available datasets are: {}".format( + name, ", ".join(DatasetCatalog._REGISTERED.keys()) + ) + ) + return f() + + @staticmethod + def list() -> List[str]: + """ + List all registered datasets. + + Returns: + list[str] + """ + return list(DatasetCatalog._REGISTERED.keys()) + + @staticmethod + def clear(): + """ + Remove all registered dataset. + """ + DatasetCatalog._REGISTERED.clear() + + +class Metadata(types.SimpleNamespace): + """ + A class that supports simple attribute setter/getter. + It is intended for storing metadata of a dataset and make it accessible globally. + + Examples: + + .. 
code-block:: python + + # somewhere when you load the data: + MetadataCatalog.get("mydataset").thing_classes = ["person", "dog"] + + # somewhere when you print statistics or visualize: + classes = MetadataCatalog.get("mydataset").thing_classes + """ + + # the name of the dataset + # set default to N/A so that `self.name` in the errors will not trigger getattr again + name: str = "N/A" + + _RENAMED = { + "class_names": "thing_classes", + "dataset_id_to_contiguous_id": "thing_dataset_id_to_contiguous_id", + "stuff_class_names": "stuff_classes", + } + + def __getattr__(self, key): + if key in self._RENAMED: + log_first_n( + logging.WARNING, + "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]), + n=10, + ) + return getattr(self, self._RENAMED[key]) + + raise AttributeError( + "Attribute '{}' does not exist in the metadata of '{}'. Available keys are {}.".format( + key, self.name, str(self.__dict__.keys()) + ) + ) + + def __setattr__(self, key, val): + if key in self._RENAMED: + log_first_n( + logging.WARNING, + "Metadata '{}' was renamed to '{}'!".format(key, self._RENAMED[key]), + n=10, + ) + setattr(self, self._RENAMED[key], val) + + # Ensure that metadata of the same name stays consistent + try: + oldval = getattr(self, key) + assert oldval == val, ( + "Attribute '{}' in the metadata of '{}' cannot be set " + "to a different value!\n{} != {}".format(key, self.name, oldval, val) + ) + except AttributeError: + super().__setattr__(key, val) + + def as_dict(self): + """ + Returns all the metadata as a dict. + Note that modifications to the returned dict will not reflect on the Metadata object. + """ + return copy.copy(self.__dict__) + + def set(self, **kwargs): + """ + Set multiple metadata with kwargs. + """ + for k, v in kwargs.items(): + setattr(self, k, v) + return self + + def get(self, key, default=None): + """ + Access an attribute and return its value if exists. + Otherwise return default. + """ + try: + return getattr(self, key) + except AttributeError: + return default + + +class MetadataCatalog: + """ + MetadataCatalog provides access to "Metadata" of a given dataset. + + The metadata associated with a certain name is a singleton: once created, + the metadata will stay alive and will be returned by future calls to `get(name)`. + + It's like global variables, so don't abuse it. + It's meant for storing knowledge that's constant and shared across the execution + of the program, e.g.: the class names in COCO. + """ + + _NAME_TO_META = {} + + @staticmethod + def get(name): + """ + Args: + name (str): name of a dataset (e.g. coco_2014_train). + + Returns: + Metadata: The :class:`Metadata` instance associated with this name, + or create an empty one if none is available. + """ + assert len(name) + if name in MetadataCatalog._NAME_TO_META: + ret = MetadataCatalog._NAME_TO_META[name] + # TODO this is for the BC breaking change in D15247032. + # Remove this in the future. + if hasattr(ret, "dataset_name"): + logger = logging.getLogger() + logger.warning( + """ +The 'dataset_name' key in metadata is no longer used for +sharing metadata among splits after D15247032! Add +metadata to each split (now called dataset) separately! + """ + ) + parent_meta = MetadataCatalog.get(ret.dataset_name).as_dict() + ret.set(**parent_meta) + return ret + else: + m = MetadataCatalog._NAME_TO_META[name] = Metadata(name=name) + return m + + @staticmethod + def list(): + """ + List all registered metadata. 
+ + Returns: + list[str]: keys (names of datasets) of all registered metadata + """ + return list(MetadataCatalog._NAME_TO_META.keys()) diff --git a/detectron2/data/common.py b/detectron2/data/common.py new file mode 100644 index 0000000000000000000000000000000000000000..a42c8b21b86338a3f034d01c3484dd32b1b845a9 --- /dev/null +++ b/detectron2/data/common.py @@ -0,0 +1,149 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import logging +import numpy as np +import pickle +import random +import torch.utils.data as data + +from detectron2.utils.serialize import PicklableWrapper + +__all__ = ["MapDataset", "DatasetFromList", "AspectRatioGroupedDataset"] + + +class MapDataset(data.Dataset): + """ + Map a function over the elements in a dataset. + + Args: + dataset: a dataset where map function is applied. + map_func: a callable which maps the element in dataset. map_func is + responsible for error handling, when error happens, it needs to + return None so the MapDataset will randomly use other + elements from the dataset. + """ + + def __init__(self, dataset, map_func): + self._dataset = dataset + self._map_func = PicklableWrapper(map_func) # wrap so that a lambda will work + + self._rng = random.Random(42) + self._fallback_candidates = set(range(len(dataset))) + + def __len__(self): + return len(self._dataset) + + def __getitem__(self, idx): + retry_count = 0 + cur_idx = int(idx) + + while True: + data = self._map_func(self._dataset[cur_idx]) + if data is not None: + self._fallback_candidates.add(cur_idx) + return data + + # _map_func fails for this idx, use a random new index from the pool + retry_count += 1 + self._fallback_candidates.discard(cur_idx) + cur_idx = self._rng.sample(self._fallback_candidates, k=1)[0] + + if retry_count >= 3: + logger = logging.getLogger(__name__) + logger.warning( + "Failed to apply `_map_func` for idx: {}, retry count: {}".format( + idx, retry_count + ) + ) + + +class DatasetFromList(data.Dataset): + """ + Wrap a list to a torch Dataset. It produces elements of the list as data. + """ + + def __init__(self, lst: list, copy: bool = True, serialize: bool = True): + """ + Args: + lst (list): a list which contains elements to produce. + copy (bool): whether to deepcopy the element when producing it, + so that the result can be modified in place without affecting the + source in the list. + serialize (bool): whether to hold memory using serialized objects, when + enabled, data loader workers can use shared RAM from master + process instead of making a copy. 
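+
+        A small sketch of typical usage (the data here is illustrative only)::
+
+            ds = DatasetFromList([{"k": 0}, {"k": 1}], copy=False)
+            assert len(ds) == 2 and ds[1]["k"] == 1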
+ """ + self._lst = lst + self._copy = copy + self._serialize = serialize + + def _serialize(data): + buffer = pickle.dumps(data, protocol=-1) + return np.frombuffer(buffer, dtype=np.uint8) + + if self._serialize: + logger = logging.getLogger(__name__) + logger.info( + "Serializing {} elements to byte tensors and concatenating them all ...".format( + len(self._lst) + ) + ) + self._lst = [_serialize(x) for x in self._lst] + self._addr = np.asarray([len(x) for x in self._lst], dtype=np.int64) + self._addr = np.cumsum(self._addr) + self._lst = np.concatenate(self._lst) + logger.info("Serialized dataset takes {:.2f} MiB".format(len(self._lst) / 1024 ** 2)) + + def __len__(self): + if self._serialize: + return len(self._addr) + else: + return len(self._lst) + + def __getitem__(self, idx): + if self._serialize: + start_addr = 0 if idx == 0 else self._addr[idx - 1].item() + end_addr = self._addr[idx].item() + bytes = memoryview(self._lst[start_addr:end_addr]) + return pickle.loads(bytes) + elif self._copy: + return copy.deepcopy(self._lst[idx]) + else: + return self._lst[idx] + + +class AspectRatioGroupedDataset(data.IterableDataset): + """ + Batch data that have similar aspect ratio together. + In this implementation, images whose aspect ratio < (or >) 1 will + be batched together. + This improves training speed because the images then need less padding + to form a batch. + + It assumes the underlying dataset produces dicts with "width" and "height" keys. + It will then produce a list of original dicts with length = batch_size, + all with similar aspect ratios. + """ + + def __init__(self, dataset, batch_size): + """ + Args: + dataset: an iterable. Each element must be a dict with keys + "width" and "height", which will be used to batch data. + batch_size (int): + """ + self.dataset = dataset + self.batch_size = batch_size + self._buckets = [[] for _ in range(2)] + # Hard-coded two aspect ratio groups: w > h and w < h. + # Can add support for more aspect ratio groups, but doesn't seem useful + + def __iter__(self): + for d in self.dataset: + w, h = d["width"], d["height"] + bucket_id = 0 if w > h else 1 + bucket = self._buckets[bucket_id] + bucket.append(d) + if len(bucket) == self.batch_size: + yield bucket[:] + del bucket[:] diff --git a/detectron2/data/dataset_mapper.py b/detectron2/data/dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..db73b378a6c2938a3beb700010a13172e6cc549f --- /dev/null +++ b/detectron2/data/dataset_mapper.py @@ -0,0 +1,149 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import logging +import numpy as np +import torch +from fvcore.common.file_io import PathManager +from PIL import Image + +from . import detection_utils as utils +from . import transforms as T + +""" +This file contains the default mapping that's applied to "dataset dicts". +""" + +__all__ = ["DatasetMapper"] + + +class DatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by the model. + + This is the default callable to be used to map your dataset dict into training data. + You may need to follow it to implement your own one for customized logic, + such as a different way to read or transform images. + See :doc:`/tutorials/data_loading` for details. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies cropping/geometric transforms to the image and annotations + 3. 
Prepare data and annotations to Tensor and :class:`Instances` + """ + + def __init__(self, cfg, is_train=True): + if cfg.INPUT.CROP.ENABLED and is_train: + self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) + logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen)) + else: + self.crop_gen = None + + self.tfm_gens = utils.build_transform_gen(cfg, is_train) + + # fmt: off + self.img_format = cfg.INPUT.FORMAT + self.mask_on = cfg.MODEL.MASK_ON + self.mask_format = cfg.INPUT.MASK_FORMAT + self.keypoint_on = cfg.MODEL.KEYPOINT_ON + self.load_proposals = cfg.MODEL.LOAD_PROPOSALS + # fmt: on + if self.keypoint_on and is_train: + # Flip only makes sense in training + self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) + else: + self.keypoint_hflip_indices = None + + if self.load_proposals: + self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE + self.proposal_topk = ( + cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN + if is_train + else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST + ) + self.is_train = is_train + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + # USER: Write your own image loading if it's not from a file + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + if "annotations" not in dataset_dict: + image, transforms = T.apply_transform_gens( + ([self.crop_gen] if self.crop_gen else []) + self.tfm_gens, image + ) + else: + # Crop around an instance if there are instances in the image. + # USER: Remove if you don't use cropping + if self.crop_gen: + crop_tfm = utils.gen_crop_transform_with_instance( + self.crop_gen.get_crop_size(image.shape[:2]), + image.shape[:2], + np.random.choice(dataset_dict["annotations"]), + ) + image = crop_tfm.apply_image(image) + image, transforms = T.apply_transform_gens(self.tfm_gens, image) + if self.crop_gen: + transforms = crop_tfm + transforms + + image_shape = image.shape[:2] # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + + # USER: Remove if you don't use pre-computed proposals. + if self.load_proposals: + utils.transform_proposals( + dataset_dict, image_shape, transforms, self.min_box_side_len, self.proposal_topk + ) + + if not self.is_train: + # USER: Modify this if you want to keep them for some reason. + dataset_dict.pop("annotations", None) + dataset_dict.pop("sem_seg_file_name", None) + return dataset_dict + + if "annotations" in dataset_dict: + # USER: Modify this if you want to keep them for some reason. 
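+            # Drop fields that the enabled heads will not consume, so the transforms
+            # below do less work and less data is sent to the model.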
+ for anno in dataset_dict["annotations"]: + if not self.mask_on: + anno.pop("segmentation", None) + if not self.keypoint_on: + anno.pop("keypoints", None) + + # USER: Implement additional transformations if you have other types of data + annos = [ + utils.transform_instance_annotations( + obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices + ) + for obj in dataset_dict.pop("annotations") + if obj.get("iscrowd", 0) == 0 + ] + instances = utils.annotations_to_instances( + annos, image_shape, mask_format=self.mask_format + ) + # Create a tight bounding box from masks, useful when image is cropped + if self.crop_gen and instances.has("gt_masks"): + instances.gt_boxes = instances.gt_masks.get_bounding_boxes() + dataset_dict["instances"] = utils.filter_empty_instances(instances) + + # USER: Remove if you don't do semantic/panoptic segmentation. + if "sem_seg_file_name" in dataset_dict: + with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f: + sem_seg_gt = Image.open(f) + sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8") + sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) + sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) + dataset_dict["sem_seg"] = sem_seg_gt + return dataset_dict diff --git a/detectron2/data/datasets/README.md b/detectron2/data/datasets/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9fb3e4f7afec17137c95c78be6ef06d520ec8032 --- /dev/null +++ b/detectron2/data/datasets/README.md @@ -0,0 +1,9 @@ + + +### Common Datasets + +The dataset implemented here do not need to load the data into the final format. +It should provide the minimal data structure needed to use the dataset, so it can be very efficient. + +For example, for an image dataset, just provide the file names and labels, but don't read the images. +Let the downstream decide how to read. diff --git a/detectron2/data/datasets/__init__.py b/detectron2/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a2bfbea6bcc23b090f90a58ac9fd2306f81c649d --- /dev/null +++ b/detectron2/data/datasets/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .cityscapes import load_cityscapes_instances +from .coco import load_coco_json, load_sem_seg +from .lvis import load_lvis_json, register_lvis_instances, get_lvis_instances_meta +from .register_coco import register_coco_instances, register_coco_panoptic_separated +from . import builtin # ensure the builtin datasets are registered + + +__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] diff --git a/detectron2/data/datasets/builtin.py b/detectron2/data/datasets/builtin.py new file mode 100644 index 0000000000000000000000000000000000000000..bcfd78ec006c9fbf7feba8766494a7c67048e703 --- /dev/null +++ b/detectron2/data/datasets/builtin.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + + +""" +This file registers pre-defined datasets at hard-coded paths, and their metadata. + +We hard-code metadata for common datasets. This will enable: +1. Consistency check when loading the datasets +2. Use models on these standard datasets directly and run demos, + without having to download the dataset annotations + +We hard-code some paths to the dataset that's assumed to +exist in "./datasets/". + +Users SHOULD NOT use this file to create new dataset / metadata for new dataset. 
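+Instead, register your dataset in your own code; a minimal sketch for a dataset in
+COCO format (the names and paths below are placeholders):
+
+    from detectron2.data.datasets import register_coco_instances
+    register_coco_instances("my_dataset_train", {}, "my_annotations.json", "my_images/")
+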
+To add new dataset, refer to the tutorial "docs/DATASETS.md". +""" + +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog + +from .builtin_meta import _get_builtin_metadata +from .cityscapes import load_cityscapes_instances, load_cityscapes_semantic +from .lvis import get_lvis_instances_meta, register_lvis_instances +from .pascal_voc import register_pascal_voc +from .register_coco import register_coco_instances, register_coco_panoptic_separated + +# ==== Predefined datasets and splits for COCO ========== + +_PREDEFINED_SPLITS_COCO = {} +_PREDEFINED_SPLITS_COCO["coco"] = { + "coco_2014_train": ("coco/train2014", "coco/annotations/instances_train2014.json"), + "coco_2014_val": ("coco/val2014", "coco/annotations/instances_val2014.json"), + "coco_2014_minival": ("coco/val2014", "coco/annotations/instances_minival2014.json"), + "coco_2014_minival_100": ("coco/val2014", "coco/annotations/instances_minival2014_100.json"), + "coco_2014_valminusminival": ( + "coco/val2014", + "coco/annotations/instances_valminusminival2014.json", + ), + "coco_2017_train": ("coco/train2017", "coco/annotations/instances_train2017.json"), + "coco_2017_val": ("coco/val2017", "coco/annotations/instances_val2017.json"), + "coco_2017_test": ("coco/test2017", "coco/annotations/image_info_test2017.json"), + "coco_2017_test-dev": ("coco/test2017", "coco/annotations/image_info_test-dev2017.json"), + "coco_2017_val_100": ("coco/val2017", "coco/annotations/instances_val2017_100.json"), +} + +_PREDEFINED_SPLITS_COCO["coco_person"] = { + "keypoints_coco_2014_train": ( + "coco/train2014", + "coco/annotations/person_keypoints_train2014.json", + ), + "keypoints_coco_2014_val": ("coco/val2014", "coco/annotations/person_keypoints_val2014.json"), + "keypoints_coco_2014_minival": ( + "coco/val2014", + "coco/annotations/person_keypoints_minival2014.json", + ), + "keypoints_coco_2014_valminusminival": ( + "coco/val2014", + "coco/annotations/person_keypoints_valminusminival2014.json", + ), + "keypoints_coco_2014_minival_100": ( + "coco/val2014", + "coco/annotations/person_keypoints_minival2014_100.json", + ), + "keypoints_coco_2017_train": ( + "coco/train2017", + "coco/annotations/person_keypoints_train2017.json", + ), + "keypoints_coco_2017_val": ("coco/val2017", "coco/annotations/person_keypoints_val2017.json"), + "keypoints_coco_2017_val_100": ( + "coco/val2017", + "coco/annotations/person_keypoints_val2017_100.json", + ), +} + + +_PREDEFINED_SPLITS_COCO_PANOPTIC = { + "coco_2017_train_panoptic": ( + # This is the original panoptic annotation directory + "coco/panoptic_train2017", + "coco/annotations/panoptic_train2017.json", + # This directory contains semantic annotations that are + # converted from panoptic annotations. + # It is used by PanopticFPN. + # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py + # to create these directories. + "coco/panoptic_stuff_train2017", + ), + "coco_2017_val_panoptic": ( + "coco/panoptic_val2017", + "coco/annotations/panoptic_val2017.json", + "coco/panoptic_stuff_val2017", + ), + "coco_2017_val_100_panoptic": ( + "coco/panoptic_val2017_100", + "coco/annotations/panoptic_val2017_100.json", + "coco/panoptic_stuff_val2017_100", + ), +} + + +def register_all_coco(root): + for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_COCO.items(): + for key, (image_root, json_file) in splits_per_dataset.items(): + # Assume pre-defined datasets live in `./datasets`. 
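+            # A json_file that contains "://" (e.g. a remote path) is used as-is,
+            # instead of being joined with the local dataset root.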
+ register_coco_instances( + key, + _get_builtin_metadata(dataset_name), + os.path.join(root, json_file) if "://" not in json_file else json_file, + os.path.join(root, image_root), + ) + + for ( + prefix, + (panoptic_root, panoptic_json, semantic_root), + ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): + prefix_instances = prefix[: -len("_panoptic")] + instances_meta = MetadataCatalog.get(prefix_instances) + image_root, instances_json = instances_meta.image_root, instances_meta.json_file + register_coco_panoptic_separated( + prefix, + _get_builtin_metadata("coco_panoptic_separated"), + image_root, + os.path.join(root, panoptic_root), + os.path.join(root, panoptic_json), + os.path.join(root, semantic_root), + instances_json, + ) + + +# ==== Predefined datasets and splits for LVIS ========== + + +_PREDEFINED_SPLITS_LVIS = { + "lvis_v0.5": { + "lvis_v0.5_train": ("coco/train2017", "lvis/lvis_v0.5_train.json"), + "lvis_v0.5_val": ("coco/val2017", "lvis/lvis_v0.5_val.json"), + "lvis_v0.5_val_rand_100": ("coco/val2017", "lvis/lvis_v0.5_val_rand_100.json"), + "lvis_v0.5_test": ("coco/test2017", "lvis/lvis_v0.5_image_info_test.json"), + }, + "lvis_v0.5_cocofied": { + "lvis_v0.5_train_cocofied": ("coco/train2017", "lvis/lvis_v0.5_train_cocofied.json"), + "lvis_v0.5_val_cocofied": ("coco/val2017", "lvis/lvis_v0.5_val_cocofied.json"), + }, +} + + +def register_all_lvis(root): + for dataset_name, splits_per_dataset in _PREDEFINED_SPLITS_LVIS.items(): + for key, (image_root, json_file) in splits_per_dataset.items(): + # Assume pre-defined datasets live in `./datasets`. + register_lvis_instances( + key, + get_lvis_instances_meta(dataset_name), + os.path.join(root, json_file) if "://" not in json_file else json_file, + os.path.join(root, image_root), + ) + + +# ==== Predefined splits for raw cityscapes images =========== + + +_RAW_CITYSCAPES_SPLITS = { + "cityscapes_fine_{task}_train": ("cityscapes/leftImg8bit/train", "cityscapes/gtFine/train"), + "cityscapes_fine_{task}_val": ("cityscapes/leftImg8bit/val", "cityscapes/gtFine/val"), + "cityscapes_fine_{task}_test": ("cityscapes/leftImg8bit/test", "cityscapes/gtFine/test"), +} + + +def register_all_cityscapes(root): + for key, (image_dir, gt_dir) in _RAW_CITYSCAPES_SPLITS.items(): + meta = _get_builtin_metadata("cityscapes") + image_dir = os.path.join(root, image_dir) + gt_dir = os.path.join(root, gt_dir) + + inst_key = key.format(task="instance_seg") + DatasetCatalog.register( + inst_key, + lambda x=image_dir, y=gt_dir: load_cityscapes_instances( + x, y, from_json=True, to_polygons=True + ), + ) + MetadataCatalog.get(inst_key).set( + image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_instance", **meta + ) + + sem_key = key.format(task="sem_seg") + DatasetCatalog.register( + sem_key, lambda x=image_dir, y=gt_dir: load_cityscapes_semantic(x, y) + ) + MetadataCatalog.get(sem_key).set( + image_dir=image_dir, gt_dir=gt_dir, evaluator_type="cityscapes_sem_seg", **meta + ) + + +# ==== Predefined splits for PASCAL VOC =========== +def register_all_pascal_voc(root): + SPLITS = [ + ("voc_2007_trainval", "VOC2007", "trainval"), + ("voc_2007_train", "VOC2007", "train"), + ("voc_2007_val", "VOC2007", "val"), + ("voc_2007_test", "VOC2007", "test"), + ("voc_2012_trainval", "VOC2012", "trainval"), + ("voc_2012_train", "VOC2012", "train"), + ("voc_2012_val", "VOC2012", "val"), + ] + for name, dirname, split in SPLITS: + year = 2007 if "2007" in name else 2012 + register_pascal_voc(name, os.path.join(root, dirname), split, year) + 
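+        # evaluator_type tells the evaluation code which evaluator to build for this dataset.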
MetadataCatalog.get(name).evaluator_type = "pascal_voc" + + +# Register them all under "./datasets" +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_coco(_root) +register_all_lvis(_root) +register_all_cityscapes(_root) +register_all_pascal_voc(_root) diff --git a/detectron2/data/datasets/builtin_meta.py b/detectron2/data/datasets/builtin_meta.py new file mode 100644 index 0000000000000000000000000000000000000000..74c79863a9d1ef5df9b5ce64f97d6be8e4e37d59 --- /dev/null +++ b/detectron2/data/datasets/builtin_meta.py @@ -0,0 +1,267 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + + +# All coco categories, together with their nice-looking visualization colors +# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": 
[255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, + {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, + {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, + {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, + {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, + {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, + {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"}, + {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": 
"floor-wood"}, + {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, + {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, + {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, + {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, + {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, + {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"}, + {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, + {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, + {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, + {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, + {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, + {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, + {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, + {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, + {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, + {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, + {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, + {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, + {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, + {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"}, + {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, + {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, + {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, + {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, + {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, + {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, + {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, + {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, + {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, + {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, + {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, + {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, + {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"}, + {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, + {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, + {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, + {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, + {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, + {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, + {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, + {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, + {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, + {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, + {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"}, + {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"}, +] + +# fmt: off +COCO_PERSON_KEYPOINT_NAMES = ( + "nose", + "left_eye", "right_eye", + "left_ear", "right_ear", + "left_shoulder", 
"right_shoulder", + "left_elbow", "right_elbow", + "left_wrist", "right_wrist", + "left_hip", "right_hip", + "left_knee", "right_knee", + "left_ankle", "right_ankle", +) +# fmt: on + +# Pairs of keypoints that should be exchanged under horizontal flipping +COCO_PERSON_KEYPOINT_FLIP_MAP = ( + ("left_eye", "right_eye"), + ("left_ear", "right_ear"), + ("left_shoulder", "right_shoulder"), + ("left_elbow", "right_elbow"), + ("left_wrist", "right_wrist"), + ("left_hip", "right_hip"), + ("left_knee", "right_knee"), + ("left_ankle", "right_ankle"), +) + +# rules for pairs of keypoints to draw a line between, and the line color to use. +KEYPOINT_CONNECTION_RULES = [ + # face + ("left_ear", "left_eye", (102, 204, 255)), + ("right_ear", "right_eye", (51, 153, 255)), + ("left_eye", "nose", (102, 0, 204)), + ("nose", "right_eye", (51, 102, 255)), + # upper-body + ("left_shoulder", "right_shoulder", (255, 128, 0)), + ("left_shoulder", "left_elbow", (153, 255, 204)), + ("right_shoulder", "right_elbow", (128, 229, 255)), + ("left_elbow", "left_wrist", (153, 255, 153)), + ("right_elbow", "right_wrist", (102, 255, 224)), + # lower-body + ("left_hip", "right_hip", (255, 102, 0)), + ("left_hip", "left_knee", (255, 255, 77)), + ("right_hip", "right_knee", (153, 255, 204)), + ("left_knee", "left_ankle", (191, 255, 128)), + ("right_knee", "right_ankle", (255, 195, 77)), +] + + +def _get_coco_instances_meta(): + thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1] + thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] + assert len(thing_ids) == 80, len(thing_ids) + # Mapping from the incontiguous COCO category id to an id in [0, 79] + thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} + thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] + ret = { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + "thing_colors": thing_colors, + } + return ret + + +def _get_coco_panoptic_separated_meta(): + """ + Returns metadata for "separated" version of the panoptic segmentation dataset. + """ + stuff_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 0] + assert len(stuff_ids) == 53, len(stuff_ids) + + # For semantic segmentation, this mapping maps from contiguous stuff id + # (in [0, 53], used in models) to ids in the dataset (used for processing results) + # The id 0 is mapped to an extra category "thing". 
+ stuff_dataset_id_to_contiguous_id = {k: i + 1 for i, k in enumerate(stuff_ids)} + # When converting COCO panoptic annotations to semantic annotations + # We label the "thing" category to 0 + stuff_dataset_id_to_contiguous_id[0] = 0 + + # 54 names for COCO stuff categories (including "things") + stuff_classes = ["things"] + [ + k["name"].replace("-other", "").replace("-merged", "") + for k in COCO_CATEGORIES + if k["isthing"] == 0 + ] + + # NOTE: I randomly picked a color for things + stuff_colors = [[82, 18, 128]] + [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 0] + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + "stuff_colors": stuff_colors, + } + ret.update(_get_coco_instances_meta()) + return ret + + +def _get_builtin_metadata(dataset_name): + if dataset_name == "coco": + return _get_coco_instances_meta() + if dataset_name == "coco_panoptic_separated": + return _get_coco_panoptic_separated_meta() + elif dataset_name == "coco_person": + return { + "thing_classes": ["person"], + "keypoint_names": COCO_PERSON_KEYPOINT_NAMES, + "keypoint_flip_map": COCO_PERSON_KEYPOINT_FLIP_MAP, + "keypoint_connection_rules": KEYPOINT_CONNECTION_RULES, + } + elif dataset_name == "cityscapes": + # fmt: off + CITYSCAPES_THING_CLASSES = [ + "person", "rider", "car", "truck", + "bus", "train", "motorcycle", "bicycle", + ] + CITYSCAPES_STUFF_CLASSES = [ + "road", "sidewalk", "building", "wall", "fence", "pole", "traffic light", + "traffic sign", "vegetation", "terrain", "sky", "person", "rider", "car", + "truck", "bus", "train", "motorcycle", "bicycle", "license plate", + ] + # fmt: on + return { + "thing_classes": CITYSCAPES_THING_CLASSES, + "stuff_classes": CITYSCAPES_STUFF_CLASSES, + } + raise KeyError("No built-in metadata for dataset {}".format(dataset_name)) diff --git a/detectron2/data/datasets/cityscapes.py b/detectron2/data/datasets/cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..aa96c73cf3613837143fe029b75bdbd7871cf900 --- /dev/null +++ b/detectron2/data/datasets/cityscapes.py @@ -0,0 +1,329 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import functools +import json +import logging +import multiprocessing as mp +import numpy as np +import os +from itertools import chain +import pycocotools.mask as mask_util +from fvcore.common.file_io import PathManager +from PIL import Image + +from detectron2.structures import BoxMode +from detectron2.utils.comm import get_world_size +from detectron2.utils.logger import setup_logger + +try: + import cv2 # noqa +except ImportError: + # OpenCV is an optional dependency at the moment + pass + + +logger = logging.getLogger(__name__) + + +def get_cityscapes_files(image_dir, gt_dir): + files = [] + # scan through the directory + cities = PathManager.ls(image_dir) + logger.info(f"{len(cities)} cities found in '{image_dir}'.") + for city in cities: + city_img_dir = os.path.join(image_dir, city) + city_gt_dir = os.path.join(gt_dir, city) + for basename in PathManager.ls(city_img_dir): + image_file = os.path.join(city_img_dir, basename) + + suffix = "leftImg8bit.png" + assert basename.endswith(suffix) + basename = basename[: -len(suffix)] + + instance_file = os.path.join(city_gt_dir, basename + "gtFine_instanceIds.png") + label_file = os.path.join(city_gt_dir, basename + "gtFine_labelIds.png") + json_file = os.path.join(city_gt_dir, basename + "gtFine_polygons.json") + + files.append((image_file, instance_file, label_file, json_file)) + assert len(files), "No images found in {}".format(image_dir) + for f in files[0]: + assert PathManager.isfile(f), f + return files + + +def load_cityscapes_instances(image_dir, gt_dir, from_json=True, to_polygons=True): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". + gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train". + from_json (bool): whether to read annotations from the raw json file or the png files. + to_polygons (bool): whether to represent the segmentation as polygons + (COCO's format) instead of masks (cityscapes's format). + + Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + """ + if from_json: + assert to_polygons, ( + "Cityscapes's json annotations are in polygon format. " + "Converting to mask format is not supported now." + ) + files = get_cityscapes_files(image_dir, gt_dir) + + logger.info("Preprocessing cityscapes annotations ...") + # This is still not fast: all workers will execute duplicate works and will + # take up to 10m on a 8GPU server. + pool = mp.Pool(processes=max(mp.cpu_count() // get_world_size() // 2, 4)) + + ret = pool.map( + functools.partial(cityscapes_files_to_dict, from_json=from_json, to_polygons=to_polygons), + files, + ) + logger.info("Loaded {} images from {}".format(len(ret), image_dir)) + + # Map cityscape ids to contiguous ids + from cityscapesscripts.helpers.labels import labels + + labels = [l for l in labels if l.hasInstances and not l.ignoreInEval] + dataset_id_to_contiguous_id = {l.id: idx for idx, l in enumerate(labels)} + for dict_per_image in ret: + for anno in dict_per_image["annotations"]: + anno["category_id"] = dataset_id_to_contiguous_id[anno["category_id"]] + return ret + + +def load_cityscapes_semantic(image_dir, gt_dir): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". + gt_dir (str): path to the raw annotations. e.g., "~/cityscapes/gtFine/train". + + Returns: + list[dict]: a list of dict, each has "file_name" and + "sem_seg_file_name". + """ + ret = [] + # gt_dir is small and contain many small files. 
make sense to fetch to local first + gt_dir = PathManager.get_local_path(gt_dir) + for image_file, _, label_file, json_file in get_cityscapes_files(image_dir, gt_dir): + label_file = label_file.replace("labelIds", "labelTrainIds") + + with PathManager.open(json_file, "r") as f: + jsonobj = json.load(f) + ret.append( + { + "file_name": image_file, + "sem_seg_file_name": label_file, + "height": jsonobj["imgHeight"], + "width": jsonobj["imgWidth"], + } + ) + assert len(ret), f"No images found in {image_dir}!" + assert PathManager.isfile( + ret[0]["sem_seg_file_name"] + ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa + return ret + + +def cityscapes_files_to_dict(files, from_json, to_polygons): + """ + Parse cityscapes annotation files to a instance segmentation dataset dict. + + Args: + files (tuple): consists of (image_file, instance_id_file, label_id_file, json_file) + from_json (bool): whether to read annotations from the raw json file or the png files. + to_polygons (bool): whether to represent the segmentation as polygons + (COCO's format) instead of masks (cityscapes's format). + + Returns: + A dict in Detectron2 Dataset format. + """ + from cityscapesscripts.helpers.labels import id2label, name2label + + image_file, instance_id_file, _, json_file = files + + annos = [] + + if from_json: + from shapely.geometry import MultiPolygon, Polygon + + with PathManager.open(json_file, "r") as f: + jsonobj = json.load(f) + ret = { + "file_name": image_file, + "image_id": os.path.basename(image_file), + "height": jsonobj["imgHeight"], + "width": jsonobj["imgWidth"], + } + + # `polygons_union` contains the union of all valid polygons. + polygons_union = Polygon() + + # CityscapesScripts draw the polygons in sequential order + # and each polygon *overwrites* existing ones. See + # (https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/preparation/json2instanceImg.py) # noqa + # We use reverse order, and each polygon *avoids* early ones. + # This will resolve the ploygon overlaps in the same way as CityscapesScripts. + for obj in jsonobj["objects"][::-1]: + if "deleted" in obj: # cityscapes data format specific + continue + label_name = obj["label"] + + try: + label = name2label[label_name] + except KeyError: + if label_name.endswith("group"): # crowd area + label = name2label[label_name[: -len("group")]] + else: + raise + if label.id < 0: # cityscapes data format + continue + + # Cityscapes's raw annotations uses integer coordinates + # Therefore +0.5 here + poly_coord = np.asarray(obj["polygon"], dtype="f4") + 0.5 + # CityscapesScript uses PIL.ImageDraw.polygon to rasterize + # polygons for evaluation. This function operates in integer space + # and draws each pixel whose center falls into the polygon. + # Therefore it draws a polygon which is 0.5 "fatter" in expectation. + # We therefore dilate the input polygon by 0.5 as our input. 
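+            # (Polygon.buffer with a positive distance dilates the polygon by 0.5px;
+            # resolution=4 is the number of segments used to approximate each
+            # quarter circle of the rounded offset.)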
+ poly = Polygon(poly_coord).buffer(0.5, resolution=4) + + if not label.hasInstances or label.ignoreInEval: + # even if we won't store the polygon it still contributes to overlaps resolution + polygons_union = polygons_union.union(poly) + continue + + # Take non-overlapping part of the polygon + poly_wo_overlaps = poly.difference(polygons_union) + if poly_wo_overlaps.is_empty: + continue + polygons_union = polygons_union.union(poly) + + anno = {} + anno["iscrowd"] = label_name.endswith("group") + anno["category_id"] = label.id + + if isinstance(poly_wo_overlaps, Polygon): + poly_list = [poly_wo_overlaps] + elif isinstance(poly_wo_overlaps, MultiPolygon): + poly_list = poly_wo_overlaps.geoms + else: + raise NotImplementedError("Unknown geometric structure {}".format(poly_wo_overlaps)) + + poly_coord = [] + for poly_el in poly_list: + # COCO API can work only with exterior boundaries now, hence we store only them. + # TODO: store both exterior and interior boundaries once other parts of the + # codebase support holes in polygons. + poly_coord.append(list(chain(*poly_el.exterior.coords))) + anno["segmentation"] = poly_coord + (xmin, ymin, xmax, ymax) = poly_wo_overlaps.bounds + + anno["bbox"] = (xmin, ymin, xmax, ymax) + anno["bbox_mode"] = BoxMode.XYXY_ABS + + annos.append(anno) + else: + # See also the official annotation parsing scripts at + # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/instances2dict.py # noqa + with PathManager.open(instance_id_file, "rb") as f: + inst_image = np.asarray(Image.open(f), order="F") + # ids < 24 are stuff labels (filtering them first is about 5% faster) + flattened_ids = np.unique(inst_image[inst_image >= 24]) + + ret = { + "file_name": image_file, + "image_id": os.path.basename(image_file), + "height": inst_image.shape[0], + "width": inst_image.shape[1], + } + + for instance_id in flattened_ids: + # For non-crowd annotations, instance_id // 1000 is the label_id + # Crowd annotations have <1000 instance ids + label_id = instance_id // 1000 if instance_id >= 1000 else instance_id + label = id2label[label_id] + if not label.hasInstances or label.ignoreInEval: + continue + + anno = {} + anno["iscrowd"] = instance_id < 1000 + anno["category_id"] = label.id + + mask = np.asarray(inst_image == instance_id, dtype=np.uint8, order="F") + + inds = np.nonzero(mask) + ymin, ymax = inds[0].min(), inds[0].max() + xmin, xmax = inds[1].min(), inds[1].max() + anno["bbox"] = (xmin, ymin, xmax, ymax) + if xmax <= xmin or ymax <= ymin: + continue + anno["bbox_mode"] = BoxMode.XYXY_ABS + if to_polygons: + # This conversion comes from D4809743 and D5171122, + # when Mask-RCNN was first developed. + contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[ + -2 + ] + polygons = [c.reshape(-1).tolist() for c in contours if len(c) >= 3] + # opencv's can produce invalid polygons + if len(polygons) == 0: + continue + anno["segmentation"] = polygons + else: + anno["segmentation"] = mask_util.encode(mask[:, :, None])[0] + annos.append(anno) + ret["annotations"] = annos + return ret + + +if __name__ == "__main__": + """ + Test the cityscapes dataset loader. 
+ + Usage: + python -m detectron2.data.datasets.cityscapes \ + cityscapes/leftImg8bit/train cityscapes/gtFine/train + """ + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("image_dir") + parser.add_argument("gt_dir") + parser.add_argument("--type", choices=["instance", "semantic"], default="instance") + args = parser.parse_args() + from detectron2.data.catalog import Metadata + from detectron2.utils.visualizer import Visualizer + from cityscapesscripts.helpers.labels import labels + + logger = setup_logger(name=__name__) + + dirname = "cityscapes-data-vis" + os.makedirs(dirname, exist_ok=True) + + if args.type == "instance": + dicts = load_cityscapes_instances( + args.image_dir, args.gt_dir, from_json=True, to_polygons=True + ) + logger.info("Done loading {} samples.".format(len(dicts))) + + thing_classes = [k.name for k in labels if k.hasInstances and not k.ignoreInEval] + meta = Metadata().set(thing_classes=thing_classes) + + else: + dicts = load_cityscapes_semantic(args.image_dir, args.gt_dir) + logger.info("Done loading {} samples.".format(len(dicts))) + + stuff_names = [k.name for k in labels if k.trainId != 255] + stuff_colors = [k.color for k in labels if k.trainId != 255] + meta = Metadata().set(stuff_names=stuff_names, stuff_colors=stuff_colors) + + for d in dicts: + img = np.array(Image.open(PathManager.open(d["file_name"], "rb"))) + visualizer = Visualizer(img, metadata=meta) + vis = visualizer.draw_dataset_dict(d) + # cv2.imshow("a", vis.get_image()[:, :, ::-1]) + # cv2.waitKey() + fpath = os.path.join(dirname, os.path.basename(d["file_name"])) + vis.save(fpath) diff --git a/detectron2/data/datasets/coco.py b/detectron2/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..6138646793f505afe50736de045fc5bf154c479b --- /dev/null +++ b/detectron2/data/datasets/coco.py @@ -0,0 +1,466 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import contextlib +import datetime +import io +import json +import logging +import numpy as np +import os +import pycocotools.mask as mask_util +from fvcore.common.file_io import PathManager, file_lock +from fvcore.common.timer import Timer +from PIL import Image + +from detectron2.structures import Boxes, BoxMode, PolygonMasks + +from .. import DatasetCatalog, MetadataCatalog + +""" +This file contains functions to parse COCO-format annotations into dicts in "Detectron2 format". +""" + + +logger = logging.getLogger(__name__) + +__all__ = ["load_coco_json", "load_sem_seg", "convert_to_coco_json"] + + +def load_coco_json(json_file, image_root, dataset_name=None, extra_annotation_keys=None): + """ + Load a json file with COCO's instances annotation format. + Currently supports instance detection, instance segmentation, + and person keypoints annotations. + + Args: + json_file (str): full path to the json file in COCO instances annotation format. + image_root (str or path-like): the directory where the images in this json file exists. + dataset_name (str): the name of the dataset (e.g., coco_2017_train). + If provided, this function will also put "thing_classes" into + the metadata associated with this dataset. + extra_annotation_keys (list[str]): list of per-annotation keys that should also be + loaded into the dataset dict (besides "iscrowd", "bbox", "keypoints", + "category_id", "segmentation"). The values for these keys will be returned as-is. + For example, the densepose annotations are loaded in this way. 
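+
+    Example (a minimal sketch; the paths and dataset name below are placeholders
+    following the layout described in datasets/README.md)::
+
+        dicts = load_coco_json(
+            "datasets/coco/annotations/instances_val2017.json",
+            "datasets/coco/val2017",
+            "coco_2017_val",
+        )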
+ + Returns: + list[dict]: a list of dicts in Detectron2 standard dataset dicts format. (See + `Using Custom Datasets `_ ) + + Notes: + 1. This function does not read the image files. + The results do not have the "image" field. + """ + from pycocotools.coco import COCO + + timer = Timer() + json_file = PathManager.get_local_path(json_file) + with contextlib.redirect_stdout(io.StringIO()): + coco_api = COCO(json_file) + if timer.seconds() > 1: + logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) + + id_map = None + if dataset_name is not None: + meta = MetadataCatalog.get(dataset_name) + cat_ids = sorted(coco_api.getCatIds()) + cats = coco_api.loadCats(cat_ids) + # The categories in a custom json file may not be sorted. + thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] + meta.thing_classes = thing_classes + + # In COCO, certain category ids are artificially removed, + # and by convention they are always ignored. + # We deal with COCO's id issue and translate + # the category ids to contiguous ids in [0, 80). + + # It works by looking at the "categories" field in the json, therefore + # if users' own json also have incontiguous ids, we'll + # apply this mapping as well but print a warning. + if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): + if "coco" not in dataset_name: + logger.warning( + """ +Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. +""" + ) + id_map = {v: i for i, v in enumerate(cat_ids)} + meta.thing_dataset_id_to_contiguous_id = id_map + + # sort indices for reproducible results + img_ids = sorted(coco_api.imgs.keys()) + # imgs is a list of dicts, each looks something like: + # {'license': 4, + # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', + # 'file_name': 'COCO_val2014_000000001268.jpg', + # 'height': 427, + # 'width': 640, + # 'date_captured': '2013-11-17 05:57:24', + # 'id': 1268} + imgs = coco_api.loadImgs(img_ids) + # anns is a list[list[dict]], where each dict is an annotation + # record for an object. The inner list enumerates the objects in an image + # and the outer list enumerates over images. Example of anns[0]: + # [{'segmentation': [[192.81, + # 247.09, + # ... + # 219.03, + # 249.06]], + # 'area': 1035.749, + # 'iscrowd': 0, + # 'image_id': 1268, + # 'bbox': [192.81, 224.8, 74.73, 33.43], + # 'category_id': 16, + # 'id': 42986}, + # ...] + anns = [coco_api.imgToAnns[img_id] for img_id in img_ids] + + if "minival" not in json_file: + # The popular valminusminival & minival annotations for COCO2014 contain this bug. + # However the ratio of buggy annotations there is tiny and does not affect accuracy. + # Therefore we explicitly white-list them. 
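+        # (The check below flattens all annotation ids across images and asserts
+        # that they are globally unique; duplicates usually indicate a corrupted
+        # or hand-edited annotation file.)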
+ ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] + assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( + json_file + ) + + imgs_anns = list(zip(imgs, anns)) + + logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file)) + + dataset_dicts = [] + + ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + (extra_annotation_keys or []) + + num_instances_without_valid_segmentation = 0 + + for (img_dict, anno_dict_list) in imgs_anns: + record = {} + record["file_name"] = os.path.join(image_root, img_dict["file_name"]) + record["height"] = img_dict["height"] + record["width"] = img_dict["width"] + image_id = record["image_id"] = img_dict["id"] + + objs = [] + for anno in anno_dict_list: + # Check that the image_id in this annotation is the same as + # the image_id we're looking at. + # This fails only when the data parsing logic or the annotation file is buggy. + + # The original COCO valminusminival2014 & minival2014 annotation files + # actually contains bugs that, together with certain ways of using COCO API, + # can trigger this assertion. + assert anno["image_id"] == image_id + + assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.' + + obj = {key: anno[key] for key in ann_keys if key in anno} + + segm = anno.get("segmentation", None) + if segm: # either list[list[float]] or dict(RLE) + if not isinstance(segm, dict): + # filter out invalid polygons (< 3 points) + segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] + if len(segm) == 0: + num_instances_without_valid_segmentation += 1 + continue # ignore this instance + obj["segmentation"] = segm + + keypts = anno.get("keypoints", None) + if keypts: # list[int] + for idx, v in enumerate(keypts): + if idx % 3 != 2: + # COCO's segmentation coordinates are floating points in [0, H or W], + # but keypoint coordinates are integers in [0, H-1 or W-1] + # Therefore we assume the coordinates are "pixel indices" and + # add 0.5 to convert to floating point coordinates. + keypts[idx] = v + 0.5 + obj["keypoints"] = keypts + + obj["bbox_mode"] = BoxMode.XYWH_ABS + if id_map: + obj["category_id"] = id_map[obj["category_id"]] + objs.append(obj) + record["annotations"] = objs + dataset_dicts.append(record) + + if num_instances_without_valid_segmentation > 0: + logger.warning( + "Filtered out {} instances without valid segmentation. " + "There might be issues in your dataset generation process.".format( + num_instances_without_valid_segmentation + ) + ) + return dataset_dicts + + +def load_sem_seg(gt_root, image_root, gt_ext="png", image_ext="jpg"): + """ + Load semantic segmentation datasets. All files under "gt_root" with "gt_ext" extension are + treated as ground truth annotations and all files under "image_root" with "image_ext" extension + as input images. Ground truth and input images are matched using file paths relative to + "gt_root" and "image_root" respectively without taking into account file extensions. + This works for COCO as well as some other datasets. + + Args: + gt_root (str): full path to ground truth semantic segmentation files. Semantic segmentation + annotations are stored as images with integer values in pixels that represent + corresponding semantic labels. + image_root (str): the directory where the input images are. + gt_ext (str): file extension for ground truth annotations. + image_ext (str): file extension for input images. 
+ + Returns: + list[dict]: + a list of dicts in detectron2 standard format without instance-level + annotation. + + Notes: + 1. This function does not read the image and ground truth files. + The results do not have the "image" and "sem_seg" fields. + """ + + # We match input images with ground truth based on their relative filepaths (without file + # extensions) starting from 'image_root' and 'gt_root' respectively. + def file2id(folder_path, file_path): + # extract relative path starting from `folder_path` + image_id = os.path.normpath(os.path.relpath(file_path, start=folder_path)) + # remove file extension + image_id = os.path.splitext(image_id)[0] + return image_id + + input_files = sorted( + (os.path.join(image_root, f) for f in PathManager.ls(image_root) if f.endswith(image_ext)), + key=lambda file_path: file2id(image_root, file_path), + ) + gt_files = sorted( + (os.path.join(gt_root, f) for f in PathManager.ls(gt_root) if f.endswith(gt_ext)), + key=lambda file_path: file2id(gt_root, file_path), + ) + + assert len(gt_files) > 0, "No annotations found in {}.".format(gt_root) + + # Use the intersection, so that val2017_100 annotations can run smoothly with val2017 images + if len(input_files) != len(gt_files): + logger.warn( + "Directory {} and {} has {} and {} files, respectively.".format( + image_root, gt_root, len(input_files), len(gt_files) + ) + ) + input_basenames = [os.path.basename(f)[: -len(image_ext)] for f in input_files] + gt_basenames = [os.path.basename(f)[: -len(gt_ext)] for f in gt_files] + intersect = list(set(input_basenames) & set(gt_basenames)) + # sort, otherwise each worker may obtain a list[dict] in different order + intersect = sorted(intersect) + logger.warn("Will use their intersection of {} files.".format(len(intersect))) + input_files = [os.path.join(image_root, f + image_ext) for f in intersect] + gt_files = [os.path.join(gt_root, f + gt_ext) for f in intersect] + + logger.info( + "Loaded {} images with semantic segmentation from {}".format(len(input_files), image_root) + ) + + dataset_dicts = [] + for (img_path, gt_path) in zip(input_files, gt_files): + record = {} + record["file_name"] = img_path + record["sem_seg_file_name"] = gt_path + dataset_dicts.append(record) + + return dataset_dicts + + +def convert_to_coco_dict(dataset_name): + """ + Convert an instance detection/segmentation or keypoint detection dataset + in detectron2's standard format into COCO json format. + + Generic dataset description can be found here: + https://detectron2.readthedocs.io/tutorials/datasets.html#register-a-dataset + + COCO data format description can be found here: + http://cocodataset.org/#format-data + + Args: + dataset_name (str): + name of the source dataset + Must be registered in DatastCatalog and in detectron2's standard format. 
+ Must have corresponding metadata "thing_classes" + Returns: + coco_dict: serializable dict in COCO json format + """ + + dataset_dicts = DatasetCatalog.get(dataset_name) + metadata = MetadataCatalog.get(dataset_name) + + # unmap the category mapping ids for COCO + if hasattr(metadata, "thing_dataset_id_to_contiguous_id"): + reverse_id_mapping = {v: k for k, v in metadata.thing_dataset_id_to_contiguous_id.items()} + reverse_id_mapper = lambda contiguous_id: reverse_id_mapping[contiguous_id] # noqa + else: + reverse_id_mapper = lambda contiguous_id: contiguous_id # noqa + + categories = [ + {"id": reverse_id_mapper(id), "name": name} + for id, name in enumerate(metadata.thing_classes) + ] + + logger.info("Converting dataset dicts into COCO format") + coco_images = [] + coco_annotations = [] + + for image_id, image_dict in enumerate(dataset_dicts): + coco_image = { + "id": image_dict.get("image_id", image_id), + "width": image_dict["width"], + "height": image_dict["height"], + "file_name": image_dict["file_name"], + } + coco_images.append(coco_image) + + anns_per_image = image_dict["annotations"] + for annotation in anns_per_image: + # create a new dict with only COCO fields + coco_annotation = {} + + # COCO requirement: XYWH box format + bbox = annotation["bbox"] + bbox_mode = annotation["bbox_mode"] + bbox = BoxMode.convert(bbox, bbox_mode, BoxMode.XYWH_ABS) + + # COCO requirement: instance area + if "segmentation" in annotation: + # Computing areas for instances by counting the pixels + segmentation = annotation["segmentation"] + # TODO: check segmentation type: RLE, BinaryMask or Polygon + if isinstance(segmentation, list): + polygons = PolygonMasks([segmentation]) + area = polygons.area()[0].item() + elif isinstance(segmentation, dict): # RLE + area = mask_util.area(segmentation).item() + else: + raise TypeError(f"Unknown segmentation type {type(segmentation)}!") + else: + # Computing areas using bounding boxes + bbox_xy = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) + area = Boxes([bbox_xy]).area()[0].item() + + if "keypoints" in annotation: + keypoints = annotation["keypoints"] # list[int] + for idx, v in enumerate(keypoints): + if idx % 3 != 2: + # COCO's segmentation coordinates are floating points in [0, H or W], + # but keypoint coordinates are integers in [0, H-1 or W-1] + # For COCO format consistency we substract 0.5 + # https://github.com/facebookresearch/detectron2/pull/175#issuecomment-551202163 + keypoints[idx] = v - 0.5 + if "num_keypoints" in annotation: + num_keypoints = annotation["num_keypoints"] + else: + num_keypoints = sum(kp > 0 for kp in keypoints[2::3]) + + # COCO requirement: + # linking annotations to images + # "id" field must start with 1 + coco_annotation["id"] = len(coco_annotations) + 1 + coco_annotation["image_id"] = coco_image["id"] + coco_annotation["bbox"] = [round(float(x), 3) for x in bbox] + coco_annotation["area"] = float(area) + coco_annotation["iscrowd"] = annotation.get("iscrowd", 0) + coco_annotation["category_id"] = reverse_id_mapper(annotation["category_id"]) + + # Add optional fields + if "keypoints" in annotation: + coco_annotation["keypoints"] = keypoints + coco_annotation["num_keypoints"] = num_keypoints + + if "segmentation" in annotation: + coco_annotation["segmentation"] = annotation["segmentation"] + if isinstance(coco_annotation["segmentation"], dict): # RLE + coco_annotation["segmentation"]["counts"] = coco_annotation["segmentation"][ + "counts" + ].decode("ascii") + + coco_annotations.append(coco_annotation) + + 
logger.info( + "Conversion finished, " + f"#images: {len(coco_images)}, #annotations: {len(coco_annotations)}" + ) + + info = { + "date_created": str(datetime.datetime.now()), + "description": "Automatically generated COCO json file for Detectron2.", + } + coco_dict = { + "info": info, + "images": coco_images, + "annotations": coco_annotations, + "categories": categories, + "licenses": None, + } + return coco_dict + + +def convert_to_coco_json(dataset_name, output_file, allow_cached=True): + """ + Converts dataset into COCO format and saves it to a json file. + dataset_name must be registered in DatasetCatalog and in detectron2's standard format. + + Args: + dataset_name: + reference from the config file to the catalogs + must be registered in DatasetCatalog and in detectron2's standard format + output_file: path of json file that will be saved to + allow_cached: if json file is already present then skip conversion + """ + + # TODO: The dataset or the conversion script *may* change, + # a checksum would be useful for validating the cached data + + PathManager.mkdirs(os.path.dirname(output_file)) + with file_lock(output_file): + if PathManager.exists(output_file) and allow_cached: + logger.warning( + f"Using previously cached COCO format annotations at '{output_file}'. " + "You need to clear the cache file if your dataset has been modified." + ) + else: + logger.info(f"Converting annotations of dataset '{dataset_name}' to COCO format ...)") + coco_dict = convert_to_coco_dict(dataset_name) + + logger.info(f"Caching COCO format annotations at '{output_file}' ...") + with PathManager.open(output_file, "w") as f: + json.dump(coco_dict, f) + + +if __name__ == "__main__": + """ + Test the COCO json dataset loader. + + Usage: + python -m detectron2.data.datasets.coco \ + path/to/json path/to/image_root dataset_name + + "dataset_name" can be "coco_2014_minival_100", or other + pre-registered ones + """ + from detectron2.utils.logger import setup_logger + from detectron2.utils.visualizer import Visualizer + import detectron2.data.datasets # noqa # add pre-defined metadata + import sys + + logger = setup_logger(name=__name__) + assert sys.argv[3] in DatasetCatalog.list() + meta = MetadataCatalog.get(sys.argv[3]) + + dicts = load_coco_json(sys.argv[1], sys.argv[2], sys.argv[3]) + logger.info("Done loading {} samples.".format(len(dicts))) + + dirname = "coco-data-vis" + os.makedirs(dirname, exist_ok=True) + for d in dicts: + img = np.array(Image.open(d["file_name"])) + visualizer = Visualizer(img, metadata=meta) + vis = visualizer.draw_dataset_dict(d) + fpath = os.path.join(dirname, os.path.basename(d["file_name"])) + vis.save(fpath) diff --git a/detectron2/data/datasets/lvis.py b/detectron2/data/datasets/lvis.py new file mode 100644 index 0000000000000000000000000000000000000000..765ec7494a1b16526f588ee9f71658779ce936eb --- /dev/null +++ b/detectron2/data/datasets/lvis.py @@ -0,0 +1,209 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import os +from fvcore.common.file_io import PathManager +from fvcore.common.timer import Timer + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures import BoxMode + +from .builtin_meta import _get_coco_instances_meta +from .lvis_v0_5_categories import LVIS_CATEGORIES + +""" +This file contains functions to parse LVIS-format annotations into dicts in the +"Detectron2 format". 
+""" + +logger = logging.getLogger(__name__) + +__all__ = ["load_lvis_json", "register_lvis_instances", "get_lvis_instances_meta"] + + +def register_lvis_instances(name, metadata, json_file, image_root): + """ + Register a dataset in LVIS's json annotation format for instance detection and segmentation. + + Args: + name (str): a name that identifies the dataset, e.g. "lvis_v0.5_train". + metadata (dict): extra metadata associated with this dataset. It can be an empty dict. + json_file (str): path to the json instance annotation file. + image_root (str or path-like): directory which contains all the images. + """ + DatasetCatalog.register(name, lambda: load_lvis_json(json_file, image_root, name)) + MetadataCatalog.get(name).set( + json_file=json_file, image_root=image_root, evaluator_type="lvis", **metadata + ) + + +def load_lvis_json(json_file, image_root, dataset_name=None): + """ + Load a json file in LVIS's annotation format. + + Args: + json_file (str): full path to the LVIS json annotation file. + image_root (str): the directory where the images in this json file exists. + dataset_name (str): the name of the dataset (e.g., "lvis_v0.5_train"). + If provided, this function will put "thing_classes" into the metadata + associated with this dataset. + + Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + + Notes: + 1. This function does not read the image files. + The results do not have the "image" field. + """ + from lvis import LVIS + + json_file = PathManager.get_local_path(json_file) + + timer = Timer() + lvis_api = LVIS(json_file) + if timer.seconds() > 1: + logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) + + if dataset_name is not None: + meta = get_lvis_instances_meta(dataset_name) + MetadataCatalog.get(dataset_name).set(**meta) + + # sort indices for reproducible results + img_ids = sorted(lvis_api.imgs.keys()) + # imgs is a list of dicts, each looks something like: + # {'license': 4, + # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', + # 'file_name': 'COCO_val2014_000000001268.jpg', + # 'height': 427, + # 'width': 640, + # 'date_captured': '2013-11-17 05:57:24', + # 'id': 1268} + imgs = lvis_api.load_imgs(img_ids) + # anns is a list[list[dict]], where each dict is an annotation + # record for an object. The inner list enumerates the objects in an image + # and the outer list enumerates over images. Example of anns[0]: + # [{'segmentation': [[192.81, + # 247.09, + # ... + # 219.03, + # 249.06]], + # 'area': 1035.749, + # 'image_id': 1268, + # 'bbox': [192.81, 224.8, 74.73, 33.43], + # 'category_id': 16, + # 'id': 42986}, + # ...] 
+ anns = [lvis_api.img_ann_map[img_id] for img_id in img_ids] + + # Sanity check that each annotation has a unique id + ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] + assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique".format( + json_file + ) + + imgs_anns = list(zip(imgs, anns)) + + logger.info("Loaded {} images in the LVIS format from {}".format(len(imgs_anns), json_file)) + + dataset_dicts = [] + + for (img_dict, anno_dict_list) in imgs_anns: + record = {} + file_name = img_dict["file_name"] + if img_dict["file_name"].startswith("COCO"): + # Convert form the COCO 2014 file naming convention of + # COCO_[train/val/test]2014_000000000000.jpg to the 2017 naming convention of + # 000000000000.jpg (LVIS v1 will fix this naming issue) + file_name = file_name[-16:] + record["file_name"] = os.path.join(image_root, file_name) + record["height"] = img_dict["height"] + record["width"] = img_dict["width"] + record["not_exhaustive_category_ids"] = img_dict.get("not_exhaustive_category_ids", []) + record["neg_category_ids"] = img_dict.get("neg_category_ids", []) + image_id = record["image_id"] = img_dict["id"] + + objs = [] + for anno in anno_dict_list: + # Check that the image_id in this annotation is the same as + # the image_id we're looking at. + # This fails only when the data parsing logic or the annotation file is buggy. + assert anno["image_id"] == image_id + obj = {"bbox": anno["bbox"], "bbox_mode": BoxMode.XYWH_ABS} + obj["category_id"] = anno["category_id"] - 1 # Convert 1-indexed to 0-indexed + segm = anno["segmentation"] # list[list[float]] + # filter out invalid polygons (< 3 points) + valid_segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] + assert len(segm) == len( + valid_segm + ), "Annotation contains an invalid polygon with < 3 points" + assert len(segm) > 0 + obj["segmentation"] = segm + objs.append(obj) + record["annotations"] = objs + dataset_dicts.append(record) + + return dataset_dicts + + +def get_lvis_instances_meta(dataset_name): + """ + Load LVIS metadata. + + Args: + dataset_name (str): LVIS dataset name without the split name (e.g., "lvis_v0.5"). + + Returns: + dict: LVIS metadata with keys: thing_classes + """ + if "cocofied" in dataset_name: + return _get_coco_instances_meta() + if "v0.5" in dataset_name: + return _get_lvis_instances_meta_v0_5() + # There will be a v1 in the future + # elif dataset_name == "lvis_v1": + # return get_lvis_instances_meta_v1() + raise ValueError("No built-in metadata for dataset {}".format(dataset_name)) + + +def _get_lvis_instances_meta_v0_5(): + assert len(LVIS_CATEGORIES) == 1230 + cat_ids = [k["id"] for k in LVIS_CATEGORIES] + assert min(cat_ids) == 1 and max(cat_ids) == len( + cat_ids + ), "Category ids are not in [1, #categories], as expected" + # Ensure that the category list is sorted by id + lvis_categories = sorted(LVIS_CATEGORIES, key=lambda x: x["id"]) + thing_classes = [k["synonyms"][0] for k in lvis_categories] + meta = {"thing_classes": thing_classes} + return meta + + +if __name__ == "__main__": + """ + Test the LVIS json dataset loader. 
+ + Usage: + python -m detectron2.data.datasets.lvis \ + path/to/json path/to/image_root dataset_name vis_limit + """ + import sys + import numpy as np + from detectron2.utils.logger import setup_logger + from PIL import Image + import detectron2.data.datasets # noqa # add pre-defined metadata + from detectron2.utils.visualizer import Visualizer + + logger = setup_logger(name=__name__) + meta = MetadataCatalog.get(sys.argv[3]) + + dicts = load_lvis_json(sys.argv[1], sys.argv[2], sys.argv[3]) + logger.info("Done loading {} samples.".format(len(dicts))) + + dirname = "lvis-data-vis" + os.makedirs(dirname, exist_ok=True) + for d in dicts[: int(sys.argv[4])]: + img = np.array(Image.open(d["file_name"])) + visualizer = Visualizer(img, metadata=meta) + vis = visualizer.draw_dataset_dict(d) + fpath = os.path.join(dirname, os.path.basename(d["file_name"])) + vis.save(fpath) diff --git a/detectron2/data/datasets/lvis_v0_5_categories.py b/detectron2/data/datasets/lvis_v0_5_categories.py new file mode 100644 index 0000000000000000000000000000000000000000..8205e605f85dab3674c6f1600d7675eef86b160f --- /dev/null +++ b/detectron2/data/datasets/lvis_v0_5_categories.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Autogen with +# with open("lvis_v0.5_val.json", "r") as f: +# a = json.load(f) +# c = a["categories"] +# for x in c: +# del x["image_count"] +# del x["instance_count"] +# LVIS_CATEGORIES = repr(c) + " # noqa" + +# fmt: off +LVIS_CATEGORIES = [{'frequency': 'r', 'id': 1, 'synset': 'acorn.n.01', 'synonyms': ['acorn'], 'def': 'nut from an oak tree', 'name': 'acorn'}, {'frequency': 'c', 'id': 2, 'synset': 'aerosol.n.02', 'synonyms': ['aerosol_can', 'spray_can'], 'def': 'a dispenser that holds a substance under pressure', 'name': 'aerosol_can'}, {'frequency': 'f', 'id': 3, 'synset': 'air_conditioner.n.01', 'synonyms': ['air_conditioner'], 'def': 'a machine that keeps air cool and dry', 'name': 'air_conditioner'}, {'frequency': 'f', 'id': 4, 'synset': 'airplane.n.01', 'synonyms': ['airplane', 'aeroplane'], 'def': 'an aircraft that has a fixed wing and is powered by propellers or jets', 'name': 'airplane'}, {'frequency': 'c', 'id': 5, 'synset': 'alarm_clock.n.01', 'synonyms': ['alarm_clock'], 'def': 'a clock that wakes a sleeper at some preset time', 'name': 'alarm_clock'}, {'frequency': 'c', 'id': 6, 'synset': 'alcohol.n.01', 'synonyms': ['alcohol', 'alcoholic_beverage'], 'def': 'a liquor or brew containing alcohol as the active agent', 'name': 'alcohol'}, {'frequency': 'r', 'id': 7, 'synset': 'alligator.n.02', 'synonyms': ['alligator', 'gator'], 'def': 'amphibious reptiles related to crocodiles but with shorter broader snouts', 'name': 'alligator'}, {'frequency': 'c', 'id': 8, 'synset': 'almond.n.02', 'synonyms': ['almond'], 'def': 'oval-shaped edible seed of the almond tree', 'name': 'almond'}, {'frequency': 'c', 'id': 9, 'synset': 'ambulance.n.01', 'synonyms': ['ambulance'], 'def': 'a vehicle that takes people to and from hospitals', 'name': 'ambulance'}, {'frequency': 'r', 'id': 10, 'synset': 'amplifier.n.01', 'synonyms': ['amplifier'], 'def': 'electronic equipment that increases strength of signals', 'name': 'amplifier'}, {'frequency': 'c', 'id': 11, 'synset': 'anklet.n.03', 'synonyms': ['anklet', 'ankle_bracelet'], 'def': 'an ornament worn around the ankle', 'name': 'anklet'}, {'frequency': 'f', 'id': 12, 'synset': 'antenna.n.01', 'synonyms': ['antenna', 'aerial', 'transmitting_aerial'], 'def': 'an electrical device that sends or receives radio or 
television signals', 'name': 'antenna'}, {'frequency': 'f', 'id': 13, 'synset': 'apple.n.01', 'synonyms': ['apple'], 'def': 'fruit with red or yellow or green skin and sweet to tart crisp whitish flesh', 'name': 'apple'}, {'frequency': 'r', 'id': 14, 'synset': 'apple_juice.n.01', 'synonyms': ['apple_juice'], 'def': 'the juice of apples', 'name': 'apple_juice'}, {'frequency': 'r', 'id': 15, 'synset': 'applesauce.n.01', 'synonyms': ['applesauce'], 'def': 'puree of stewed apples usually sweetened and spiced', 'name': 'applesauce'}, {'frequency': 'r', 'id': 16, 'synset': 'apricot.n.02', 'synonyms': ['apricot'], 'def': 'downy yellow to rosy-colored fruit resembling a small peach', 'name': 'apricot'}, {'frequency': 'f', 'id': 17, 'synset': 'apron.n.01', 'synonyms': ['apron'], 'def': 'a garment of cloth that is tied about the waist and worn to protect clothing', 'name': 'apron'}, {'frequency': 'c', 'id': 18, 'synset': 'aquarium.n.01', 'synonyms': ['aquarium', 'fish_tank'], 'def': 'a tank/pool/bowl filled with water for keeping live fish and underwater animals', 'name': 'aquarium'}, {'frequency': 'c', 'id': 19, 'synset': 'armband.n.02', 'synonyms': ['armband'], 'def': 'a band worn around the upper arm', 'name': 'armband'}, {'frequency': 'f', 'id': 20, 'synset': 'armchair.n.01', 'synonyms': ['armchair'], 'def': 'chair with a support on each side for arms', 'name': 'armchair'}, {'frequency': 'r', 'id': 21, 'synset': 'armoire.n.01', 'synonyms': ['armoire'], 'def': 'a large wardrobe or cabinet', 'name': 'armoire'}, {'frequency': 'r', 'id': 22, 'synset': 'armor.n.01', 'synonyms': ['armor', 'armour'], 'def': 'protective covering made of metal and used in combat', 'name': 'armor'}, {'frequency': 'c', 'id': 23, 'synset': 'artichoke.n.02', 'synonyms': ['artichoke'], 'def': 'a thistlelike flower head with edible fleshy leaves and heart', 'name': 'artichoke'}, {'frequency': 'f', 'id': 24, 'synset': 'ashcan.n.01', 'synonyms': ['trash_can', 'garbage_can', 'wastebin', 'dustbin', 'trash_barrel', 'trash_bin'], 'def': 'a bin that holds rubbish until it is collected', 'name': 'trash_can'}, {'frequency': 'c', 'id': 25, 'synset': 'ashtray.n.01', 'synonyms': ['ashtray'], 'def': "a receptacle for the ash from smokers' cigars or cigarettes", 'name': 'ashtray'}, {'frequency': 'c', 'id': 26, 'synset': 'asparagus.n.02', 'synonyms': ['asparagus'], 'def': 'edible young shoots of the asparagus plant', 'name': 'asparagus'}, {'frequency': 'c', 'id': 27, 'synset': 'atomizer.n.01', 'synonyms': ['atomizer', 'atomiser', 'spray', 'sprayer', 'nebulizer', 'nebuliser'], 'def': 'a dispenser that turns a liquid (such as perfume) into a fine mist', 'name': 'atomizer'}, {'frequency': 'c', 'id': 28, 'synset': 'avocado.n.01', 'synonyms': ['avocado'], 'def': 'a pear-shaped fruit with green or blackish skin and rich yellowish pulp enclosing a single large seed', 'name': 'avocado'}, {'frequency': 'c', 'id': 29, 'synset': 'award.n.02', 'synonyms': ['award', 'accolade'], 'def': 'a tangible symbol signifying approval or distinction', 'name': 'award'}, {'frequency': 'f', 'id': 30, 'synset': 'awning.n.01', 'synonyms': ['awning'], 'def': 'a canopy made of canvas to shelter people or things from rain or sun', 'name': 'awning'}, {'frequency': 'r', 'id': 31, 'synset': 'ax.n.01', 'synonyms': ['ax', 'axe'], 'def': 'an edge tool with a heavy bladed head mounted across a handle', 'name': 'ax'}, {'frequency': 'f', 'id': 32, 'synset': 'baby_buggy.n.01', 'synonyms': ['baby_buggy', 'baby_carriage', 'perambulator', 'pram', 'stroller'], 'def': 'a small vehicle 
with four wheels in which a baby or child is pushed around', 'name': 'baby_buggy'}, {'frequency': 'c', 'id': 33, 'synset': 'backboard.n.01', 'synonyms': ['basketball_backboard'], 'def': 'a raised vertical board with basket attached; used to play basketball', 'name': 'basketball_backboard'}, {'frequency': 'f', 'id': 34, 'synset': 'backpack.n.01', 'synonyms': ['backpack', 'knapsack', 'packsack', 'rucksack', 'haversack'], 'def': 'a bag carried by a strap on your back or shoulder', 'name': 'backpack'}, {'frequency': 'f', 'id': 35, 'synset': 'bag.n.04', 'synonyms': ['handbag', 'purse', 'pocketbook'], 'def': 'a container used for carrying money and small personal items or accessories', 'name': 'handbag'}, {'frequency': 'f', 'id': 36, 'synset': 'bag.n.06', 'synonyms': ['suitcase', 'baggage', 'luggage'], 'def': 'cases used to carry belongings when traveling', 'name': 'suitcase'}, {'frequency': 'c', 'id': 37, 'synset': 'bagel.n.01', 'synonyms': ['bagel', 'beigel'], 'def': 'glazed yeast-raised doughnut-shaped roll with hard crust', 'name': 'bagel'}, {'frequency': 'r', 'id': 38, 'synset': 'bagpipe.n.01', 'synonyms': ['bagpipe'], 'def': 'a tubular wind instrument; the player blows air into a bag and squeezes it out', 'name': 'bagpipe'}, {'frequency': 'r', 'id': 39, 'synset': 'baguet.n.01', 'synonyms': ['baguet', 'baguette'], 'def': 'narrow French stick loaf', 'name': 'baguet'}, {'frequency': 'r', 'id': 40, 'synset': 'bait.n.02', 'synonyms': ['bait', 'lure'], 'def': 'something used to lure fish or other animals into danger so they can be trapped or killed', 'name': 'bait'}, {'frequency': 'f', 'id': 41, 'synset': 'ball.n.06', 'synonyms': ['ball'], 'def': 'a spherical object used as a plaything', 'name': 'ball'}, {'frequency': 'r', 'id': 42, 'synset': 'ballet_skirt.n.01', 'synonyms': ['ballet_skirt', 'tutu'], 'def': 'very short skirt worn by ballerinas', 'name': 'ballet_skirt'}, {'frequency': 'f', 'id': 43, 'synset': 'balloon.n.01', 'synonyms': ['balloon'], 'def': 'large tough nonrigid bag filled with gas or heated air', 'name': 'balloon'}, {'frequency': 'c', 'id': 44, 'synset': 'bamboo.n.02', 'synonyms': ['bamboo'], 'def': 'woody tropical grass having hollow woody stems', 'name': 'bamboo'}, {'frequency': 'f', 'id': 45, 'synset': 'banana.n.02', 'synonyms': ['banana'], 'def': 'elongated crescent-shaped yellow fruit with soft sweet flesh', 'name': 'banana'}, {'frequency': 'r', 'id': 46, 'synset': 'band_aid.n.01', 'synonyms': ['Band_Aid'], 'def': 'trade name for an adhesive bandage to cover small cuts or blisters', 'name': 'Band_Aid'}, {'frequency': 'c', 'id': 47, 'synset': 'bandage.n.01', 'synonyms': ['bandage'], 'def': 'a piece of soft material that covers and protects an injured part of the body', 'name': 'bandage'}, {'frequency': 'c', 'id': 48, 'synset': 'bandanna.n.01', 'synonyms': ['bandanna', 'bandana'], 'def': 'large and brightly colored handkerchief; often used as a neckerchief', 'name': 'bandanna'}, {'frequency': 'r', 'id': 49, 'synset': 'banjo.n.01', 'synonyms': ['banjo'], 'def': 'a stringed instrument of the guitar family with a long neck and circular body', 'name': 'banjo'}, {'frequency': 'f', 'id': 50, 'synset': 'banner.n.01', 'synonyms': ['banner', 'streamer'], 'def': 'long strip of cloth or paper used for decoration or advertising', 'name': 'banner'}, {'frequency': 'r', 'id': 51, 'synset': 'barbell.n.01', 'synonyms': ['barbell'], 'def': 'a bar to which heavy discs are attached at each end; used in weightlifting', 'name': 'barbell'}, {'frequency': 'r', 'id': 52, 'synset': 'barge.n.01', 
'synonyms': ['barge'], 'def': 'a flatbottom boat for carrying heavy loads (especially on canals)', 'name': 'barge'}, {'frequency': 'f', 'id': 53, 'synset': 'barrel.n.02', 'synonyms': ['barrel', 'cask'], 'def': 'a cylindrical container that holds liquids', 'name': 'barrel'}, {'frequency': 'c', 'id': 54, 'synset': 'barrette.n.01', 'synonyms': ['barrette'], 'def': "a pin for holding women's hair in place", 'name': 'barrette'}, {'frequency': 'c', 'id': 55, 'synset': 'barrow.n.03', 'synonyms': ['barrow', 'garden_cart', 'lawn_cart', 'wheelbarrow'], 'def': 'a cart for carrying small loads; has handles and one or more wheels', 'name': 'barrow'}, {'frequency': 'f', 'id': 56, 'synset': 'base.n.03', 'synonyms': ['baseball_base'], 'def': 'a place that the runner must touch before scoring', 'name': 'baseball_base'}, {'frequency': 'f', 'id': 57, 'synset': 'baseball.n.02', 'synonyms': ['baseball'], 'def': 'a ball used in playing baseball', 'name': 'baseball'}, {'frequency': 'f', 'id': 58, 'synset': 'baseball_bat.n.01', 'synonyms': ['baseball_bat'], 'def': 'an implement used in baseball by the batter', 'name': 'baseball_bat'}, {'frequency': 'f', 'id': 59, 'synset': 'baseball_cap.n.01', 'synonyms': ['baseball_cap', 'jockey_cap', 'golf_cap'], 'def': 'a cap with a bill', 'name': 'baseball_cap'}, {'frequency': 'f', 'id': 60, 'synset': 'baseball_glove.n.01', 'synonyms': ['baseball_glove', 'baseball_mitt'], 'def': 'the handwear used by fielders in playing baseball', 'name': 'baseball_glove'}, {'frequency': 'f', 'id': 61, 'synset': 'basket.n.01', 'synonyms': ['basket', 'handbasket'], 'def': 'a container that is usually woven and has handles', 'name': 'basket'}, {'frequency': 'c', 'id': 62, 'synset': 'basket.n.03', 'synonyms': ['basketball_hoop'], 'def': 'metal hoop supporting a net through which players try to throw the basketball', 'name': 'basketball_hoop'}, {'frequency': 'c', 'id': 63, 'synset': 'basketball.n.02', 'synonyms': ['basketball'], 'def': 'an inflated ball used in playing basketball', 'name': 'basketball'}, {'frequency': 'r', 'id': 64, 'synset': 'bass_horn.n.01', 'synonyms': ['bass_horn', 'sousaphone', 'tuba'], 'def': 'the lowest brass wind instrument', 'name': 'bass_horn'}, {'frequency': 'r', 'id': 65, 'synset': 'bat.n.01', 'synonyms': ['bat_(animal)'], 'def': 'nocturnal mouselike mammal with forelimbs modified to form membranous wings', 'name': 'bat_(animal)'}, {'frequency': 'f', 'id': 66, 'synset': 'bath_mat.n.01', 'synonyms': ['bath_mat'], 'def': 'a heavy towel or mat to stand on while drying yourself after a bath', 'name': 'bath_mat'}, {'frequency': 'f', 'id': 67, 'synset': 'bath_towel.n.01', 'synonyms': ['bath_towel'], 'def': 'a large towel; to dry yourself after a bath', 'name': 'bath_towel'}, {'frequency': 'c', 'id': 68, 'synset': 'bathrobe.n.01', 'synonyms': ['bathrobe'], 'def': 'a loose-fitting robe of towelling; worn after a bath or swim', 'name': 'bathrobe'}, {'frequency': 'f', 'id': 69, 'synset': 'bathtub.n.01', 'synonyms': ['bathtub', 'bathing_tub'], 'def': 'a large open container that you fill with water and use to wash the body', 'name': 'bathtub'}, {'frequency': 'r', 'id': 70, 'synset': 'batter.n.02', 'synonyms': ['batter_(food)'], 'def': 'a liquid or semiliquid mixture, as of flour, eggs, and milk, used in cooking', 'name': 'batter_(food)'}, {'frequency': 'c', 'id': 71, 'synset': 'battery.n.02', 'synonyms': ['battery'], 'def': 'a portable device that produces electricity', 'name': 'battery'}, {'frequency': 'r', 'id': 72, 'synset': 'beach_ball.n.01', 'synonyms': ['beachball'], 
'def': 'large and light ball; for play at the seaside', 'name': 'beachball'}, {'frequency': 'c', 'id': 73, 'synset': 'bead.n.01', 'synonyms': ['bead'], 'def': 'a small ball with a hole through the middle used for ornamentation, jewellery, etc.', 'name': 'bead'}, {'frequency': 'r', 'id': 74, 'synset': 'beaker.n.01', 'synonyms': ['beaker'], 'def': 'a flatbottomed jar made of glass or plastic; used for chemistry', 'name': 'beaker'}, {'frequency': 'c', 'id': 75, 'synset': 'bean_curd.n.01', 'synonyms': ['bean_curd', 'tofu'], 'def': 'cheeselike food made of curdled soybean milk', 'name': 'bean_curd'}, {'frequency': 'c', 'id': 76, 'synset': 'beanbag.n.01', 'synonyms': ['beanbag'], 'def': 'a bag filled with dried beans or similar items; used in games or to sit on', 'name': 'beanbag'}, {'frequency': 'f', 'id': 77, 'synset': 'beanie.n.01', 'synonyms': ['beanie', 'beany'], 'def': 'a small skullcap; formerly worn by schoolboys and college freshmen', 'name': 'beanie'}, {'frequency': 'f', 'id': 78, 'synset': 'bear.n.01', 'synonyms': ['bear'], 'def': 'large carnivorous or omnivorous mammals with shaggy coats and claws', 'name': 'bear'}, {'frequency': 'f', 'id': 79, 'synset': 'bed.n.01', 'synonyms': ['bed'], 'def': 'a piece of furniture that provides a place to sleep', 'name': 'bed'}, {'frequency': 'c', 'id': 80, 'synset': 'bedspread.n.01', 'synonyms': ['bedspread', 'bedcover', 'bed_covering', 'counterpane', 'spread'], 'def': 'decorative cover for a bed', 'name': 'bedspread'}, {'frequency': 'f', 'id': 81, 'synset': 'beef.n.01', 'synonyms': ['cow'], 'def': 'cattle that are reared for their meat', 'name': 'cow'}, {'frequency': 'c', 'id': 82, 'synset': 'beef.n.02', 'synonyms': ['beef_(food)', 'boeuf_(food)'], 'def': 'meat from an adult domestic bovine', 'name': 'beef_(food)'}, {'frequency': 'r', 'id': 83, 'synset': 'beeper.n.01', 'synonyms': ['beeper', 'pager'], 'def': 'an device that beeps when the person carrying it is being paged', 'name': 'beeper'}, {'frequency': 'f', 'id': 84, 'synset': 'beer_bottle.n.01', 'synonyms': ['beer_bottle'], 'def': 'a bottle that holds beer', 'name': 'beer_bottle'}, {'frequency': 'c', 'id': 85, 'synset': 'beer_can.n.01', 'synonyms': ['beer_can'], 'def': 'a can that holds beer', 'name': 'beer_can'}, {'frequency': 'r', 'id': 86, 'synset': 'beetle.n.01', 'synonyms': ['beetle'], 'def': 'insect with hard wing covers', 'name': 'beetle'}, {'frequency': 'f', 'id': 87, 'synset': 'bell.n.01', 'synonyms': ['bell'], 'def': 'a hollow device made of metal that makes a ringing sound when struck', 'name': 'bell'}, {'frequency': 'f', 'id': 88, 'synset': 'bell_pepper.n.02', 'synonyms': ['bell_pepper', 'capsicum'], 'def': 'large bell-shaped sweet pepper in green or red or yellow or orange or black varieties', 'name': 'bell_pepper'}, {'frequency': 'f', 'id': 89, 'synset': 'belt.n.02', 'synonyms': ['belt'], 'def': 'a band to tie or buckle around the body (usually at the waist)', 'name': 'belt'}, {'frequency': 'f', 'id': 90, 'synset': 'belt_buckle.n.01', 'synonyms': ['belt_buckle'], 'def': 'the buckle used to fasten a belt', 'name': 'belt_buckle'}, {'frequency': 'f', 'id': 91, 'synset': 'bench.n.01', 'synonyms': ['bench'], 'def': 'a long seat for more than one person', 'name': 'bench'}, {'frequency': 'c', 'id': 92, 'synset': 'beret.n.01', 'synonyms': ['beret'], 'def': 'a cap with no brim or bill; made of soft cloth', 'name': 'beret'}, {'frequency': 'c', 'id': 93, 'synset': 'bib.n.02', 'synonyms': ['bib'], 'def': 'a napkin tied under the chin of a child while eating', 'name': 'bib'}, {'frequency': 
'r', 'id': 94, 'synset': 'bible.n.01', 'synonyms': ['Bible'], 'def': 'the sacred writings of the Christian religions', 'name': 'Bible'}, {'frequency': 'f', 'id': 95, 'synset': 'bicycle.n.01', 'synonyms': ['bicycle', 'bike_(bicycle)'], 'def': 'a wheeled vehicle that has two wheels and is moved by foot pedals', 'name': 'bicycle'}, {'frequency': 'f', 'id': 96, 'synset': 'bill.n.09', 'synonyms': ['visor', 'vizor'], 'def': 'a brim that projects to the front to shade the eyes', 'name': 'visor'}, {'frequency': 'c', 'id': 97, 'synset': 'binder.n.03', 'synonyms': ['binder', 'ring-binder'], 'def': 'holds loose papers or magazines', 'name': 'binder'}, {'frequency': 'c', 'id': 98, 'synset': 'binoculars.n.01', 'synonyms': ['binoculars', 'field_glasses', 'opera_glasses'], 'def': 'an optical instrument designed for simultaneous use by both eyes', 'name': 'binoculars'}, {'frequency': 'f', 'id': 99, 'synset': 'bird.n.01', 'synonyms': ['bird'], 'def': 'animal characterized by feathers and wings', 'name': 'bird'}, {'frequency': 'r', 'id': 100, 'synset': 'bird_feeder.n.01', 'synonyms': ['birdfeeder'], 'def': 'an outdoor device that supplies food for wild birds', 'name': 'birdfeeder'}, {'frequency': 'r', 'id': 101, 'synset': 'birdbath.n.01', 'synonyms': ['birdbath'], 'def': 'an ornamental basin (usually in a garden) for birds to bathe in', 'name': 'birdbath'}, {'frequency': 'c', 'id': 102, 'synset': 'birdcage.n.01', 'synonyms': ['birdcage'], 'def': 'a cage in which a bird can be kept', 'name': 'birdcage'}, {'frequency': 'c', 'id': 103, 'synset': 'birdhouse.n.01', 'synonyms': ['birdhouse'], 'def': 'a shelter for birds', 'name': 'birdhouse'}, {'frequency': 'f', 'id': 104, 'synset': 'birthday_cake.n.01', 'synonyms': ['birthday_cake'], 'def': 'decorated cake served at a birthday party', 'name': 'birthday_cake'}, {'frequency': 'r', 'id': 105, 'synset': 'birthday_card.n.01', 'synonyms': ['birthday_card'], 'def': 'a card expressing a birthday greeting', 'name': 'birthday_card'}, {'frequency': 'r', 'id': 106, 'synset': 'biscuit.n.01', 'synonyms': ['biscuit_(bread)'], 'def': 'small round bread leavened with baking-powder or soda', 'name': 'biscuit_(bread)'}, {'frequency': 'r', 'id': 107, 'synset': 'black_flag.n.01', 'synonyms': ['pirate_flag'], 'def': 'a flag usually bearing a white skull and crossbones on a black background', 'name': 'pirate_flag'}, {'frequency': 'c', 'id': 108, 'synset': 'black_sheep.n.02', 'synonyms': ['black_sheep'], 'def': 'sheep with a black coat', 'name': 'black_sheep'}, {'frequency': 'c', 'id': 109, 'synset': 'blackboard.n.01', 'synonyms': ['blackboard', 'chalkboard'], 'def': 'sheet of slate; for writing with chalk', 'name': 'blackboard'}, {'frequency': 'f', 'id': 110, 'synset': 'blanket.n.01', 'synonyms': ['blanket'], 'def': 'bedding that keeps a person warm in bed', 'name': 'blanket'}, {'frequency': 'c', 'id': 111, 'synset': 'blazer.n.01', 'synonyms': ['blazer', 'sport_jacket', 'sport_coat', 'sports_jacket', 'sports_coat'], 'def': 'lightweight jacket; often striped in the colors of a club or school', 'name': 'blazer'}, {'frequency': 'f', 'id': 112, 'synset': 'blender.n.01', 'synonyms': ['blender', 'liquidizer', 'liquidiser'], 'def': 'an electrically powered mixer that mix or chop or liquefy foods', 'name': 'blender'}, {'frequency': 'r', 'id': 113, 'synset': 'blimp.n.02', 'synonyms': ['blimp'], 'def': 'a small nonrigid airship used for observation or as a barrage balloon', 'name': 'blimp'}, {'frequency': 'c', 'id': 114, 'synset': 'blinker.n.01', 'synonyms': ['blinker', 'flasher'], 'def': 'a 
light that flashes on and off; used as a signal or to send messages', 'name': 'blinker'}, {'frequency': 'c', 'id': 115, 'synset': 'blueberry.n.02', 'synonyms': ['blueberry'], 'def': 'sweet edible dark-blue berries of blueberry plants', 'name': 'blueberry'}, {'frequency': 'r', 'id': 116, 'synset': 'boar.n.02', 'synonyms': ['boar'], 'def': 'an uncastrated male hog', 'name': 'boar'}, {'frequency': 'r', 'id': 117, 'synset': 'board.n.09', 'synonyms': ['gameboard'], 'def': 'a flat portable surface (usually rectangular) designed for board games', 'name': 'gameboard'}, {'frequency': 'f', 'id': 118, 'synset': 'boat.n.01', 'synonyms': ['boat', 'ship_(boat)'], 'def': 'a vessel for travel on water', 'name': 'boat'}, {'frequency': 'c', 'id': 119, 'synset': 'bobbin.n.01', 'synonyms': ['bobbin', 'spool', 'reel'], 'def': 'a thing around which thread/tape/film or other flexible materials can be wound', 'name': 'bobbin'}, {'frequency': 'r', 'id': 120, 'synset': 'bobby_pin.n.01', 'synonyms': ['bobby_pin', 'hairgrip'], 'def': 'a flat wire hairpin used to hold bobbed hair in place', 'name': 'bobby_pin'}, {'frequency': 'c', 'id': 121, 'synset': 'boiled_egg.n.01', 'synonyms': ['boiled_egg', 'coddled_egg'], 'def': 'egg cooked briefly in the shell in gently boiling water', 'name': 'boiled_egg'}, {'frequency': 'r', 'id': 122, 'synset': 'bolo_tie.n.01', 'synonyms': ['bolo_tie', 'bolo', 'bola_tie', 'bola'], 'def': 'a cord fastened around the neck with an ornamental clasp and worn as a necktie', 'name': 'bolo_tie'}, {'frequency': 'c', 'id': 123, 'synset': 'bolt.n.03', 'synonyms': ['deadbolt'], 'def': 'the part of a lock that is engaged or withdrawn with a key', 'name': 'deadbolt'}, {'frequency': 'f', 'id': 124, 'synset': 'bolt.n.06', 'synonyms': ['bolt'], 'def': 'a screw that screws into a nut to form a fastener', 'name': 'bolt'}, {'frequency': 'r', 'id': 125, 'synset': 'bonnet.n.01', 'synonyms': ['bonnet'], 'def': 'a hat tied under the chin', 'name': 'bonnet'}, {'frequency': 'f', 'id': 126, 'synset': 'book.n.01', 'synonyms': ['book'], 'def': 'a written work or composition that has been published', 'name': 'book'}, {'frequency': 'r', 'id': 127, 'synset': 'book_bag.n.01', 'synonyms': ['book_bag'], 'def': 'a bag in which students carry their books', 'name': 'book_bag'}, {'frequency': 'c', 'id': 128, 'synset': 'bookcase.n.01', 'synonyms': ['bookcase'], 'def': 'a piece of furniture with shelves for storing books', 'name': 'bookcase'}, {'frequency': 'c', 'id': 129, 'synset': 'booklet.n.01', 'synonyms': ['booklet', 'brochure', 'leaflet', 'pamphlet'], 'def': 'a small book usually having a paper cover', 'name': 'booklet'}, {'frequency': 'r', 'id': 130, 'synset': 'bookmark.n.01', 'synonyms': ['bookmark', 'bookmarker'], 'def': 'a marker (a piece of paper or ribbon) placed between the pages of a book', 'name': 'bookmark'}, {'frequency': 'r', 'id': 131, 'synset': 'boom.n.04', 'synonyms': ['boom_microphone', 'microphone_boom'], 'def': 'a pole carrying an overhead microphone projected over a film or tv set', 'name': 'boom_microphone'}, {'frequency': 'f', 'id': 132, 'synset': 'boot.n.01', 'synonyms': ['boot'], 'def': 'footwear that covers the whole foot and lower leg', 'name': 'boot'}, {'frequency': 'f', 'id': 133, 'synset': 'bottle.n.01', 'synonyms': ['bottle'], 'def': 'a glass or plastic vessel used for storing drinks or other liquids', 'name': 'bottle'}, {'frequency': 'c', 'id': 134, 'synset': 'bottle_opener.n.01', 'synonyms': ['bottle_opener'], 'def': 'an opener for removing caps or corks from bottles', 'name': 'bottle_opener'}, 
{'frequency': 'c', 'id': 135, 'synset': 'bouquet.n.01', 'synonyms': ['bouquet'], 'def': 'an arrangement of flowers that is usually given as a present', 'name': 'bouquet'}, {'frequency': 'r', 'id': 136, 'synset': 'bow.n.04', 'synonyms': ['bow_(weapon)'], 'def': 'a weapon for shooting arrows', 'name': 'bow_(weapon)'}, {'frequency': 'f', 'id': 137, 'synset': 'bow.n.08', 'synonyms': ['bow_(decorative_ribbons)'], 'def': 'a decorative interlacing of ribbons', 'name': 'bow_(decorative_ribbons)'}, {'frequency': 'f', 'id': 138, 'synset': 'bow_tie.n.01', 'synonyms': ['bow-tie', 'bowtie'], 'def': "a man's tie that ties in a bow", 'name': 'bow-tie'}, {'frequency': 'f', 'id': 139, 'synset': 'bowl.n.03', 'synonyms': ['bowl'], 'def': 'a dish that is round and open at the top for serving foods', 'name': 'bowl'}, {'frequency': 'r', 'id': 140, 'synset': 'bowl.n.08', 'synonyms': ['pipe_bowl'], 'def': 'a small round container that is open at the top for holding tobacco', 'name': 'pipe_bowl'}, {'frequency': 'c', 'id': 141, 'synset': 'bowler_hat.n.01', 'synonyms': ['bowler_hat', 'bowler', 'derby_hat', 'derby', 'plug_hat'], 'def': 'a felt hat that is round and hard with a narrow brim', 'name': 'bowler_hat'}, {'frequency': 'r', 'id': 142, 'synset': 'bowling_ball.n.01', 'synonyms': ['bowling_ball'], 'def': 'a large ball with finger holes used in the sport of bowling', 'name': 'bowling_ball'}, {'frequency': 'r', 'id': 143, 'synset': 'bowling_pin.n.01', 'synonyms': ['bowling_pin'], 'def': 'a club-shaped wooden object used in bowling', 'name': 'bowling_pin'}, {'frequency': 'r', 'id': 144, 'synset': 'boxing_glove.n.01', 'synonyms': ['boxing_glove'], 'def': 'large glove coverings the fists of a fighter worn for the sport of boxing', 'name': 'boxing_glove'}, {'frequency': 'c', 'id': 145, 'synset': 'brace.n.06', 'synonyms': ['suspenders'], 'def': 'elastic straps that hold trousers up (usually used in the plural)', 'name': 'suspenders'}, {'frequency': 'f', 'id': 146, 'synset': 'bracelet.n.02', 'synonyms': ['bracelet', 'bangle'], 'def': 'jewelry worn around the wrist for decoration', 'name': 'bracelet'}, {'frequency': 'r', 'id': 147, 'synset': 'brass.n.07', 'synonyms': ['brass_plaque'], 'def': 'a memorial made of brass', 'name': 'brass_plaque'}, {'frequency': 'c', 'id': 148, 'synset': 'brassiere.n.01', 'synonyms': ['brassiere', 'bra', 'bandeau'], 'def': 'an undergarment worn by women to support their breasts', 'name': 'brassiere'}, {'frequency': 'c', 'id': 149, 'synset': 'bread-bin.n.01', 'synonyms': ['bread-bin', 'breadbox'], 'def': 'a container used to keep bread or cake in', 'name': 'bread-bin'}, {'frequency': 'r', 'id': 150, 'synset': 'breechcloth.n.01', 'synonyms': ['breechcloth', 'breechclout', 'loincloth'], 'def': 'a garment that provides covering for the loins', 'name': 'breechcloth'}, {'frequency': 'c', 'id': 151, 'synset': 'bridal_gown.n.01', 'synonyms': ['bridal_gown', 'wedding_gown', 'wedding_dress'], 'def': 'a gown worn by the bride at a wedding', 'name': 'bridal_gown'}, {'frequency': 'c', 'id': 152, 'synset': 'briefcase.n.01', 'synonyms': ['briefcase'], 'def': 'a case with a handle; for carrying papers or files or books', 'name': 'briefcase'}, {'frequency': 'c', 'id': 153, 'synset': 'bristle_brush.n.01', 'synonyms': ['bristle_brush'], 'def': 'a brush that is made with the short stiff hairs of an animal or plant', 'name': 'bristle_brush'}, {'frequency': 'f', 'id': 154, 'synset': 'broccoli.n.01', 'synonyms': ['broccoli'], 'def': 'plant with dense clusters of tight green flower buds', 'name': 'broccoli'}, 
{'frequency': 'r', 'id': 155, 'synset': 'brooch.n.01', 'synonyms': ['broach'], 'def': 'a decorative pin worn by women', 'name': 'broach'}, {'frequency': 'c', 'id': 156, 'synset': 'broom.n.01', 'synonyms': ['broom'], 'def': 'bundle of straws or twigs attached to a long handle; used for cleaning', 'name': 'broom'}, {'frequency': 'c', 'id': 157, 'synset': 'brownie.n.03', 'synonyms': ['brownie'], 'def': 'square or bar of very rich chocolate cake usually with nuts', 'name': 'brownie'}, {'frequency': 'c', 'id': 158, 'synset': 'brussels_sprouts.n.01', 'synonyms': ['brussels_sprouts'], 'def': 'the small edible cabbage-like buds growing along a stalk', 'name': 'brussels_sprouts'}, {'frequency': 'r', 'id': 159, 'synset': 'bubble_gum.n.01', 'synonyms': ['bubble_gum'], 'def': 'a kind of chewing gum that can be blown into bubbles', 'name': 'bubble_gum'}, {'frequency': 'f', 'id': 160, 'synset': 'bucket.n.01', 'synonyms': ['bucket', 'pail'], 'def': 'a roughly cylindrical vessel that is open at the top', 'name': 'bucket'}, {'frequency': 'r', 'id': 161, 'synset': 'buggy.n.01', 'synonyms': ['horse_buggy'], 'def': 'a small lightweight carriage; drawn by a single horse', 'name': 'horse_buggy'}, {'frequency': 'c', 'id': 162, 'synset': 'bull.n.11', 'synonyms': ['bull'], 'def': 'mature male cow', 'name': 'bull'}, {'frequency': 'r', 'id': 163, 'synset': 'bulldog.n.01', 'synonyms': ['bulldog'], 'def': 'a thickset short-haired dog with a large head and strong undershot lower jaw', 'name': 'bulldog'}, {'frequency': 'r', 'id': 164, 'synset': 'bulldozer.n.01', 'synonyms': ['bulldozer', 'dozer'], 'def': 'large powerful tractor; a large blade in front flattens areas of ground', 'name': 'bulldozer'}, {'frequency': 'c', 'id': 165, 'synset': 'bullet_train.n.01', 'synonyms': ['bullet_train'], 'def': 'a high-speed passenger train', 'name': 'bullet_train'}, {'frequency': 'c', 'id': 166, 'synset': 'bulletin_board.n.02', 'synonyms': ['bulletin_board', 'notice_board'], 'def': 'a board that hangs on a wall; displays announcements', 'name': 'bulletin_board'}, {'frequency': 'r', 'id': 167, 'synset': 'bulletproof_vest.n.01', 'synonyms': ['bulletproof_vest'], 'def': 'a vest capable of resisting the impact of a bullet', 'name': 'bulletproof_vest'}, {'frequency': 'c', 'id': 168, 'synset': 'bullhorn.n.01', 'synonyms': ['bullhorn', 'megaphone'], 'def': 'a portable loudspeaker with built-in microphone and amplifier', 'name': 'bullhorn'}, {'frequency': 'r', 'id': 169, 'synset': 'bully_beef.n.01', 'synonyms': ['corned_beef', 'corn_beef'], 'def': 'beef cured or pickled in brine', 'name': 'corned_beef'}, {'frequency': 'f', 'id': 170, 'synset': 'bun.n.01', 'synonyms': ['bun', 'roll'], 'def': 'small rounded bread either plain or sweet', 'name': 'bun'}, {'frequency': 'c', 'id': 171, 'synset': 'bunk_bed.n.01', 'synonyms': ['bunk_bed'], 'def': 'beds built one above the other', 'name': 'bunk_bed'}, {'frequency': 'f', 'id': 172, 'synset': 'buoy.n.01', 'synonyms': ['buoy'], 'def': 'a float attached by rope to the seabed to mark channels in a harbor or underwater hazards', 'name': 'buoy'}, {'frequency': 'r', 'id': 173, 'synset': 'burrito.n.01', 'synonyms': ['burrito'], 'def': 'a flour tortilla folded around a filling', 'name': 'burrito'}, {'frequency': 'f', 'id': 174, 'synset': 'bus.n.01', 'synonyms': ['bus_(vehicle)', 'autobus', 'charabanc', 'double-decker', 'motorbus', 'motorcoach'], 'def': 'a vehicle carrying many passengers; used for public transport', 'name': 'bus_(vehicle)'}, {'frequency': 'c', 'id': 175, 'synset': 'business_card.n.01', 
'synonyms': ['business_card'], 'def': "a card on which are printed the person's name and business affiliation", 'name': 'business_card'}, {'frequency': 'c', 'id': 176, 'synset': 'butcher_knife.n.01', 'synonyms': ['butcher_knife'], 'def': 'a large sharp knife for cutting or trimming meat', 'name': 'butcher_knife'}, {'frequency': 'c', 'id': 177, 'synset': 'butter.n.01', 'synonyms': ['butter'], 'def': 'an edible emulsion of fat globules made by churning milk or cream; for cooking and table use', 'name': 'butter'}, {'frequency': 'c', 'id': 178, 'synset': 'butterfly.n.01', 'synonyms': ['butterfly'], 'def': 'insect typically having a slender body with knobbed antennae and broad colorful wings', 'name': 'butterfly'}, {'frequency': 'f', 'id': 179, 'synset': 'button.n.01', 'synonyms': ['button'], 'def': 'a round fastener sewn to shirts and coats etc to fit through buttonholes', 'name': 'button'}, {'frequency': 'f', 'id': 180, 'synset': 'cab.n.03', 'synonyms': ['cab_(taxi)', 'taxi', 'taxicab'], 'def': 'a car that takes passengers where they want to go in exchange for money', 'name': 'cab_(taxi)'}, {'frequency': 'r', 'id': 181, 'synset': 'cabana.n.01', 'synonyms': ['cabana'], 'def': 'a small tent used as a dressing room beside the sea or a swimming pool', 'name': 'cabana'}, {'frequency': 'r', 'id': 182, 'synset': 'cabin_car.n.01', 'synonyms': ['cabin_car', 'caboose'], 'def': 'a car on a freight train for use of the train crew; usually the last car on the train', 'name': 'cabin_car'}, {'frequency': 'f', 'id': 183, 'synset': 'cabinet.n.01', 'synonyms': ['cabinet'], 'def': 'a piece of furniture resembling a cupboard with doors and shelves and drawers', 'name': 'cabinet'}, {'frequency': 'r', 'id': 184, 'synset': 'cabinet.n.03', 'synonyms': ['locker', 'storage_locker'], 'def': 'a storage compartment for clothes and valuables; usually it has a lock', 'name': 'locker'}, {'frequency': 'f', 'id': 185, 'synset': 'cake.n.03', 'synonyms': ['cake'], 'def': 'baked goods made from or based on a mixture of flour, sugar, eggs, and fat', 'name': 'cake'}, {'frequency': 'c', 'id': 186, 'synset': 'calculator.n.02', 'synonyms': ['calculator'], 'def': 'a small machine that is used for mathematical calculations', 'name': 'calculator'}, {'frequency': 'f', 'id': 187, 'synset': 'calendar.n.02', 'synonyms': ['calendar'], 'def': 'a list or register of events (appointments/social events/court cases, etc)', 'name': 'calendar'}, {'frequency': 'c', 'id': 188, 'synset': 'calf.n.01', 'synonyms': ['calf'], 'def': 'young of domestic cattle', 'name': 'calf'}, {'frequency': 'c', 'id': 189, 'synset': 'camcorder.n.01', 'synonyms': ['camcorder'], 'def': 'a portable television camera and videocassette recorder', 'name': 'camcorder'}, {'frequency': 'c', 'id': 190, 'synset': 'camel.n.01', 'synonyms': ['camel'], 'def': 'cud-chewing mammal used as a draft or saddle animal in desert regions', 'name': 'camel'}, {'frequency': 'f', 'id': 191, 'synset': 'camera.n.01', 'synonyms': ['camera'], 'def': 'equipment for taking photographs', 'name': 'camera'}, {'frequency': 'c', 'id': 192, 'synset': 'camera_lens.n.01', 'synonyms': ['camera_lens'], 'def': 'a lens that focuses the image in a camera', 'name': 'camera_lens'}, {'frequency': 'c', 'id': 193, 'synset': 'camper.n.02', 'synonyms': ['camper_(vehicle)', 'camping_bus', 'motor_home'], 'def': 'a recreational vehicle equipped for camping out while traveling', 'name': 'camper_(vehicle)'}, {'frequency': 'f', 'id': 194, 'synset': 'can.n.01', 'synonyms': ['can', 'tin_can'], 'def': 'airtight sealed metal 
container for food or drink or paint etc.', 'name': 'can'}, {'frequency': 'c', 'id': 195, 'synset': 'can_opener.n.01', 'synonyms': ['can_opener', 'tin_opener'], 'def': 'a device for cutting cans open', 'name': 'can_opener'}, {'frequency': 'r', 'id': 196, 'synset': 'candelabrum.n.01', 'synonyms': ['candelabrum', 'candelabra'], 'def': 'branched candlestick; ornamental; has several lights', 'name': 'candelabrum'}, {'frequency': 'f', 'id': 197, 'synset': 'candle.n.01', 'synonyms': ['candle', 'candlestick'], 'def': 'stick of wax with a wick in the middle', 'name': 'candle'}, {'frequency': 'f', 'id': 198, 'synset': 'candlestick.n.01', 'synonyms': ['candle_holder'], 'def': 'a holder with sockets for candles', 'name': 'candle_holder'}, {'frequency': 'r', 'id': 199, 'synset': 'candy_bar.n.01', 'synonyms': ['candy_bar'], 'def': 'a candy shaped as a bar', 'name': 'candy_bar'}, {'frequency': 'c', 'id': 200, 'synset': 'candy_cane.n.01', 'synonyms': ['candy_cane'], 'def': 'a hard candy in the shape of a rod (usually with stripes)', 'name': 'candy_cane'}, {'frequency': 'c', 'id': 201, 'synset': 'cane.n.01', 'synonyms': ['walking_cane'], 'def': 'a stick that people can lean on to help them walk', 'name': 'walking_cane'}, {'frequency': 'c', 'id': 202, 'synset': 'canister.n.02', 'synonyms': ['canister', 'cannister'], 'def': 'metal container for storing dry foods such as tea or flour', 'name': 'canister'}, {'frequency': 'r', 'id': 203, 'synset': 'cannon.n.02', 'synonyms': ['cannon'], 'def': 'heavy gun fired from a tank', 'name': 'cannon'}, {'frequency': 'c', 'id': 204, 'synset': 'canoe.n.01', 'synonyms': ['canoe'], 'def': 'small and light boat; pointed at both ends; propelled with a paddle', 'name': 'canoe'}, {'frequency': 'r', 'id': 205, 'synset': 'cantaloup.n.02', 'synonyms': ['cantaloup', 'cantaloupe'], 'def': 'the fruit of a cantaloup vine; small to medium-sized melon with yellowish flesh', 'name': 'cantaloup'}, {'frequency': 'r', 'id': 206, 'synset': 'canteen.n.01', 'synonyms': ['canteen'], 'def': 'a flask for carrying water; used by soldiers or travelers', 'name': 'canteen'}, {'frequency': 'c', 'id': 207, 'synset': 'cap.n.01', 'synonyms': ['cap_(headwear)'], 'def': 'a tight-fitting headwear', 'name': 'cap_(headwear)'}, {'frequency': 'f', 'id': 208, 'synset': 'cap.n.02', 'synonyms': ['bottle_cap', 'cap_(container_lid)'], 'def': 'a top (as for a bottle)', 'name': 'bottle_cap'}, {'frequency': 'r', 'id': 209, 'synset': 'cape.n.02', 'synonyms': ['cape'], 'def': 'a sleeveless garment like a cloak but shorter', 'name': 'cape'}, {'frequency': 'c', 'id': 210, 'synset': 'cappuccino.n.01', 'synonyms': ['cappuccino', 'coffee_cappuccino'], 'def': 'equal parts of espresso and steamed milk', 'name': 'cappuccino'}, {'frequency': 'f', 'id': 211, 'synset': 'car.n.01', 'synonyms': ['car_(automobile)', 'auto_(automobile)', 'automobile'], 'def': 'a motor vehicle with four wheels', 'name': 'car_(automobile)'}, {'frequency': 'f', 'id': 212, 'synset': 'car.n.02', 'synonyms': ['railcar_(part_of_a_train)', 'railway_car_(part_of_a_train)', 'railroad_car_(part_of_a_train)'], 'def': 'a wheeled vehicle adapted to the rails of railroad', 'name': 'railcar_(part_of_a_train)'}, {'frequency': 'r', 'id': 213, 'synset': 'car.n.04', 'synonyms': ['elevator_car'], 'def': 'where passengers ride up and down', 'name': 'elevator_car'}, {'frequency': 'r', 'id': 214, 'synset': 'car_battery.n.01', 'synonyms': ['car_battery', 'automobile_battery'], 'def': 'a battery in a motor vehicle', 'name': 'car_battery'}, {'frequency': 'c', 'id': 215, 'synset': 
'card.n.02', 'synonyms': ['identity_card'], 'def': 'a card certifying the identity of the bearer', 'name': 'identity_card'}, {'frequency': 'c', 'id': 216, 'synset': 'card.n.03', 'synonyms': ['card'], 'def': 'a rectangular piece of paper used to send messages (e.g. greetings or pictures)', 'name': 'card'}, {'frequency': 'r', 'id': 217, 'synset': 'cardigan.n.01', 'synonyms': ['cardigan'], 'def': 'knitted jacket that is fastened up the front with buttons or a zipper', 'name': 'cardigan'}, {'frequency': 'r', 'id': 218, 'synset': 'cargo_ship.n.01', 'synonyms': ['cargo_ship', 'cargo_vessel'], 'def': 'a ship designed to carry cargo', 'name': 'cargo_ship'}, {'frequency': 'r', 'id': 219, 'synset': 'carnation.n.01', 'synonyms': ['carnation'], 'def': 'plant with pink to purple-red spice-scented usually double flowers', 'name': 'carnation'}, {'frequency': 'c', 'id': 220, 'synset': 'carriage.n.02', 'synonyms': ['horse_carriage'], 'def': 'a vehicle with wheels drawn by one or more horses', 'name': 'horse_carriage'}, {'frequency': 'f', 'id': 221, 'synset': 'carrot.n.01', 'synonyms': ['carrot'], 'def': 'deep orange edible root of the cultivated carrot plant', 'name': 'carrot'}, {'frequency': 'c', 'id': 222, 'synset': 'carryall.n.01', 'synonyms': ['tote_bag'], 'def': 'a capacious bag or basket', 'name': 'tote_bag'}, {'frequency': 'c', 'id': 223, 'synset': 'cart.n.01', 'synonyms': ['cart'], 'def': 'a heavy open wagon usually having two wheels and drawn by an animal', 'name': 'cart'}, {'frequency': 'c', 'id': 224, 'synset': 'carton.n.02', 'synonyms': ['carton'], 'def': 'a box made of cardboard; opens by flaps on top', 'name': 'carton'}, {'frequency': 'c', 'id': 225, 'synset': 'cash_register.n.01', 'synonyms': ['cash_register', 'register_(for_cash_transactions)'], 'def': 'a cashbox with an adding machine to register transactions', 'name': 'cash_register'}, {'frequency': 'r', 'id': 226, 'synset': 'casserole.n.01', 'synonyms': ['casserole'], 'def': 'food cooked and served in a casserole', 'name': 'casserole'}, {'frequency': 'r', 'id': 227, 'synset': 'cassette.n.01', 'synonyms': ['cassette'], 'def': 'a container that holds a magnetic tape used for recording or playing sound or video', 'name': 'cassette'}, {'frequency': 'c', 'id': 228, 'synset': 'cast.n.05', 'synonyms': ['cast', 'plaster_cast', 'plaster_bandage'], 'def': 'bandage consisting of a firm covering that immobilizes broken bones while they heal', 'name': 'cast'}, {'frequency': 'f', 'id': 229, 'synset': 'cat.n.01', 'synonyms': ['cat'], 'def': 'a domestic house cat', 'name': 'cat'}, {'frequency': 'c', 'id': 230, 'synset': 'cauliflower.n.02', 'synonyms': ['cauliflower'], 'def': 'edible compact head of white undeveloped flowers', 'name': 'cauliflower'}, {'frequency': 'r', 'id': 231, 'synset': 'caviar.n.01', 'synonyms': ['caviar', 'caviare'], 'def': "salted roe of sturgeon or other large fish; usually served as an hors d'oeuvre", 'name': 'caviar'}, {'frequency': 'c', 'id': 232, 'synset': 'cayenne.n.02', 'synonyms': ['cayenne_(spice)', 'cayenne_pepper_(spice)', 'red_pepper_(spice)'], 'def': 'ground pods and seeds of pungent red peppers of the genus Capsicum', 'name': 'cayenne_(spice)'}, {'frequency': 'c', 'id': 233, 'synset': 'cd_player.n.01', 'synonyms': ['CD_player'], 'def': 'electronic equipment for playing compact discs (CDs)', 'name': 'CD_player'}, {'frequency': 'c', 'id': 234, 'synset': 'celery.n.01', 'synonyms': ['celery'], 'def': 'widely cultivated herb with aromatic leaf stalks that are eaten raw or cooked', 'name': 'celery'}, {'frequency': 'f', 
'id': 235, 'synset': 'cellular_telephone.n.01', 'synonyms': ['cellular_telephone', 'cellular_phone', 'cellphone', 'mobile_phone', 'smart_phone'], 'def': 'a hand-held mobile telephone', 'name': 'cellular_telephone'}, {'frequency': 'r', 'id': 236, 'synset': 'chain_mail.n.01', 'synonyms': ['chain_mail', 'ring_mail', 'chain_armor', 'chain_armour', 'ring_armor', 'ring_armour'], 'def': '(Middle Ages) flexible armor made of interlinked metal rings', 'name': 'chain_mail'}, {'frequency': 'f', 'id': 237, 'synset': 'chair.n.01', 'synonyms': ['chair'], 'def': 'a seat for one person, with a support for the back', 'name': 'chair'}, {'frequency': 'r', 'id': 238, 'synset': 'chaise_longue.n.01', 'synonyms': ['chaise_longue', 'chaise', 'daybed'], 'def': 'a long chair; for reclining', 'name': 'chaise_longue'}, {'frequency': 'r', 'id': 239, 'synset': 'champagne.n.01', 'synonyms': ['champagne'], 'def': 'a white sparkling wine produced in Champagne or resembling that produced there', 'name': 'champagne'}, {'frequency': 'f', 'id': 240, 'synset': 'chandelier.n.01', 'synonyms': ['chandelier'], 'def': 'branched lighting fixture; often ornate; hangs from the ceiling', 'name': 'chandelier'}, {'frequency': 'r', 'id': 241, 'synset': 'chap.n.04', 'synonyms': ['chap'], 'def': 'leather leggings without a seat; worn over trousers by cowboys to protect their legs', 'name': 'chap'}, {'frequency': 'r', 'id': 242, 'synset': 'checkbook.n.01', 'synonyms': ['checkbook', 'chequebook'], 'def': 'a book issued to holders of checking accounts', 'name': 'checkbook'}, {'frequency': 'r', 'id': 243, 'synset': 'checkerboard.n.01', 'synonyms': ['checkerboard'], 'def': 'a board having 64 squares of two alternating colors', 'name': 'checkerboard'}, {'frequency': 'c', 'id': 244, 'synset': 'cherry.n.03', 'synonyms': ['cherry'], 'def': 'a red fruit with a single hard stone', 'name': 'cherry'}, {'frequency': 'r', 'id': 245, 'synset': 'chessboard.n.01', 'synonyms': ['chessboard'], 'def': 'a checkerboard used to play chess', 'name': 'chessboard'}, {'frequency': 'r', 'id': 246, 'synset': 'chest_of_drawers.n.01', 'synonyms': ['chest_of_drawers_(furniture)', 'bureau_(furniture)', 'chest_(furniture)'], 'def': 'furniture with drawers for keeping clothes', 'name': 'chest_of_drawers_(furniture)'}, {'frequency': 'c', 'id': 247, 'synset': 'chicken.n.02', 'synonyms': ['chicken_(animal)'], 'def': 'a domestic fowl bred for flesh or eggs', 'name': 'chicken_(animal)'}, {'frequency': 'c', 'id': 248, 'synset': 'chicken_wire.n.01', 'synonyms': ['chicken_wire'], 'def': 'a galvanized wire network with a hexagonal mesh; used to build fences', 'name': 'chicken_wire'}, {'frequency': 'r', 'id': 249, 'synset': 'chickpea.n.01', 'synonyms': ['chickpea', 'garbanzo'], 'def': 'the seed of the chickpea plant; usually dried', 'name': 'chickpea'}, {'frequency': 'r', 'id': 250, 'synset': 'chihuahua.n.03', 'synonyms': ['Chihuahua'], 'def': 'an old breed of tiny short-haired dog with protruding eyes from Mexico', 'name': 'Chihuahua'}, {'frequency': 'r', 'id': 251, 'synset': 'chili.n.02', 'synonyms': ['chili_(vegetable)', 'chili_pepper_(vegetable)', 'chilli_(vegetable)', 'chilly_(vegetable)', 'chile_(vegetable)'], 'def': 'very hot and finely tapering pepper of special pungency', 'name': 'chili_(vegetable)'}, {'frequency': 'r', 'id': 252, 'synset': 'chime.n.01', 'synonyms': ['chime', 'gong'], 'def': 'an instrument consisting of a set of bells that are struck with a hammer', 'name': 'chime'}, {'frequency': 'r', 'id': 253, 'synset': 'chinaware.n.01', 'synonyms': ['chinaware'], 'def': 
'dishware made of high quality porcelain', 'name': 'chinaware'}, {'frequency': 'c', 'id': 254, 'synset': 'chip.n.04', 'synonyms': ['crisp_(potato_chip)', 'potato_chip'], 'def': 'a thin crisp slice of potato fried in deep fat', 'name': 'crisp_(potato_chip)'}, {'frequency': 'r', 'id': 255, 'synset': 'chip.n.06', 'synonyms': ['poker_chip'], 'def': 'a small disk-shaped counter used to represent money when gambling', 'name': 'poker_chip'}, {'frequency': 'c', 'id': 256, 'synset': 'chocolate_bar.n.01', 'synonyms': ['chocolate_bar'], 'def': 'a bar of chocolate candy', 'name': 'chocolate_bar'}, {'frequency': 'c', 'id': 257, 'synset': 'chocolate_cake.n.01', 'synonyms': ['chocolate_cake'], 'def': 'cake containing chocolate', 'name': 'chocolate_cake'}, {'frequency': 'r', 'id': 258, 'synset': 'chocolate_milk.n.01', 'synonyms': ['chocolate_milk'], 'def': 'milk flavored with chocolate syrup', 'name': 'chocolate_milk'}, {'frequency': 'r', 'id': 259, 'synset': 'chocolate_mousse.n.01', 'synonyms': ['chocolate_mousse'], 'def': 'dessert mousse made with chocolate', 'name': 'chocolate_mousse'}, {'frequency': 'f', 'id': 260, 'synset': 'choker.n.03', 'synonyms': ['choker', 'collar', 'neckband'], 'def': 'necklace that fits tightly around the neck', 'name': 'choker'}, {'frequency': 'f', 'id': 261, 'synset': 'chopping_board.n.01', 'synonyms': ['chopping_board', 'cutting_board', 'chopping_block'], 'def': 'a wooden board where meats or vegetables can be cut', 'name': 'chopping_board'}, {'frequency': 'c', 'id': 262, 'synset': 'chopstick.n.01', 'synonyms': ['chopstick'], 'def': 'one of a pair of slender sticks used as oriental tableware to eat food with', 'name': 'chopstick'}, {'frequency': 'f', 'id': 263, 'synset': 'christmas_tree.n.05', 'synonyms': ['Christmas_tree'], 'def': 'an ornamented evergreen used as a Christmas decoration', 'name': 'Christmas_tree'}, {'frequency': 'c', 'id': 264, 'synset': 'chute.n.02', 'synonyms': ['slide'], 'def': 'sloping channel through which things can descend', 'name': 'slide'}, {'frequency': 'r', 'id': 265, 'synset': 'cider.n.01', 'synonyms': ['cider', 'cyder'], 'def': 'a beverage made from juice pressed from apples', 'name': 'cider'}, {'frequency': 'r', 'id': 266, 'synset': 'cigar_box.n.01', 'synonyms': ['cigar_box'], 'def': 'a box for holding cigars', 'name': 'cigar_box'}, {'frequency': 'c', 'id': 267, 'synset': 'cigarette.n.01', 'synonyms': ['cigarette'], 'def': 'finely ground tobacco wrapped in paper; for smoking', 'name': 'cigarette'}, {'frequency': 'c', 'id': 268, 'synset': 'cigarette_case.n.01', 'synonyms': ['cigarette_case', 'cigarette_pack'], 'def': 'a small flat case for holding cigarettes', 'name': 'cigarette_case'}, {'frequency': 'f', 'id': 269, 'synset': 'cistern.n.02', 'synonyms': ['cistern', 'water_tank'], 'def': 'a tank that holds the water used to flush a toilet', 'name': 'cistern'}, {'frequency': 'r', 'id': 270, 'synset': 'clarinet.n.01', 'synonyms': ['clarinet'], 'def': 'a single-reed instrument with a straight tube', 'name': 'clarinet'}, {'frequency': 'r', 'id': 271, 'synset': 'clasp.n.01', 'synonyms': ['clasp'], 'def': 'a fastener (as a buckle or hook) that is used to hold two things together', 'name': 'clasp'}, {'frequency': 'c', 'id': 272, 'synset': 'cleansing_agent.n.01', 'synonyms': ['cleansing_agent', 'cleanser', 'cleaner'], 'def': 'a preparation used in cleaning something', 'name': 'cleansing_agent'}, {'frequency': 'r', 'id': 273, 'synset': 'clementine.n.01', 'synonyms': ['clementine'], 'def': 'a variety of mandarin orange', 'name': 'clementine'}, 
{'frequency': 'c', 'id': 274, 'synset': 'clip.n.03', 'synonyms': ['clip'], 'def': 'any of various small fasteners used to hold loose articles together', 'name': 'clip'}, {'frequency': 'c', 'id': 275, 'synset': 'clipboard.n.01', 'synonyms': ['clipboard'], 'def': 'a small writing board with a clip at the top for holding papers', 'name': 'clipboard'}, {'frequency': 'f', 'id': 276, 'synset': 'clock.n.01', 'synonyms': ['clock', 'timepiece', 'timekeeper'], 'def': 'a timepiece that shows the time of day', 'name': 'clock'}, {'frequency': 'f', 'id': 277, 'synset': 'clock_tower.n.01', 'synonyms': ['clock_tower'], 'def': 'a tower with a large clock visible high up on an outside face', 'name': 'clock_tower'}, {'frequency': 'c', 'id': 278, 'synset': 'clothes_hamper.n.01', 'synonyms': ['clothes_hamper', 'laundry_basket', 'clothes_basket'], 'def': 'a hamper that holds dirty clothes to be washed or wet clothes to be dried', 'name': 'clothes_hamper'}, {'frequency': 'c', 'id': 279, 'synset': 'clothespin.n.01', 'synonyms': ['clothespin', 'clothes_peg'], 'def': 'wood or plastic fastener; for holding clothes on a clothesline', 'name': 'clothespin'}, {'frequency': 'r', 'id': 280, 'synset': 'clutch_bag.n.01', 'synonyms': ['clutch_bag'], 'def': "a woman's strapless purse that is carried in the hand", 'name': 'clutch_bag'}, {'frequency': 'f', 'id': 281, 'synset': 'coaster.n.03', 'synonyms': ['coaster'], 'def': 'a covering (plate or mat) that protects the surface of a table', 'name': 'coaster'}, {'frequency': 'f', 'id': 282, 'synset': 'coat.n.01', 'synonyms': ['coat'], 'def': 'an outer garment that has sleeves and covers the body from shoulder down', 'name': 'coat'}, {'frequency': 'c', 'id': 283, 'synset': 'coat_hanger.n.01', 'synonyms': ['coat_hanger', 'clothes_hanger', 'dress_hanger'], 'def': "a hanger that is shaped like a person's shoulders", 'name': 'coat_hanger'}, {'frequency': 'r', 'id': 284, 'synset': 'coatrack.n.01', 'synonyms': ['coatrack', 'hatrack'], 'def': 'a rack with hooks for temporarily holding coats and hats', 'name': 'coatrack'}, {'frequency': 'c', 'id': 285, 'synset': 'cock.n.04', 'synonyms': ['cock', 'rooster'], 'def': 'adult male chicken', 'name': 'cock'}, {'frequency': 'c', 'id': 286, 'synset': 'coconut.n.02', 'synonyms': ['coconut', 'cocoanut'], 'def': 'large hard-shelled brown oval nut with a fibrous husk', 'name': 'coconut'}, {'frequency': 'r', 'id': 287, 'synset': 'coffee_filter.n.01', 'synonyms': ['coffee_filter'], 'def': 'filter (usually of paper) that passes the coffee and retains the coffee grounds', 'name': 'coffee_filter'}, {'frequency': 'f', 'id': 288, 'synset': 'coffee_maker.n.01', 'synonyms': ['coffee_maker', 'coffee_machine'], 'def': 'a kitchen appliance for brewing coffee automatically', 'name': 'coffee_maker'}, {'frequency': 'f', 'id': 289, 'synset': 'coffee_table.n.01', 'synonyms': ['coffee_table', 'cocktail_table'], 'def': 'low table where magazines can be placed and coffee or cocktails are served', 'name': 'coffee_table'}, {'frequency': 'c', 'id': 290, 'synset': 'coffeepot.n.01', 'synonyms': ['coffeepot'], 'def': 'tall pot in which coffee is brewed', 'name': 'coffeepot'}, {'frequency': 'r', 'id': 291, 'synset': 'coil.n.05', 'synonyms': ['coil'], 'def': 'tubing that is wound in a spiral', 'name': 'coil'}, {'frequency': 'c', 'id': 292, 'synset': 'coin.n.01', 'synonyms': ['coin'], 'def': 'a flat metal piece (usually a disc) used as money', 'name': 'coin'}, {'frequency': 'r', 'id': 293, 'synset': 'colander.n.01', 'synonyms': ['colander', 'cullender'], 'def': 'bowl-shaped 
strainer; used to wash or drain foods', 'name': 'colander'}, {'frequency': 'c', 'id': 294, 'synset': 'coleslaw.n.01', 'synonyms': ['coleslaw', 'slaw'], 'def': 'basically shredded cabbage', 'name': 'coleslaw'}, {'frequency': 'r', 'id': 295, 'synset': 'coloring_material.n.01', 'synonyms': ['coloring_material', 'colouring_material'], 'def': 'any material used for its color', 'name': 'coloring_material'}, {'frequency': 'r', 'id': 296, 'synset': 'combination_lock.n.01', 'synonyms': ['combination_lock'], 'def': 'lock that can be opened only by turning dials in a special sequence', 'name': 'combination_lock'}, {'frequency': 'c', 'id': 297, 'synset': 'comforter.n.04', 'synonyms': ['pacifier', 'teething_ring'], 'def': 'device used for an infant to suck or bite on', 'name': 'pacifier'}, {'frequency': 'r', 'id': 298, 'synset': 'comic_book.n.01', 'synonyms': ['comic_book'], 'def': 'a magazine devoted to comic strips', 'name': 'comic_book'}, {'frequency': 'f', 'id': 299, 'synset': 'computer_keyboard.n.01', 'synonyms': ['computer_keyboard', 'keyboard_(computer)'], 'def': 'a keyboard that is a data input device for computers', 'name': 'computer_keyboard'}, {'frequency': 'r', 'id': 300, 'synset': 'concrete_mixer.n.01', 'synonyms': ['concrete_mixer', 'cement_mixer'], 'def': 'a machine with a large revolving drum in which cement/concrete is mixed', 'name': 'concrete_mixer'}, {'frequency': 'f', 'id': 301, 'synset': 'cone.n.01', 'synonyms': ['cone', 'traffic_cone'], 'def': 'a cone-shaped object used to direct traffic', 'name': 'cone'}, {'frequency': 'f', 'id': 302, 'synset': 'control.n.09', 'synonyms': ['control', 'controller'], 'def': 'a mechanism that controls the operation of a machine', 'name': 'control'}, {'frequency': 'r', 'id': 303, 'synset': 'convertible.n.01', 'synonyms': ['convertible_(automobile)'], 'def': 'a car that has top that can be folded or removed', 'name': 'convertible_(automobile)'}, {'frequency': 'r', 'id': 304, 'synset': 'convertible.n.03', 'synonyms': ['sofa_bed'], 'def': 'a sofa that can be converted into a bed', 'name': 'sofa_bed'}, {'frequency': 'c', 'id': 305, 'synset': 'cookie.n.01', 'synonyms': ['cookie', 'cooky', 'biscuit_(cookie)'], 'def': "any of various small flat sweet cakes (`biscuit' is the British term)", 'name': 'cookie'}, {'frequency': 'r', 'id': 306, 'synset': 'cookie_jar.n.01', 'synonyms': ['cookie_jar', 'cooky_jar'], 'def': 'a jar in which cookies are kept (and sometimes money is hidden)', 'name': 'cookie_jar'}, {'frequency': 'r', 'id': 307, 'synset': 'cooking_utensil.n.01', 'synonyms': ['cooking_utensil'], 'def': 'a kitchen utensil made of material that does not melt easily; used for cooking', 'name': 'cooking_utensil'}, {'frequency': 'f', 'id': 308, 'synset': 'cooler.n.01', 'synonyms': ['cooler_(for_food)', 'ice_chest'], 'def': 'an insulated box for storing food often with ice', 'name': 'cooler_(for_food)'}, {'frequency': 'c', 'id': 309, 'synset': 'cork.n.04', 'synonyms': ['cork_(bottle_plug)', 'bottle_cork'], 'def': 'the plug in the mouth of a bottle (especially a wine bottle)', 'name': 'cork_(bottle_plug)'}, {'frequency': 'r', 'id': 310, 'synset': 'corkboard.n.01', 'synonyms': ['corkboard'], 'def': 'a sheet consisting of cork granules', 'name': 'corkboard'}, {'frequency': 'r', 'id': 311, 'synset': 'corkscrew.n.01', 'synonyms': ['corkscrew', 'bottle_screw'], 'def': 'a bottle opener that pulls corks', 'name': 'corkscrew'}, {'frequency': 'c', 'id': 312, 'synset': 'corn.n.03', 'synonyms': ['edible_corn', 'corn', 'maize'], 'def': 'ears of corn that can be prepared and 
served for human food', 'name': 'edible_corn'}, {'frequency': 'r', 'id': 313, 'synset': 'cornbread.n.01', 'synonyms': ['cornbread'], 'def': 'bread made primarily of cornmeal', 'name': 'cornbread'}, {'frequency': 'c', 'id': 314, 'synset': 'cornet.n.01', 'synonyms': ['cornet', 'horn', 'trumpet'], 'def': 'a brass musical instrument with a narrow tube and a flared bell and many valves', 'name': 'cornet'}, {'frequency': 'c', 'id': 315, 'synset': 'cornice.n.01', 'synonyms': ['cornice', 'valance', 'valance_board', 'pelmet'], 'def': 'a decorative framework to conceal curtain fixtures at the top of a window casing', 'name': 'cornice'}, {'frequency': 'r', 'id': 316, 'synset': 'cornmeal.n.01', 'synonyms': ['cornmeal'], 'def': 'coarsely ground corn', 'name': 'cornmeal'}, {'frequency': 'r', 'id': 317, 'synset': 'corset.n.01', 'synonyms': ['corset', 'girdle'], 'def': "a woman's close-fitting foundation garment", 'name': 'corset'}, {'frequency': 'r', 'id': 318, 'synset': 'cos.n.02', 'synonyms': ['romaine_lettuce'], 'def': 'lettuce with long dark-green leaves in a loosely packed elongated head', 'name': 'romaine_lettuce'}, {'frequency': 'c', 'id': 319, 'synset': 'costume.n.04', 'synonyms': ['costume'], 'def': 'the attire characteristic of a country or a time or a social class', 'name': 'costume'}, {'frequency': 'r', 'id': 320, 'synset': 'cougar.n.01', 'synonyms': ['cougar', 'puma', 'catamount', 'mountain_lion', 'panther'], 'def': 'large American feline resembling a lion', 'name': 'cougar'}, {'frequency': 'r', 'id': 321, 'synset': 'coverall.n.01', 'synonyms': ['coverall'], 'def': 'a loose-fitting protective garment that is worn over other clothing', 'name': 'coverall'}, {'frequency': 'r', 'id': 322, 'synset': 'cowbell.n.01', 'synonyms': ['cowbell'], 'def': 'a bell hung around the neck of cow so that the cow can be easily located', 'name': 'cowbell'}, {'frequency': 'f', 'id': 323, 'synset': 'cowboy_hat.n.01', 'synonyms': ['cowboy_hat', 'ten-gallon_hat'], 'def': 'a hat with a wide brim and a soft crown; worn by American ranch hands', 'name': 'cowboy_hat'}, {'frequency': 'r', 'id': 324, 'synset': 'crab.n.01', 'synonyms': ['crab_(animal)'], 'def': 'decapod having eyes on short stalks and a broad flattened shell and pincers', 'name': 'crab_(animal)'}, {'frequency': 'c', 'id': 325, 'synset': 'cracker.n.01', 'synonyms': ['cracker'], 'def': 'a thin crisp wafer', 'name': 'cracker'}, {'frequency': 'r', 'id': 326, 'synset': 'crape.n.01', 'synonyms': ['crape', 'crepe', 'French_pancake'], 'def': 'small very thin pancake', 'name': 'crape'}, {'frequency': 'f', 'id': 327, 'synset': 'crate.n.01', 'synonyms': ['crate'], 'def': 'a rugged box (usually made of wood); used for shipping', 'name': 'crate'}, {'frequency': 'r', 'id': 328, 'synset': 'crayon.n.01', 'synonyms': ['crayon', 'wax_crayon'], 'def': 'writing or drawing implement made of a colored stick of composition wax', 'name': 'crayon'}, {'frequency': 'r', 'id': 329, 'synset': 'cream_pitcher.n.01', 'synonyms': ['cream_pitcher'], 'def': 'a small pitcher for serving cream', 'name': 'cream_pitcher'}, {'frequency': 'r', 'id': 330, 'synset': 'credit_card.n.01', 'synonyms': ['credit_card', 'charge_card', 'debit_card'], 'def': 'a card, usually plastic, used to pay for goods and services', 'name': 'credit_card'}, {'frequency': 'c', 'id': 331, 'synset': 'crescent_roll.n.01', 'synonyms': ['crescent_roll', 'croissant'], 'def': 'very rich flaky crescent-shaped roll', 'name': 'crescent_roll'}, {'frequency': 'c', 'id': 332, 'synset': 'crib.n.01', 'synonyms': ['crib', 'cot'], 'def': 
'baby bed with high sides made of slats', 'name': 'crib'}, {'frequency': 'c', 'id': 333, 'synset': 'crock.n.03', 'synonyms': ['crock_pot', 'earthenware_jar'], 'def': 'an earthen jar (made of baked clay)', 'name': 'crock_pot'}, {'frequency': 'f', 'id': 334, 'synset': 'crossbar.n.01', 'synonyms': ['crossbar'], 'def': 'a horizontal bar that goes across something', 'name': 'crossbar'}, {'frequency': 'r', 'id': 335, 'synset': 'crouton.n.01', 'synonyms': ['crouton'], 'def': 'a small piece of toasted or fried bread; served in soup or salads', 'name': 'crouton'}, {'frequency': 'r', 'id': 336, 'synset': 'crow.n.01', 'synonyms': ['crow'], 'def': 'black birds having a raucous call', 'name': 'crow'}, {'frequency': 'c', 'id': 337, 'synset': 'crown.n.04', 'synonyms': ['crown'], 'def': 'an ornamental jeweled headdress signifying sovereignty', 'name': 'crown'}, {'frequency': 'c', 'id': 338, 'synset': 'crucifix.n.01', 'synonyms': ['crucifix'], 'def': 'representation of the cross on which Jesus died', 'name': 'crucifix'}, {'frequency': 'c', 'id': 339, 'synset': 'cruise_ship.n.01', 'synonyms': ['cruise_ship', 'cruise_liner'], 'def': 'a passenger ship used commercially for pleasure cruises', 'name': 'cruise_ship'}, {'frequency': 'c', 'id': 340, 'synset': 'cruiser.n.01', 'synonyms': ['police_cruiser', 'patrol_car', 'police_car', 'squad_car'], 'def': 'a car in which policemen cruise the streets', 'name': 'police_cruiser'}, {'frequency': 'c', 'id': 341, 'synset': 'crumb.n.03', 'synonyms': ['crumb'], 'def': 'small piece of e.g. bread or cake', 'name': 'crumb'}, {'frequency': 'r', 'id': 342, 'synset': 'crutch.n.01', 'synonyms': ['crutch'], 'def': 'a wooden or metal staff that fits under the armpit and reaches to the ground', 'name': 'crutch'}, {'frequency': 'c', 'id': 343, 'synset': 'cub.n.03', 'synonyms': ['cub_(animal)'], 'def': 'the young of certain carnivorous mammals such as the bear or wolf or lion', 'name': 'cub_(animal)'}, {'frequency': 'r', 'id': 344, 'synset': 'cube.n.05', 'synonyms': ['cube', 'square_block'], 'def': 'a block in the (approximate) shape of a cube', 'name': 'cube'}, {'frequency': 'f', 'id': 345, 'synset': 'cucumber.n.02', 'synonyms': ['cucumber', 'cuke'], 'def': 'cylindrical green fruit with thin green rind and white flesh eaten as a vegetable', 'name': 'cucumber'}, {'frequency': 'c', 'id': 346, 'synset': 'cufflink.n.01', 'synonyms': ['cufflink'], 'def': 'jewelry consisting of linked buttons used to fasten the cuffs of a shirt', 'name': 'cufflink'}, {'frequency': 'f', 'id': 347, 'synset': 'cup.n.01', 'synonyms': ['cup'], 'def': 'a small open container usually used for drinking; usually has a handle', 'name': 'cup'}, {'frequency': 'c', 'id': 348, 'synset': 'cup.n.08', 'synonyms': ['trophy_cup'], 'def': 'a metal vessel with handles that is awarded as a trophy to a competition winner', 'name': 'trophy_cup'}, {'frequency': 'c', 'id': 349, 'synset': 'cupcake.n.01', 'synonyms': ['cupcake'], 'def': 'small cake baked in a muffin tin', 'name': 'cupcake'}, {'frequency': 'r', 'id': 350, 'synset': 'curler.n.01', 'synonyms': ['hair_curler', 'hair_roller', 'hair_crimper'], 'def': 'a cylindrical tube around which the hair is wound to curl it', 'name': 'hair_curler'}, {'frequency': 'r', 'id': 351, 'synset': 'curling_iron.n.01', 'synonyms': ['curling_iron'], 'def': 'a cylindrical home appliance that heats hair that has been curled around it', 'name': 'curling_iron'}, {'frequency': 'f', 'id': 352, 'synset': 'curtain.n.01', 'synonyms': ['curtain', 'drapery'], 'def': 'hanging cloth used as a blind (especially 
for a window)', 'name': 'curtain'}, {'frequency': 'f', 'id': 353, 'synset': 'cushion.n.03', 'synonyms': ['cushion'], 'def': 'a soft bag filled with air or padding such as feathers or foam rubber', 'name': 'cushion'}, {'frequency': 'r', 'id': 354, 'synset': 'custard.n.01', 'synonyms': ['custard'], 'def': 'sweetened mixture of milk and eggs baked or boiled or frozen', 'name': 'custard'}, {'frequency': 'c', 'id': 355, 'synset': 'cutter.n.06', 'synonyms': ['cutting_tool'], 'def': 'a cutting implement; a tool for cutting', 'name': 'cutting_tool'}, {'frequency': 'r', 'id': 356, 'synset': 'cylinder.n.04', 'synonyms': ['cylinder'], 'def': 'a cylindrical container', 'name': 'cylinder'}, {'frequency': 'r', 'id': 357, 'synset': 'cymbal.n.01', 'synonyms': ['cymbal'], 'def': 'a percussion instrument consisting of a concave brass disk', 'name': 'cymbal'}, {'frequency': 'r', 'id': 358, 'synset': 'dachshund.n.01', 'synonyms': ['dachshund', 'dachsie', 'badger_dog'], 'def': 'small long-bodied short-legged breed of dog having a short sleek coat and long drooping ears', 'name': 'dachshund'}, {'frequency': 'r', 'id': 359, 'synset': 'dagger.n.01', 'synonyms': ['dagger'], 'def': 'a short knife with a pointed blade used for piercing or stabbing', 'name': 'dagger'}, {'frequency': 'r', 'id': 360, 'synset': 'dartboard.n.01', 'synonyms': ['dartboard'], 'def': 'a circular board of wood or cork used as the target in the game of darts', 'name': 'dartboard'}, {'frequency': 'r', 'id': 361, 'synset': 'date.n.08', 'synonyms': ['date_(fruit)'], 'def': 'sweet edible fruit of the date palm with a single long woody seed', 'name': 'date_(fruit)'}, {'frequency': 'f', 'id': 362, 'synset': 'deck_chair.n.01', 'synonyms': ['deck_chair', 'beach_chair'], 'def': 'a folding chair for use outdoors; a wooden frame supports a length of canvas', 'name': 'deck_chair'}, {'frequency': 'c', 'id': 363, 'synset': 'deer.n.01', 'synonyms': ['deer', 'cervid'], 'def': "distinguished from Bovidae by the male's having solid deciduous antlers", 'name': 'deer'}, {'frequency': 'c', 'id': 364, 'synset': 'dental_floss.n.01', 'synonyms': ['dental_floss', 'floss'], 'def': 'a soft thread for cleaning the spaces between the teeth', 'name': 'dental_floss'}, {'frequency': 'f', 'id': 365, 'synset': 'desk.n.01', 'synonyms': ['desk'], 'def': 'a piece of furniture with a writing surface and usually drawers or other compartments', 'name': 'desk'}, {'frequency': 'r', 'id': 366, 'synset': 'detergent.n.01', 'synonyms': ['detergent'], 'def': 'a surface-active chemical widely used in industry and laundering', 'name': 'detergent'}, {'frequency': 'c', 'id': 367, 'synset': 'diaper.n.01', 'synonyms': ['diaper'], 'def': 'garment consisting of a folded cloth drawn up between the legs and fastened at the waist', 'name': 'diaper'}, {'frequency': 'r', 'id': 368, 'synset': 'diary.n.01', 'synonyms': ['diary', 'journal'], 'def': 'a daily written record of (usually personal) experiences and observations', 'name': 'diary'}, {'frequency': 'r', 'id': 369, 'synset': 'die.n.01', 'synonyms': ['die', 'dice'], 'def': 'a small cube with 1 to 6 spots on the six faces; used in gambling', 'name': 'die'}, {'frequency': 'r', 'id': 370, 'synset': 'dinghy.n.01', 'synonyms': ['dinghy', 'dory', 'rowboat'], 'def': 'a small boat of shallow draft with seats and oars with which it is propelled', 'name': 'dinghy'}, {'frequency': 'f', 'id': 371, 'synset': 'dining_table.n.01', 'synonyms': ['dining_table'], 'def': 'a table at which meals are served', 'name': 'dining_table'}, {'frequency': 'r', 'id': 372, 
'synset': 'dinner_jacket.n.01', 'synonyms': ['tux', 'tuxedo'], 'def': 'semiformal evening dress for men', 'name': 'tux'}, {'frequency': 'c', 'id': 373, 'synset': 'dish.n.01', 'synonyms': ['dish'], 'def': 'a piece of dishware normally used as a container for holding or serving food', 'name': 'dish'}, {'frequency': 'c', 'id': 374, 'synset': 'dish.n.05', 'synonyms': ['dish_antenna'], 'def': 'directional antenna consisting of a parabolic reflector', 'name': 'dish_antenna'}, {'frequency': 'c', 'id': 375, 'synset': 'dishrag.n.01', 'synonyms': ['dishrag', 'dishcloth'], 'def': 'a cloth for washing dishes', 'name': 'dishrag'}, {'frequency': 'c', 'id': 376, 'synset': 'dishtowel.n.01', 'synonyms': ['dishtowel', 'tea_towel'], 'def': 'a towel for drying dishes', 'name': 'dishtowel'}, {'frequency': 'f', 'id': 377, 'synset': 'dishwasher.n.01', 'synonyms': ['dishwasher', 'dishwashing_machine'], 'def': 'a machine for washing dishes', 'name': 'dishwasher'}, {'frequency': 'r', 'id': 378, 'synset': 'dishwasher_detergent.n.01', 'synonyms': ['dishwasher_detergent', 'dishwashing_detergent', 'dishwashing_liquid'], 'def': 'a low-sudsing detergent designed for use in dishwashers', 'name': 'dishwasher_detergent'}, {'frequency': 'r', 'id': 379, 'synset': 'diskette.n.01', 'synonyms': ['diskette', 'floppy', 'floppy_disk'], 'def': 'a small plastic magnetic disk enclosed in a stiff envelope used to store data', 'name': 'diskette'}, {'frequency': 'c', 'id': 380, 'synset': 'dispenser.n.01', 'synonyms': ['dispenser'], 'def': 'a container so designed that the contents can be used in prescribed amounts', 'name': 'dispenser'}, {'frequency': 'c', 'id': 381, 'synset': 'dixie_cup.n.01', 'synonyms': ['Dixie_cup', 'paper_cup'], 'def': 'a disposable cup made of paper; for holding drinks', 'name': 'Dixie_cup'}, {'frequency': 'f', 'id': 382, 'synset': 'dog.n.01', 'synonyms': ['dog'], 'def': 'a common domesticated dog', 'name': 'dog'}, {'frequency': 'f', 'id': 383, 'synset': 'dog_collar.n.01', 'synonyms': ['dog_collar'], 'def': 'a collar for a dog', 'name': 'dog_collar'}, {'frequency': 'c', 'id': 384, 'synset': 'doll.n.01', 'synonyms': ['doll'], 'def': 'a toy replica of a HUMAN (NOT AN ANIMAL)', 'name': 'doll'}, {'frequency': 'r', 'id': 385, 'synset': 'dollar.n.02', 'synonyms': ['dollar', 'dollar_bill', 'one_dollar_bill'], 'def': 'a piece of paper money worth one dollar', 'name': 'dollar'}, {'frequency': 'r', 'id': 386, 'synset': 'dolphin.n.02', 'synonyms': ['dolphin'], 'def': 'any of various small toothed whales with a beaklike snout; larger than porpoises', 'name': 'dolphin'}, {'frequency': 'c', 'id': 387, 'synset': 'domestic_ass.n.01', 'synonyms': ['domestic_ass', 'donkey'], 'def': 'domestic beast of burden descended from the African wild ass; patient but stubborn', 'name': 'domestic_ass'}, {'frequency': 'r', 'id': 388, 'synset': 'domino.n.03', 'synonyms': ['eye_mask'], 'def': 'a mask covering the upper part of the face but with holes for the eyes', 'name': 'eye_mask'}, {'frequency': 'r', 'id': 389, 'synset': 'doorbell.n.01', 'synonyms': ['doorbell', 'buzzer'], 'def': 'a button at an outer door that gives a ringing or buzzing signal when pushed', 'name': 'doorbell'}, {'frequency': 'f', 'id': 390, 'synset': 'doorknob.n.01', 'synonyms': ['doorknob', 'doorhandle'], 'def': "a knob used to open a door (often called `doorhandle' in Great Britain)", 'name': 'doorknob'}, {'frequency': 'c', 'id': 391, 'synset': 'doormat.n.02', 'synonyms': ['doormat', 'welcome_mat'], 'def': 'a mat placed outside an exterior door for wiping the shoes before 
entering', 'name': 'doormat'}, {'frequency': 'f', 'id': 392, 'synset': 'doughnut.n.02', 'synonyms': ['doughnut', 'donut'], 'def': 'a small ring-shaped friedcake', 'name': 'doughnut'}, {'frequency': 'r', 'id': 393, 'synset': 'dove.n.01', 'synonyms': ['dove'], 'def': 'any of numerous small pigeons', 'name': 'dove'}, {'frequency': 'r', 'id': 394, 'synset': 'dragonfly.n.01', 'synonyms': ['dragonfly'], 'def': 'slender-bodied non-stinging insect having iridescent wings that are outspread at rest', 'name': 'dragonfly'}, {'frequency': 'f', 'id': 395, 'synset': 'drawer.n.01', 'synonyms': ['drawer'], 'def': 'a boxlike container in a piece of furniture; made so as to slide in and out', 'name': 'drawer'}, {'frequency': 'c', 'id': 396, 'synset': 'drawers.n.01', 'synonyms': ['underdrawers', 'boxers', 'boxershorts'], 'def': 'underpants worn by men', 'name': 'underdrawers'}, {'frequency': 'f', 'id': 397, 'synset': 'dress.n.01', 'synonyms': ['dress', 'frock'], 'def': 'a one-piece garment for a woman; has skirt and bodice', 'name': 'dress'}, {'frequency': 'c', 'id': 398, 'synset': 'dress_hat.n.01', 'synonyms': ['dress_hat', 'high_hat', 'opera_hat', 'silk_hat', 'top_hat'], 'def': "a man's hat with a tall crown; usually covered with silk or with beaver fur", 'name': 'dress_hat'}, {'frequency': 'c', 'id': 399, 'synset': 'dress_suit.n.01', 'synonyms': ['dress_suit'], 'def': 'formalwear consisting of full evening dress for men', 'name': 'dress_suit'}, {'frequency': 'c', 'id': 400, 'synset': 'dresser.n.05', 'synonyms': ['dresser'], 'def': 'a cabinet with shelves', 'name': 'dresser'}, {'frequency': 'c', 'id': 401, 'synset': 'drill.n.01', 'synonyms': ['drill'], 'def': 'a tool with a sharp rotating point for making holes in hard materials', 'name': 'drill'}, {'frequency': 'r', 'id': 402, 'synset': 'drinking_fountain.n.01', 'synonyms': ['drinking_fountain'], 'def': 'a public fountain to provide a jet of drinking water', 'name': 'drinking_fountain'}, {'frequency': 'r', 'id': 403, 'synset': 'drone.n.04', 'synonyms': ['drone'], 'def': 'an aircraft without a pilot that is operated by remote control', 'name': 'drone'}, {'frequency': 'r', 'id': 404, 'synset': 'dropper.n.01', 'synonyms': ['dropper', 'eye_dropper'], 'def': 'pipet consisting of a small tube with a vacuum bulb at one end for drawing liquid in and releasing it a drop at a time', 'name': 'dropper'}, {'frequency': 'c', 'id': 405, 'synset': 'drum.n.01', 'synonyms': ['drum_(musical_instrument)'], 'def': 'a musical percussion instrument; usually consists of a hollow cylinder with a membrane stretched across each end', 'name': 'drum_(musical_instrument)'}, {'frequency': 'r', 'id': 406, 'synset': 'drumstick.n.02', 'synonyms': ['drumstick'], 'def': 'a stick used for playing a drum', 'name': 'drumstick'}, {'frequency': 'f', 'id': 407, 'synset': 'duck.n.01', 'synonyms': ['duck'], 'def': 'small web-footed broad-billed swimming bird', 'name': 'duck'}, {'frequency': 'r', 'id': 408, 'synset': 'duckling.n.02', 'synonyms': ['duckling'], 'def': 'young duck', 'name': 'duckling'}, {'frequency': 'c', 'id': 409, 'synset': 'duct_tape.n.01', 'synonyms': ['duct_tape'], 'def': 'a wide silvery adhesive tape', 'name': 'duct_tape'}, {'frequency': 'f', 'id': 410, 'synset': 'duffel_bag.n.01', 'synonyms': ['duffel_bag', 'duffle_bag', 'duffel', 'duffle'], 'def': 'a large cylindrical bag of heavy cloth', 'name': 'duffel_bag'}, {'frequency': 'r', 'id': 411, 'synset': 'dumbbell.n.01', 'synonyms': ['dumbbell'], 'def': 'an exercising weight with two ball-like ends connected by a short handle', 
'name': 'dumbbell'}, {'frequency': 'c', 'id': 412, 'synset': 'dumpster.n.01', 'synonyms': ['dumpster'], 'def': 'a container designed to receive and transport and dump waste', 'name': 'dumpster'}, {'frequency': 'r', 'id': 413, 'synset': 'dustpan.n.02', 'synonyms': ['dustpan'], 'def': 'a short-handled receptacle into which dust can be swept', 'name': 'dustpan'}, {'frequency': 'r', 'id': 414, 'synset': 'dutch_oven.n.02', 'synonyms': ['Dutch_oven'], 'def': 'iron or earthenware cooking pot; used for stews', 'name': 'Dutch_oven'}, {'frequency': 'c', 'id': 415, 'synset': 'eagle.n.01', 'synonyms': ['eagle'], 'def': 'large birds of prey noted for their broad wings and strong soaring flight', 'name': 'eagle'}, {'frequency': 'f', 'id': 416, 'synset': 'earphone.n.01', 'synonyms': ['earphone', 'earpiece', 'headphone'], 'def': 'device for listening to audio that is held over or inserted into the ear', 'name': 'earphone'}, {'frequency': 'r', 'id': 417, 'synset': 'earplug.n.01', 'synonyms': ['earplug'], 'def': 'a soft plug that is inserted into the ear canal to block sound', 'name': 'earplug'}, {'frequency': 'f', 'id': 418, 'synset': 'earring.n.01', 'synonyms': ['earring'], 'def': 'jewelry to ornament the ear', 'name': 'earring'}, {'frequency': 'c', 'id': 419, 'synset': 'easel.n.01', 'synonyms': ['easel'], 'def': "an upright tripod for displaying something (usually an artist's canvas)", 'name': 'easel'}, {'frequency': 'r', 'id': 420, 'synset': 'eclair.n.01', 'synonyms': ['eclair'], 'def': 'oblong cream puff', 'name': 'eclair'}, {'frequency': 'r', 'id': 421, 'synset': 'eel.n.01', 'synonyms': ['eel'], 'def': 'an elongate fish with fatty flesh', 'name': 'eel'}, {'frequency': 'f', 'id': 422, 'synset': 'egg.n.02', 'synonyms': ['egg', 'eggs'], 'def': 'oval reproductive body of a fowl (especially a hen) used as food', 'name': 'egg'}, {'frequency': 'r', 'id': 423, 'synset': 'egg_roll.n.01', 'synonyms': ['egg_roll', 'spring_roll'], 'def': 'minced vegetables and meat wrapped in a pancake and fried', 'name': 'egg_roll'}, {'frequency': 'c', 'id': 424, 'synset': 'egg_yolk.n.01', 'synonyms': ['egg_yolk', 'yolk_(egg)'], 'def': 'the yellow spherical part of an egg', 'name': 'egg_yolk'}, {'frequency': 'c', 'id': 425, 'synset': 'eggbeater.n.02', 'synonyms': ['eggbeater', 'eggwhisk'], 'def': 'a mixer for beating eggs or whipping cream', 'name': 'eggbeater'}, {'frequency': 'c', 'id': 426, 'synset': 'eggplant.n.01', 'synonyms': ['eggplant', 'aubergine'], 'def': 'egg-shaped vegetable having a shiny skin typically dark purple', 'name': 'eggplant'}, {'frequency': 'r', 'id': 427, 'synset': 'electric_chair.n.01', 'synonyms': ['electric_chair'], 'def': 'a chair-shaped instrument of execution by electrocution', 'name': 'electric_chair'}, {'frequency': 'f', 'id': 428, 'synset': 'electric_refrigerator.n.01', 'synonyms': ['refrigerator'], 'def': 'a refrigerator in which the coolant is pumped around by an electric motor', 'name': 'refrigerator'}, {'frequency': 'f', 'id': 429, 'synset': 'elephant.n.01', 'synonyms': ['elephant'], 'def': 'a common elephant', 'name': 'elephant'}, {'frequency': 'r', 'id': 430, 'synset': 'elk.n.01', 'synonyms': ['elk', 'moose'], 'def': 'large northern deer with enormous flattened antlers in the male', 'name': 'elk'}, {'frequency': 'c', 'id': 431, 'synset': 'envelope.n.01', 'synonyms': ['envelope'], 'def': 'a flat (usually rectangular) container for a letter, thin package, etc.', 'name': 'envelope'}, {'frequency': 'c', 'id': 432, 'synset': 'eraser.n.01', 'synonyms': ['eraser'], 'def': 'an implement used to 
erase something', 'name': 'eraser'}, {'frequency': 'r', 'id': 433, 'synset': 'escargot.n.01', 'synonyms': ['escargot'], 'def': 'edible snail usually served in the shell with a sauce of melted butter and garlic', 'name': 'escargot'}, {'frequency': 'r', 'id': 434, 'synset': 'eyepatch.n.01', 'synonyms': ['eyepatch'], 'def': 'a protective cloth covering for an injured eye', 'name': 'eyepatch'}, {'frequency': 'r', 'id': 435, 'synset': 'falcon.n.01', 'synonyms': ['falcon'], 'def': 'birds of prey having long pointed powerful wings adapted for swift flight', 'name': 'falcon'}, {'frequency': 'f', 'id': 436, 'synset': 'fan.n.01', 'synonyms': ['fan'], 'def': 'a device for creating a current of air by movement of a surface or surfaces', 'name': 'fan'}, {'frequency': 'f', 'id': 437, 'synset': 'faucet.n.01', 'synonyms': ['faucet', 'spigot', 'tap'], 'def': 'a regulator for controlling the flow of a liquid from a reservoir', 'name': 'faucet'}, {'frequency': 'r', 'id': 438, 'synset': 'fedora.n.01', 'synonyms': ['fedora'], 'def': 'a hat made of felt with a creased crown', 'name': 'fedora'}, {'frequency': 'r', 'id': 439, 'synset': 'ferret.n.02', 'synonyms': ['ferret'], 'def': 'domesticated albino variety of the European polecat bred for hunting rats and rabbits', 'name': 'ferret'}, {'frequency': 'c', 'id': 440, 'synset': 'ferris_wheel.n.01', 'synonyms': ['Ferris_wheel'], 'def': 'a large wheel with suspended seats that remain upright as the wheel rotates', 'name': 'Ferris_wheel'}, {'frequency': 'r', 'id': 441, 'synset': 'ferry.n.01', 'synonyms': ['ferry', 'ferryboat'], 'def': 'a boat that transports people or vehicles across a body of water and operates on a regular schedule', 'name': 'ferry'}, {'frequency': 'r', 'id': 442, 'synset': 'fig.n.04', 'synonyms': ['fig_(fruit)'], 'def': 'fleshy sweet pear-shaped yellowish or purple fruit eaten fresh or preserved or dried', 'name': 'fig_(fruit)'}, {'frequency': 'c', 'id': 443, 'synset': 'fighter.n.02', 'synonyms': ['fighter_jet', 'fighter_aircraft', 'attack_aircraft'], 'def': 'a high-speed military or naval airplane designed to destroy enemy targets', 'name': 'fighter_jet'}, {'frequency': 'f', 'id': 444, 'synset': 'figurine.n.01', 'synonyms': ['figurine'], 'def': 'a small carved or molded figure', 'name': 'figurine'}, {'frequency': 'c', 'id': 445, 'synset': 'file.n.03', 'synonyms': ['file_cabinet', 'filing_cabinet'], 'def': 'office furniture consisting of a container for keeping papers in order', 'name': 'file_cabinet'}, {'frequency': 'r', 'id': 446, 'synset': 'file.n.04', 'synonyms': ['file_(tool)'], 'def': 'a steel hand tool with small sharp teeth on some or all of its surfaces; used for smoothing wood or metal', 'name': 'file_(tool)'}, {'frequency': 'f', 'id': 447, 'synset': 'fire_alarm.n.02', 'synonyms': ['fire_alarm', 'smoke_alarm'], 'def': 'an alarm that is tripped off by fire or smoke', 'name': 'fire_alarm'}, {'frequency': 'c', 'id': 448, 'synset': 'fire_engine.n.01', 'synonyms': ['fire_engine', 'fire_truck'], 'def': 'large trucks that carry firefighters and equipment to the site of a fire', 'name': 'fire_engine'}, {'frequency': 'c', 'id': 449, 'synset': 'fire_extinguisher.n.01', 'synonyms': ['fire_extinguisher', 'extinguisher'], 'def': 'a manually operated device for extinguishing small fires', 'name': 'fire_extinguisher'}, {'frequency': 'c', 'id': 450, 'synset': 'fire_hose.n.01', 'synonyms': ['fire_hose'], 'def': 'a large hose that carries water from a fire hydrant to the site of the fire', 'name': 'fire_hose'}, {'frequency': 'f', 'id': 451, 'synset': 
'fireplace.n.01', 'synonyms': ['fireplace'], 'def': 'an open recess in a wall at the base of a chimney where a fire can be built', 'name': 'fireplace'}, {'frequency': 'f', 'id': 452, 'synset': 'fireplug.n.01', 'synonyms': ['fireplug', 'fire_hydrant', 'hydrant'], 'def': 'an upright hydrant for drawing water to use in fighting a fire', 'name': 'fireplug'}, {'frequency': 'c', 'id': 453, 'synset': 'fish.n.01', 'synonyms': ['fish'], 'def': 'any of various mostly cold-blooded aquatic vertebrates usually having scales and breathing through gills', 'name': 'fish'}, {'frequency': 'r', 'id': 454, 'synset': 'fish.n.02', 'synonyms': ['fish_(food)'], 'def': 'the flesh of fish used as food', 'name': 'fish_(food)'}, {'frequency': 'r', 'id': 455, 'synset': 'fishbowl.n.02', 'synonyms': ['fishbowl', 'goldfish_bowl'], 'def': 'a transparent bowl in which small fish are kept', 'name': 'fishbowl'}, {'frequency': 'r', 'id': 456, 'synset': 'fishing_boat.n.01', 'synonyms': ['fishing_boat', 'fishing_vessel'], 'def': 'a vessel for fishing', 'name': 'fishing_boat'}, {'frequency': 'c', 'id': 457, 'synset': 'fishing_rod.n.01', 'synonyms': ['fishing_rod', 'fishing_pole'], 'def': 'a rod that is used in fishing to extend the fishing line', 'name': 'fishing_rod'}, {'frequency': 'f', 'id': 458, 'synset': 'flag.n.01', 'synonyms': ['flag'], 'def': 'emblem usually consisting of a rectangular piece of cloth of distinctive design (do not include pole)', 'name': 'flag'}, {'frequency': 'f', 'id': 459, 'synset': 'flagpole.n.02', 'synonyms': ['flagpole', 'flagstaff'], 'def': 'a tall staff or pole on which a flag is raised', 'name': 'flagpole'}, {'frequency': 'c', 'id': 460, 'synset': 'flamingo.n.01', 'synonyms': ['flamingo'], 'def': 'large pink web-footed bird with down-bent bill', 'name': 'flamingo'}, {'frequency': 'c', 'id': 461, 'synset': 'flannel.n.01', 'synonyms': ['flannel'], 'def': 'a soft light woolen fabric; used for clothing', 'name': 'flannel'}, {'frequency': 'r', 'id': 462, 'synset': 'flash.n.10', 'synonyms': ['flash', 'flashbulb'], 'def': 'a lamp for providing momentary light to take a photograph', 'name': 'flash'}, {'frequency': 'c', 'id': 463, 'synset': 'flashlight.n.01', 'synonyms': ['flashlight', 'torch'], 'def': 'a small portable battery-powered electric lamp', 'name': 'flashlight'}, {'frequency': 'r', 'id': 464, 'synset': 'fleece.n.03', 'synonyms': ['fleece'], 'def': 'a soft bulky fabric with deep pile; used chiefly for clothing', 'name': 'fleece'}, {'frequency': 'f', 'id': 465, 'synset': 'flip-flop.n.02', 'synonyms': ['flip-flop_(sandal)'], 'def': 'a backless sandal held to the foot by a thong between two toes', 'name': 'flip-flop_(sandal)'}, {'frequency': 'c', 'id': 466, 'synset': 'flipper.n.01', 'synonyms': ['flipper_(footwear)', 'fin_(footwear)'], 'def': 'a shoe to aid a person in swimming', 'name': 'flipper_(footwear)'}, {'frequency': 'f', 'id': 467, 'synset': 'flower_arrangement.n.01', 'synonyms': ['flower_arrangement', 'floral_arrangement'], 'def': 'a decorative arrangement of flowers', 'name': 'flower_arrangement'}, {'frequency': 'c', 'id': 468, 'synset': 'flute.n.02', 'synonyms': ['flute_glass', 'champagne_flute'], 'def': 'a tall narrow wineglass', 'name': 'flute_glass'}, {'frequency': 'r', 'id': 469, 'synset': 'foal.n.01', 'synonyms': ['foal'], 'def': 'a young horse', 'name': 'foal'}, {'frequency': 'c', 'id': 470, 'synset': 'folding_chair.n.01', 'synonyms': ['folding_chair'], 'def': 'a chair that can be folded flat for storage', 'name': 'folding_chair'}, {'frequency': 'c', 'id': 471, 'synset': 
'food_processor.n.01', 'synonyms': ['food_processor'], 'def': 'a kitchen appliance for shredding, blending, chopping, or slicing food', 'name': 'food_processor'}, {'frequency': 'c', 'id': 472, 'synset': 'football.n.02', 'synonyms': ['football_(American)'], 'def': 'the inflated oblong ball used in playing American football', 'name': 'football_(American)'}, {'frequency': 'r', 'id': 473, 'synset': 'football_helmet.n.01', 'synonyms': ['football_helmet'], 'def': 'a padded helmet with a face mask to protect the head of football players', 'name': 'football_helmet'}, {'frequency': 'c', 'id': 474, 'synset': 'footstool.n.01', 'synonyms': ['footstool', 'footrest'], 'def': 'a low seat or a stool to rest the feet of a seated person', 'name': 'footstool'}, {'frequency': 'f', 'id': 475, 'synset': 'fork.n.01', 'synonyms': ['fork'], 'def': 'cutlery used for serving and eating food', 'name': 'fork'}, {'frequency': 'r', 'id': 476, 'synset': 'forklift.n.01', 'synonyms': ['forklift'], 'def': 'an industrial vehicle with a power operated fork in front that can be inserted under loads to lift and move them', 'name': 'forklift'}, {'frequency': 'r', 'id': 477, 'synset': 'freight_car.n.01', 'synonyms': ['freight_car'], 'def': 'a railway car that carries freight', 'name': 'freight_car'}, {'frequency': 'r', 'id': 478, 'synset': 'french_toast.n.01', 'synonyms': ['French_toast'], 'def': 'bread slice dipped in egg and milk and fried', 'name': 'French_toast'}, {'frequency': 'c', 'id': 479, 'synset': 'freshener.n.01', 'synonyms': ['freshener', 'air_freshener'], 'def': 'anything that freshens', 'name': 'freshener'}, {'frequency': 'f', 'id': 480, 'synset': 'frisbee.n.01', 'synonyms': ['frisbee'], 'def': 'a light, plastic disk propelled with a flip of the wrist for recreation or competition', 'name': 'frisbee'}, {'frequency': 'c', 'id': 481, 'synset': 'frog.n.01', 'synonyms': ['frog', 'toad', 'toad_frog'], 'def': 'a tailless stout-bodied amphibians with long hind limbs for leaping', 'name': 'frog'}, {'frequency': 'c', 'id': 482, 'synset': 'fruit_juice.n.01', 'synonyms': ['fruit_juice'], 'def': 'drink produced by squeezing or crushing fruit', 'name': 'fruit_juice'}, {'frequency': 'r', 'id': 483, 'synset': 'fruit_salad.n.01', 'synonyms': ['fruit_salad'], 'def': 'salad composed of fruits', 'name': 'fruit_salad'}, {'frequency': 'c', 'id': 484, 'synset': 'frying_pan.n.01', 'synonyms': ['frying_pan', 'frypan', 'skillet'], 'def': 'a pan used for frying foods', 'name': 'frying_pan'}, {'frequency': 'r', 'id': 485, 'synset': 'fudge.n.01', 'synonyms': ['fudge'], 'def': 'soft creamy candy', 'name': 'fudge'}, {'frequency': 'r', 'id': 486, 'synset': 'funnel.n.02', 'synonyms': ['funnel'], 'def': 'a cone-shaped utensil used to channel a substance into a container with a small mouth', 'name': 'funnel'}, {'frequency': 'c', 'id': 487, 'synset': 'futon.n.01', 'synonyms': ['futon'], 'def': 'a pad that is used for sleeping on the floor or on a raised frame', 'name': 'futon'}, {'frequency': 'r', 'id': 488, 'synset': 'gag.n.02', 'synonyms': ['gag', 'muzzle'], 'def': "restraint put into a person's mouth to prevent speaking or shouting", 'name': 'gag'}, {'frequency': 'r', 'id': 489, 'synset': 'garbage.n.03', 'synonyms': ['garbage'], 'def': 'a receptacle where waste can be discarded', 'name': 'garbage'}, {'frequency': 'c', 'id': 490, 'synset': 'garbage_truck.n.01', 'synonyms': ['garbage_truck'], 'def': 'a truck for collecting domestic refuse', 'name': 'garbage_truck'}, {'frequency': 'c', 'id': 491, 'synset': 'garden_hose.n.01', 'synonyms': 
['garden_hose'], 'def': 'a hose used for watering a lawn or garden', 'name': 'garden_hose'}, {'frequency': 'c', 'id': 492, 'synset': 'gargle.n.01', 'synonyms': ['gargle', 'mouthwash'], 'def': 'a medicated solution used for gargling and rinsing the mouth', 'name': 'gargle'}, {'frequency': 'r', 'id': 493, 'synset': 'gargoyle.n.02', 'synonyms': ['gargoyle'], 'def': 'an ornament consisting of a grotesquely carved figure of a person or animal', 'name': 'gargoyle'}, {'frequency': 'c', 'id': 494, 'synset': 'garlic.n.02', 'synonyms': ['garlic', 'ail'], 'def': 'aromatic bulb used as seasoning', 'name': 'garlic'}, {'frequency': 'r', 'id': 495, 'synset': 'gasmask.n.01', 'synonyms': ['gasmask', 'respirator', 'gas_helmet'], 'def': 'a protective face mask with a filter', 'name': 'gasmask'}, {'frequency': 'r', 'id': 496, 'synset': 'gazelle.n.01', 'synonyms': ['gazelle'], 'def': 'small swift graceful antelope of Africa and Asia having lustrous eyes', 'name': 'gazelle'}, {'frequency': 'c', 'id': 497, 'synset': 'gelatin.n.02', 'synonyms': ['gelatin', 'jelly'], 'def': 'an edible jelly made with gelatin and used as a dessert or salad base or a coating for foods', 'name': 'gelatin'}, {'frequency': 'r', 'id': 498, 'synset': 'gem.n.02', 'synonyms': ['gemstone'], 'def': 'a crystalline rock that can be cut and polished for jewelry', 'name': 'gemstone'}, {'frequency': 'c', 'id': 499, 'synset': 'giant_panda.n.01', 'synonyms': ['giant_panda', 'panda', 'panda_bear'], 'def': 'large black-and-white herbivorous mammal of bamboo forests of China and Tibet', 'name': 'giant_panda'}, {'frequency': 'c', 'id': 500, 'synset': 'gift_wrap.n.01', 'synonyms': ['gift_wrap'], 'def': 'attractive wrapping paper suitable for wrapping gifts', 'name': 'gift_wrap'}, {'frequency': 'c', 'id': 501, 'synset': 'ginger.n.03', 'synonyms': ['ginger', 'gingerroot'], 'def': 'the root of the common ginger plant; used fresh as a seasoning', 'name': 'ginger'}, {'frequency': 'f', 'id': 502, 'synset': 'giraffe.n.01', 'synonyms': ['giraffe'], 'def': 'tall animal having a spotted coat and small horns and very long neck and legs', 'name': 'giraffe'}, {'frequency': 'c', 'id': 503, 'synset': 'girdle.n.02', 'synonyms': ['cincture', 'sash', 'waistband', 'waistcloth'], 'def': 'a band of material around the waist that strengthens a skirt or trousers', 'name': 'cincture'}, {'frequency': 'f', 'id': 504, 'synset': 'glass.n.02', 'synonyms': ['glass_(drink_container)', 'drinking_glass'], 'def': 'a container for holding liquids while drinking', 'name': 'glass_(drink_container)'}, {'frequency': 'c', 'id': 505, 'synset': 'globe.n.03', 'synonyms': ['globe'], 'def': 'a sphere on which a map (especially of the earth) is represented', 'name': 'globe'}, {'frequency': 'f', 'id': 506, 'synset': 'glove.n.02', 'synonyms': ['glove'], 'def': 'handwear covering the hand', 'name': 'glove'}, {'frequency': 'c', 'id': 507, 'synset': 'goat.n.01', 'synonyms': ['goat'], 'def': 'a common goat', 'name': 'goat'}, {'frequency': 'f', 'id': 508, 'synset': 'goggles.n.01', 'synonyms': ['goggles'], 'def': 'tight-fitting spectacles worn to protect the eyes', 'name': 'goggles'}, {'frequency': 'r', 'id': 509, 'synset': 'goldfish.n.01', 'synonyms': ['goldfish'], 'def': 'small golden or orange-red freshwater fishes used as pond or aquarium pets', 'name': 'goldfish'}, {'frequency': 'r', 'id': 510, 'synset': 'golf_club.n.02', 'synonyms': ['golf_club', 'golf-club'], 'def': 'golf equipment used by a golfer to hit a golf ball', 'name': 'golf_club'}, {'frequency': 'c', 'id': 511, 'synset': 'golfcart.n.01', 
'synonyms': ['golfcart'], 'def': 'a small motor vehicle in which golfers can ride between shots', 'name': 'golfcart'}, {'frequency': 'r', 'id': 512, 'synset': 'gondola.n.02', 'synonyms': ['gondola_(boat)'], 'def': 'long narrow flat-bottomed boat propelled by sculling; traditionally used on canals of Venice', 'name': 'gondola_(boat)'}, {'frequency': 'c', 'id': 513, 'synset': 'goose.n.01', 'synonyms': ['goose'], 'def': 'loud, web-footed long-necked aquatic birds usually larger than ducks', 'name': 'goose'}, {'frequency': 'r', 'id': 514, 'synset': 'gorilla.n.01', 'synonyms': ['gorilla'], 'def': 'largest ape', 'name': 'gorilla'}, {'frequency': 'r', 'id': 515, 'synset': 'gourd.n.02', 'synonyms': ['gourd'], 'def': 'any of numerous inedible fruits with hard rinds', 'name': 'gourd'}, {'frequency': 'r', 'id': 516, 'synset': 'gown.n.04', 'synonyms': ['surgical_gown', 'scrubs_(surgical_clothing)'], 'def': 'protective garment worn by surgeons during operations', 'name': 'surgical_gown'}, {'frequency': 'f', 'id': 517, 'synset': 'grape.n.01', 'synonyms': ['grape'], 'def': 'any of various juicy fruit with green or purple skins; grow in clusters', 'name': 'grape'}, {'frequency': 'r', 'id': 518, 'synset': 'grasshopper.n.01', 'synonyms': ['grasshopper'], 'def': 'plant-eating insect with hind legs adapted for leaping', 'name': 'grasshopper'}, {'frequency': 'c', 'id': 519, 'synset': 'grater.n.01', 'synonyms': ['grater'], 'def': 'utensil with sharp perforations for shredding foods (as vegetables or cheese)', 'name': 'grater'}, {'frequency': 'c', 'id': 520, 'synset': 'gravestone.n.01', 'synonyms': ['gravestone', 'headstone', 'tombstone'], 'def': 'a stone that is used to mark a grave', 'name': 'gravestone'}, {'frequency': 'r', 'id': 521, 'synset': 'gravy_boat.n.01', 'synonyms': ['gravy_boat', 'gravy_holder'], 'def': 'a dish (often boat-shaped) for serving gravy or sauce', 'name': 'gravy_boat'}, {'frequency': 'c', 'id': 522, 'synset': 'green_bean.n.02', 'synonyms': ['green_bean'], 'def': 'a common bean plant cultivated for its slender green edible pods', 'name': 'green_bean'}, {'frequency': 'c', 'id': 523, 'synset': 'green_onion.n.01', 'synonyms': ['green_onion', 'spring_onion', 'scallion'], 'def': 'a young onion before the bulb has enlarged', 'name': 'green_onion'}, {'frequency': 'r', 'id': 524, 'synset': 'griddle.n.01', 'synonyms': ['griddle'], 'def': 'cooking utensil consisting of a flat heated surface on which food is cooked', 'name': 'griddle'}, {'frequency': 'r', 'id': 525, 'synset': 'grillroom.n.01', 'synonyms': ['grillroom', 'grill_(restaurant)'], 'def': 'a restaurant where food is cooked on a grill', 'name': 'grillroom'}, {'frequency': 'r', 'id': 526, 'synset': 'grinder.n.04', 'synonyms': ['grinder_(tool)'], 'def': 'a machine tool that polishes metal', 'name': 'grinder_(tool)'}, {'frequency': 'r', 'id': 527, 'synset': 'grits.n.01', 'synonyms': ['grits', 'hominy_grits'], 'def': 'coarsely ground corn boiled as a breakfast dish', 'name': 'grits'}, {'frequency': 'c', 'id': 528, 'synset': 'grizzly.n.01', 'synonyms': ['grizzly', 'grizzly_bear'], 'def': 'powerful brownish-yellow bear of the uplands of western North America', 'name': 'grizzly'}, {'frequency': 'c', 'id': 529, 'synset': 'grocery_bag.n.01', 'synonyms': ['grocery_bag'], 'def': "a sack for holding customer's groceries", 'name': 'grocery_bag'}, {'frequency': 'r', 'id': 530, 'synset': 'guacamole.n.01', 'synonyms': ['guacamole'], 'def': 'a dip made of mashed avocado mixed with chopped onions and other seasonings', 'name': 'guacamole'}, {'frequency': 
'f', 'id': 531, 'synset': 'guitar.n.01', 'synonyms': ['guitar'], 'def': 'a stringed instrument usually having six strings; played by strumming or plucking', 'name': 'guitar'}, {'frequency': 'c', 'id': 532, 'synset': 'gull.n.02', 'synonyms': ['gull', 'seagull'], 'def': 'mostly white aquatic bird having long pointed wings and short legs', 'name': 'gull'}, {'frequency': 'c', 'id': 533, 'synset': 'gun.n.01', 'synonyms': ['gun'], 'def': 'a weapon that discharges a bullet at high velocity from a metal tube', 'name': 'gun'}, {'frequency': 'r', 'id': 534, 'synset': 'hair_spray.n.01', 'synonyms': ['hair_spray'], 'def': 'substance sprayed on the hair to hold it in place', 'name': 'hair_spray'}, {'frequency': 'c', 'id': 535, 'synset': 'hairbrush.n.01', 'synonyms': ['hairbrush'], 'def': "a brush used to groom a person's hair", 'name': 'hairbrush'}, {'frequency': 'c', 'id': 536, 'synset': 'hairnet.n.01', 'synonyms': ['hairnet'], 'def': 'a small net that someone wears over their hair to keep it in place', 'name': 'hairnet'}, {'frequency': 'c', 'id': 537, 'synset': 'hairpin.n.01', 'synonyms': ['hairpin'], 'def': "a double pronged pin used to hold women's hair in place", 'name': 'hairpin'}, {'frequency': 'f', 'id': 538, 'synset': 'ham.n.01', 'synonyms': ['ham', 'jambon', 'gammon'], 'def': 'meat cut from the thigh of a hog (usually smoked)', 'name': 'ham'}, {'frequency': 'c', 'id': 539, 'synset': 'hamburger.n.01', 'synonyms': ['hamburger', 'beefburger', 'burger'], 'def': 'a sandwich consisting of a patty of minced beef served on a bun', 'name': 'hamburger'}, {'frequency': 'c', 'id': 540, 'synset': 'hammer.n.02', 'synonyms': ['hammer'], 'def': 'a hand tool with a heavy head and a handle; used to deliver an impulsive force by striking', 'name': 'hammer'}, {'frequency': 'r', 'id': 541, 'synset': 'hammock.n.02', 'synonyms': ['hammock'], 'def': 'a hanging bed of canvas or rope netting (usually suspended between two trees)', 'name': 'hammock'}, {'frequency': 'r', 'id': 542, 'synset': 'hamper.n.02', 'synonyms': ['hamper'], 'def': 'a basket usually with a cover', 'name': 'hamper'}, {'frequency': 'r', 'id': 543, 'synset': 'hamster.n.01', 'synonyms': ['hamster'], 'def': 'short-tailed burrowing rodent with large cheek pouches', 'name': 'hamster'}, {'frequency': 'c', 'id': 544, 'synset': 'hand_blower.n.01', 'synonyms': ['hair_dryer'], 'def': 'a hand-held electric blower that can blow warm air onto the hair', 'name': 'hair_dryer'}, {'frequency': 'r', 'id': 545, 'synset': 'hand_glass.n.01', 'synonyms': ['hand_glass', 'hand_mirror'], 'def': 'a mirror intended to be held in the hand', 'name': 'hand_glass'}, {'frequency': 'f', 'id': 546, 'synset': 'hand_towel.n.01', 'synonyms': ['hand_towel', 'face_towel'], 'def': 'a small towel used to dry the hands or face', 'name': 'hand_towel'}, {'frequency': 'c', 'id': 547, 'synset': 'handcart.n.01', 'synonyms': ['handcart', 'pushcart', 'hand_truck'], 'def': 'wheeled vehicle that can be pushed by a person', 'name': 'handcart'}, {'frequency': 'r', 'id': 548, 'synset': 'handcuff.n.01', 'synonyms': ['handcuff'], 'def': 'shackle that consists of a metal loop that can be locked around the wrist', 'name': 'handcuff'}, {'frequency': 'c', 'id': 549, 'synset': 'handkerchief.n.01', 'synonyms': ['handkerchief'], 'def': 'a square piece of cloth used for wiping the eyes or nose or as a costume accessory', 'name': 'handkerchief'}, {'frequency': 'f', 'id': 550, 'synset': 'handle.n.01', 'synonyms': ['handle', 'grip', 'handgrip'], 'def': 'the appendage to an object that is designed to be held in order 
to use or move it', 'name': 'handle'}, {'frequency': 'r', 'id': 551, 'synset': 'handsaw.n.01', 'synonyms': ['handsaw', "carpenter's_saw"], 'def': 'a saw used with one hand for cutting wood', 'name': 'handsaw'}, {'frequency': 'r', 'id': 552, 'synset': 'hardback.n.01', 'synonyms': ['hardback_book', 'hardcover_book'], 'def': 'a book with cardboard or cloth or leather covers', 'name': 'hardback_book'}, {'frequency': 'r', 'id': 553, 'synset': 'harmonium.n.01', 'synonyms': ['harmonium', 'organ_(musical_instrument)', 'reed_organ_(musical_instrument)'], 'def': 'a free-reed instrument in which air is forced through the reeds by bellows', 'name': 'harmonium'}, {'frequency': 'f', 'id': 554, 'synset': 'hat.n.01', 'synonyms': ['hat'], 'def': 'headwear that protects the head from bad weather, sun, or worn for fashion', 'name': 'hat'}, {'frequency': 'r', 'id': 555, 'synset': 'hatbox.n.01', 'synonyms': ['hatbox'], 'def': 'a round piece of luggage for carrying hats', 'name': 'hatbox'}, {'frequency': 'r', 'id': 556, 'synset': 'hatch.n.03', 'synonyms': ['hatch'], 'def': 'a movable barrier covering a hatchway', 'name': 'hatch'}, {'frequency': 'c', 'id': 557, 'synset': 'head_covering.n.01', 'synonyms': ['veil'], 'def': 'a garment that covers the head and face', 'name': 'veil'}, {'frequency': 'f', 'id': 558, 'synset': 'headband.n.01', 'synonyms': ['headband'], 'def': 'a band worn around or over the head', 'name': 'headband'}, {'frequency': 'f', 'id': 559, 'synset': 'headboard.n.01', 'synonyms': ['headboard'], 'def': 'a vertical board or panel forming the head of a bedstead', 'name': 'headboard'}, {'frequency': 'f', 'id': 560, 'synset': 'headlight.n.01', 'synonyms': ['headlight', 'headlamp'], 'def': 'a powerful light with reflector; attached to the front of an automobile or locomotive', 'name': 'headlight'}, {'frequency': 'c', 'id': 561, 'synset': 'headscarf.n.01', 'synonyms': ['headscarf'], 'def': 'a kerchief worn over the head and tied under the chin', 'name': 'headscarf'}, {'frequency': 'r', 'id': 562, 'synset': 'headset.n.01', 'synonyms': ['headset'], 'def': 'receiver consisting of a pair of headphones', 'name': 'headset'}, {'frequency': 'c', 'id': 563, 'synset': 'headstall.n.01', 'synonyms': ['headstall_(for_horses)', 'headpiece_(for_horses)'], 'def': "the band that is the part of a bridle that fits around a horse's head", 'name': 'headstall_(for_horses)'}, {'frequency': 'r', 'id': 564, 'synset': 'hearing_aid.n.02', 'synonyms': ['hearing_aid'], 'def': 'an acoustic device used to direct sound to the ear of a hearing-impaired person', 'name': 'hearing_aid'}, {'frequency': 'c', 'id': 565, 'synset': 'heart.n.02', 'synonyms': ['heart'], 'def': 'a muscular organ; its contractions move the blood through the body', 'name': 'heart'}, {'frequency': 'c', 'id': 566, 'synset': 'heater.n.01', 'synonyms': ['heater', 'warmer'], 'def': 'device that heats water or supplies warmth to a room', 'name': 'heater'}, {'frequency': 'c', 'id': 567, 'synset': 'helicopter.n.01', 'synonyms': ['helicopter'], 'def': 'an aircraft without wings that obtains its lift from the rotation of overhead blades', 'name': 'helicopter'}, {'frequency': 'f', 'id': 568, 'synset': 'helmet.n.02', 'synonyms': ['helmet'], 'def': 'a protective headgear made of hard material to resist blows', 'name': 'helmet'}, {'frequency': 'r', 'id': 569, 'synset': 'heron.n.02', 'synonyms': ['heron'], 'def': 'grey or white wading bird with long neck and long legs and (usually) long bill', 'name': 'heron'}, {'frequency': 'c', 'id': 570, 'synset': 'highchair.n.01', 'synonyms': 
['highchair', 'feeding_chair'], 'def': 'a chair for feeding a very young child', 'name': 'highchair'}, {'frequency': 'f', 'id': 571, 'synset': 'hinge.n.01', 'synonyms': ['hinge'], 'def': 'a joint that holds two parts together so that one can swing relative to the other', 'name': 'hinge'}, {'frequency': 'r', 'id': 572, 'synset': 'hippopotamus.n.01', 'synonyms': ['hippopotamus'], 'def': 'massive thick-skinned animal living in or around rivers of tropical Africa', 'name': 'hippopotamus'}, {'frequency': 'r', 'id': 573, 'synset': 'hockey_stick.n.01', 'synonyms': ['hockey_stick'], 'def': 'sports implement consisting of a stick used by hockey players to move the puck', 'name': 'hockey_stick'}, {'frequency': 'c', 'id': 574, 'synset': 'hog.n.03', 'synonyms': ['hog', 'pig'], 'def': 'domestic swine', 'name': 'hog'}, {'frequency': 'f', 'id': 575, 'synset': 'home_plate.n.01', 'synonyms': ['home_plate_(baseball)', 'home_base_(baseball)'], 'def': '(baseball) a rubber slab where the batter stands; it must be touched by a base runner in order to score', 'name': 'home_plate_(baseball)'}, {'frequency': 'c', 'id': 576, 'synset': 'honey.n.01', 'synonyms': ['honey'], 'def': 'a sweet yellow liquid produced by bees', 'name': 'honey'}, {'frequency': 'f', 'id': 577, 'synset': 'hood.n.06', 'synonyms': ['fume_hood', 'exhaust_hood'], 'def': 'metal covering leading to a vent that exhausts smoke or fumes', 'name': 'fume_hood'}, {'frequency': 'f', 'id': 578, 'synset': 'hook.n.05', 'synonyms': ['hook'], 'def': 'a curved or bent implement for suspending or pulling something', 'name': 'hook'}, {'frequency': 'f', 'id': 579, 'synset': 'horse.n.01', 'synonyms': ['horse'], 'def': 'a common horse', 'name': 'horse'}, {'frequency': 'f', 'id': 580, 'synset': 'hose.n.03', 'synonyms': ['hose', 'hosepipe'], 'def': 'a flexible pipe for conveying a liquid or gas', 'name': 'hose'}, {'frequency': 'r', 'id': 581, 'synset': 'hot-air_balloon.n.01', 'synonyms': ['hot-air_balloon'], 'def': 'balloon for travel through the air in a basket suspended below a large bag of heated air', 'name': 'hot-air_balloon'}, {'frequency': 'r', 'id': 582, 'synset': 'hot_plate.n.01', 'synonyms': ['hotplate'], 'def': 'a portable electric appliance for heating or cooking or keeping food warm', 'name': 'hotplate'}, {'frequency': 'c', 'id': 583, 'synset': 'hot_sauce.n.01', 'synonyms': ['hot_sauce'], 'def': 'a pungent peppery sauce', 'name': 'hot_sauce'}, {'frequency': 'r', 'id': 584, 'synset': 'hourglass.n.01', 'synonyms': ['hourglass'], 'def': 'a sandglass timer that runs for sixty minutes', 'name': 'hourglass'}, {'frequency': 'r', 'id': 585, 'synset': 'houseboat.n.01', 'synonyms': ['houseboat'], 'def': 'a barge that is designed and equipped for use as a dwelling', 'name': 'houseboat'}, {'frequency': 'r', 'id': 586, 'synset': 'hummingbird.n.01', 'synonyms': ['hummingbird'], 'def': 'tiny American bird having brilliant iridescent plumage and long slender bills', 'name': 'hummingbird'}, {'frequency': 'r', 'id': 587, 'synset': 'hummus.n.01', 'synonyms': ['hummus', 'humus', 'hommos', 'hoummos', 'humous'], 'def': 'a thick spread made from mashed chickpeas', 'name': 'hummus'}, {'frequency': 'c', 'id': 588, 'synset': 'ice_bear.n.01', 'synonyms': ['polar_bear'], 'def': 'white bear of Arctic regions', 'name': 'polar_bear'}, {'frequency': 'c', 'id': 589, 'synset': 'ice_cream.n.01', 'synonyms': ['icecream'], 'def': 'frozen dessert containing cream and sugar and flavoring', 'name': 'icecream'}, {'frequency': 'r', 'id': 590, 'synset': 'ice_lolly.n.01', 'synonyms': ['popsicle'], 
'def': 'ice cream or water ice on a small wooden stick', 'name': 'popsicle'}, {'frequency': 'c', 'id': 591, 'synset': 'ice_maker.n.01', 'synonyms': ['ice_maker'], 'def': 'an appliance included in some electric refrigerators for making ice cubes', 'name': 'ice_maker'}, {'frequency': 'r', 'id': 592, 'synset': 'ice_pack.n.01', 'synonyms': ['ice_pack', 'ice_bag'], 'def': 'a waterproof bag filled with ice: applied to the body (especially the head) to cool or reduce swelling', 'name': 'ice_pack'}, {'frequency': 'r', 'id': 593, 'synset': 'ice_skate.n.01', 'synonyms': ['ice_skate'], 'def': 'skate consisting of a boot with a steel blade fitted to the sole', 'name': 'ice_skate'}, {'frequency': 'r', 'id': 594, 'synset': 'ice_tea.n.01', 'synonyms': ['ice_tea', 'iced_tea'], 'def': 'strong tea served over ice', 'name': 'ice_tea'}, {'frequency': 'c', 'id': 595, 'synset': 'igniter.n.01', 'synonyms': ['igniter', 'ignitor', 'lighter'], 'def': 'a substance or device used to start a fire', 'name': 'igniter'}, {'frequency': 'r', 'id': 596, 'synset': 'incense.n.01', 'synonyms': ['incense'], 'def': 'a substance that produces a fragrant odor when burned', 'name': 'incense'}, {'frequency': 'r', 'id': 597, 'synset': 'inhaler.n.01', 'synonyms': ['inhaler', 'inhalator'], 'def': 'a dispenser that produces a chemical vapor to be inhaled through mouth or nose', 'name': 'inhaler'}, {'frequency': 'c', 'id': 598, 'synset': 'ipod.n.01', 'synonyms': ['iPod'], 'def': 'a pocket-sized device used to play music files', 'name': 'iPod'}, {'frequency': 'c', 'id': 599, 'synset': 'iron.n.04', 'synonyms': ['iron_(for_clothing)', 'smoothing_iron_(for_clothing)'], 'def': 'home appliance consisting of a flat metal base that is heated and used to smooth cloth', 'name': 'iron_(for_clothing)'}, {'frequency': 'r', 'id': 600, 'synset': 'ironing_board.n.01', 'synonyms': ['ironing_board'], 'def': 'narrow padded board on collapsible supports; used for ironing clothes', 'name': 'ironing_board'}, {'frequency': 'f', 'id': 601, 'synset': 'jacket.n.01', 'synonyms': ['jacket'], 'def': 'a waist-length coat', 'name': 'jacket'}, {'frequency': 'r', 'id': 602, 'synset': 'jam.n.01', 'synonyms': ['jam'], 'def': 'preserve of crushed fruit', 'name': 'jam'}, {'frequency': 'f', 'id': 603, 'synset': 'jean.n.01', 'synonyms': ['jean', 'blue_jean', 'denim'], 'def': '(usually plural) close-fitting trousers of heavy denim for manual work or casual wear', 'name': 'jean'}, {'frequency': 'c', 'id': 604, 'synset': 'jeep.n.01', 'synonyms': ['jeep', 'landrover'], 'def': 'a car suitable for traveling over rough terrain', 'name': 'jeep'}, {'frequency': 'r', 'id': 605, 'synset': 'jelly_bean.n.01', 'synonyms': ['jelly_bean', 'jelly_egg'], 'def': 'sugar-glazed jellied candy', 'name': 'jelly_bean'}, {'frequency': 'f', 'id': 606, 'synset': 'jersey.n.03', 'synonyms': ['jersey', 'T-shirt', 'tee_shirt'], 'def': 'a close-fitting pullover shirt', 'name': 'jersey'}, {'frequency': 'c', 'id': 607, 'synset': 'jet.n.01', 'synonyms': ['jet_plane', 'jet-propelled_plane'], 'def': 'an airplane powered by one or more jet engines', 'name': 'jet_plane'}, {'frequency': 'c', 'id': 608, 'synset': 'jewelry.n.01', 'synonyms': ['jewelry', 'jewellery'], 'def': 'an adornment (as a bracelet or ring or necklace) made of precious metals and set with gems (or imitation gems)', 'name': 'jewelry'}, {'frequency': 'r', 'id': 609, 'synset': 'joystick.n.02', 'synonyms': ['joystick'], 'def': 'a control device for computers consisting of a vertical handle that can move freely in two directions', 'name': 'joystick'}, 
{'frequency': 'r', 'id': 610, 'synset': 'jump_suit.n.01', 'synonyms': ['jumpsuit'], 'def': "one-piece garment fashioned after a parachutist's uniform", 'name': 'jumpsuit'}, {'frequency': 'c', 'id': 611, 'synset': 'kayak.n.01', 'synonyms': ['kayak'], 'def': 'a small canoe consisting of a light frame made watertight with animal skins', 'name': 'kayak'}, {'frequency': 'r', 'id': 612, 'synset': 'keg.n.02', 'synonyms': ['keg'], 'def': 'small cask or barrel', 'name': 'keg'}, {'frequency': 'r', 'id': 613, 'synset': 'kennel.n.01', 'synonyms': ['kennel', 'doghouse'], 'def': 'outbuilding that serves as a shelter for a dog', 'name': 'kennel'}, {'frequency': 'c', 'id': 614, 'synset': 'kettle.n.01', 'synonyms': ['kettle', 'boiler'], 'def': 'a metal pot for stewing or boiling; usually has a lid', 'name': 'kettle'}, {'frequency': 'f', 'id': 615, 'synset': 'key.n.01', 'synonyms': ['key'], 'def': 'metal instrument used to unlock a lock', 'name': 'key'}, {'frequency': 'r', 'id': 616, 'synset': 'keycard.n.01', 'synonyms': ['keycard'], 'def': 'a plastic card used to gain access typically to a door', 'name': 'keycard'}, {'frequency': 'r', 'id': 617, 'synset': 'kilt.n.01', 'synonyms': ['kilt'], 'def': 'a knee-length pleated tartan skirt worn by men as part of the traditional dress in the Highlands of northern Scotland', 'name': 'kilt'}, {'frequency': 'c', 'id': 618, 'synset': 'kimono.n.01', 'synonyms': ['kimono'], 'def': 'a loose robe; imitated from robes originally worn by Japanese', 'name': 'kimono'}, {'frequency': 'f', 'id': 619, 'synset': 'kitchen_sink.n.01', 'synonyms': ['kitchen_sink'], 'def': 'a sink in a kitchen', 'name': 'kitchen_sink'}, {'frequency': 'c', 'id': 620, 'synset': 'kitchen_table.n.01', 'synonyms': ['kitchen_table'], 'def': 'a table in the kitchen', 'name': 'kitchen_table'}, {'frequency': 'f', 'id': 621, 'synset': 'kite.n.03', 'synonyms': ['kite'], 'def': 'plaything consisting of a light frame covered with tissue paper; flown in wind at end of a string', 'name': 'kite'}, {'frequency': 'c', 'id': 622, 'synset': 'kitten.n.01', 'synonyms': ['kitten', 'kitty'], 'def': 'young domestic cat', 'name': 'kitten'}, {'frequency': 'c', 'id': 623, 'synset': 'kiwi.n.03', 'synonyms': ['kiwi_fruit'], 'def': 'fuzzy brown egg-shaped fruit with slightly tart green flesh', 'name': 'kiwi_fruit'}, {'frequency': 'f', 'id': 624, 'synset': 'knee_pad.n.01', 'synonyms': ['knee_pad'], 'def': 'protective garment consisting of a pad worn by football or baseball or hockey players', 'name': 'knee_pad'}, {'frequency': 'f', 'id': 625, 'synset': 'knife.n.01', 'synonyms': ['knife'], 'def': 'tool with a blade and point used as a cutting instrument', 'name': 'knife'}, {'frequency': 'r', 'id': 626, 'synset': 'knight.n.02', 'synonyms': ['knight_(chess_piece)', 'horse_(chess_piece)'], 'def': 'a chess game piece shaped to resemble the head of a horse', 'name': 'knight_(chess_piece)'}, {'frequency': 'r', 'id': 627, 'synset': 'knitting_needle.n.01', 'synonyms': ['knitting_needle'], 'def': 'needle consisting of a slender rod with pointed ends; usually used in pairs', 'name': 'knitting_needle'}, {'frequency': 'f', 'id': 628, 'synset': 'knob.n.02', 'synonyms': ['knob'], 'def': 'a round handle often found on a door', 'name': 'knob'}, {'frequency': 'r', 'id': 629, 'synset': 'knocker.n.05', 'synonyms': ['knocker_(on_a_door)', 'doorknocker'], 'def': 'a device (usually metal and ornamental) attached by a hinge to a door', 'name': 'knocker_(on_a_door)'}, {'frequency': 'r', 'id': 630, 'synset': 'koala.n.01', 'synonyms': ['koala', 'koala_bear'], 
'def': 'sluggish tailless Australian marsupial with grey furry ears and coat', 'name': 'koala'}, {'frequency': 'r', 'id': 631, 'synset': 'lab_coat.n.01', 'synonyms': ['lab_coat', 'laboratory_coat'], 'def': 'a light coat worn to protect clothing from substances used while working in a laboratory', 'name': 'lab_coat'}, {'frequency': 'f', 'id': 632, 'synset': 'ladder.n.01', 'synonyms': ['ladder'], 'def': 'steps consisting of two parallel members connected by rungs', 'name': 'ladder'}, {'frequency': 'c', 'id': 633, 'synset': 'ladle.n.01', 'synonyms': ['ladle'], 'def': 'a spoon-shaped vessel with a long handle frequently used to transfer liquids', 'name': 'ladle'}, {'frequency': 'r', 'id': 634, 'synset': 'ladybug.n.01', 'synonyms': ['ladybug', 'ladybeetle', 'ladybird_beetle'], 'def': 'small round bright-colored and spotted beetle, typically red and black', 'name': 'ladybug'}, {'frequency': 'c', 'id': 635, 'synset': 'lamb.n.01', 'synonyms': ['lamb_(animal)'], 'def': 'young sheep', 'name': 'lamb_(animal)'}, {'frequency': 'r', 'id': 636, 'synset': 'lamb_chop.n.01', 'synonyms': ['lamb-chop', 'lambchop'], 'def': 'chop cut from a lamb', 'name': 'lamb-chop'}, {'frequency': 'f', 'id': 637, 'synset': 'lamp.n.02', 'synonyms': ['lamp'], 'def': 'a piece of furniture holding one or more electric light bulbs', 'name': 'lamp'}, {'frequency': 'f', 'id': 638, 'synset': 'lamppost.n.01', 'synonyms': ['lamppost'], 'def': 'a metal post supporting an outdoor lamp (such as a streetlight)', 'name': 'lamppost'}, {'frequency': 'f', 'id': 639, 'synset': 'lampshade.n.01', 'synonyms': ['lampshade'], 'def': 'a protective ornamental shade used to screen a light bulb from direct view', 'name': 'lampshade'}, {'frequency': 'c', 'id': 640, 'synset': 'lantern.n.01', 'synonyms': ['lantern'], 'def': 'light in a transparent protective case', 'name': 'lantern'}, {'frequency': 'f', 'id': 641, 'synset': 'lanyard.n.02', 'synonyms': ['lanyard', 'laniard'], 'def': 'a cord worn around the neck to hold a knife or whistle, etc.', 'name': 'lanyard'}, {'frequency': 'f', 'id': 642, 'synset': 'laptop.n.01', 'synonyms': ['laptop_computer', 'notebook_computer'], 'def': 'a portable computer small enough to use in your lap', 'name': 'laptop_computer'}, {'frequency': 'r', 'id': 643, 'synset': 'lasagna.n.01', 'synonyms': ['lasagna', 'lasagne'], 'def': 'baked dish of layers of lasagna pasta with sauce and cheese and meat or vegetables', 'name': 'lasagna'}, {'frequency': 'c', 'id': 644, 'synset': 'latch.n.02', 'synonyms': ['latch'], 'def': 'a bar that can be lowered or slid into a groove to fasten a door or gate', 'name': 'latch'}, {'frequency': 'r', 'id': 645, 'synset': 'lawn_mower.n.01', 'synonyms': ['lawn_mower'], 'def': 'garden tool for mowing grass on lawns', 'name': 'lawn_mower'}, {'frequency': 'r', 'id': 646, 'synset': 'leather.n.01', 'synonyms': ['leather'], 'def': 'an animal skin made smooth and flexible by removing the hair and then tanning', 'name': 'leather'}, {'frequency': 'c', 'id': 647, 'synset': 'legging.n.01', 'synonyms': ['legging_(clothing)', 'leging_(clothing)', 'leg_covering'], 'def': 'a garment covering the leg (usually extending from the knee to the ankle)', 'name': 'legging_(clothing)'}, {'frequency': 'c', 'id': 648, 'synset': 'lego.n.01', 'synonyms': ['Lego', 'Lego_set'], 'def': "a child's plastic construction set for making models from blocks", 'name': 'Lego'}, {'frequency': 'f', 'id': 649, 'synset': 'lemon.n.01', 'synonyms': ['lemon'], 'def': 'yellow oval fruit with juicy acidic flesh', 'name': 'lemon'}, {'frequency': 'r', 
'id': 650, 'synset': 'lemonade.n.01', 'synonyms': ['lemonade'], 'def': 'sweetened beverage of diluted lemon juice', 'name': 'lemonade'}, {'frequency': 'f', 'id': 651, 'synset': 'lettuce.n.02', 'synonyms': ['lettuce'], 'def': 'leafy plant commonly eaten in salad or on sandwiches', 'name': 'lettuce'}, {'frequency': 'f', 'id': 652, 'synset': 'license_plate.n.01', 'synonyms': ['license_plate', 'numberplate'], 'def': "a plate mounted on the front and back of car and bearing the car's registration number", 'name': 'license_plate'}, {'frequency': 'f', 'id': 653, 'synset': 'life_buoy.n.01', 'synonyms': ['life_buoy', 'lifesaver', 'life_belt', 'life_ring'], 'def': 'a ring-shaped life preserver used to prevent drowning (NOT a life-jacket or vest)', 'name': 'life_buoy'}, {'frequency': 'f', 'id': 654, 'synset': 'life_jacket.n.01', 'synonyms': ['life_jacket', 'life_vest'], 'def': 'life preserver consisting of a sleeveless jacket of buoyant or inflatable design', 'name': 'life_jacket'}, {'frequency': 'f', 'id': 655, 'synset': 'light_bulb.n.01', 'synonyms': ['lightbulb'], 'def': 'glass bulb or tube shaped electric device that emits light (DO NOT MARK LAMPS AS A WHOLE)', 'name': 'lightbulb'}, {'frequency': 'r', 'id': 656, 'synset': 'lightning_rod.n.02', 'synonyms': ['lightning_rod', 'lightning_conductor'], 'def': 'a metallic conductor that is attached to a high point and leads to the ground', 'name': 'lightning_rod'}, {'frequency': 'c', 'id': 657, 'synset': 'lime.n.06', 'synonyms': ['lime'], 'def': 'the green acidic fruit of any of various lime trees', 'name': 'lime'}, {'frequency': 'r', 'id': 658, 'synset': 'limousine.n.01', 'synonyms': ['limousine'], 'def': 'long luxurious car; usually driven by a chauffeur', 'name': 'limousine'}, {'frequency': 'r', 'id': 659, 'synset': 'linen.n.02', 'synonyms': ['linen_paper'], 'def': 'a high-quality paper made of linen fibers or with a linen finish', 'name': 'linen_paper'}, {'frequency': 'c', 'id': 660, 'synset': 'lion.n.01', 'synonyms': ['lion'], 'def': 'large gregarious predatory cat of Africa and India', 'name': 'lion'}, {'frequency': 'c', 'id': 661, 'synset': 'lip_balm.n.01', 'synonyms': ['lip_balm'], 'def': 'a balm applied to the lips', 'name': 'lip_balm'}, {'frequency': 'c', 'id': 662, 'synset': 'lipstick.n.01', 'synonyms': ['lipstick', 'lip_rouge'], 'def': 'makeup that is used to color the lips', 'name': 'lipstick'}, {'frequency': 'r', 'id': 663, 'synset': 'liquor.n.01', 'synonyms': ['liquor', 'spirits', 'hard_liquor', 'liqueur', 'cordial'], 'def': 'an alcoholic beverage that is distilled rather than fermented', 'name': 'liquor'}, {'frequency': 'r', 'id': 664, 'synset': 'lizard.n.01', 'synonyms': ['lizard'], 'def': 'a reptile with usually two pairs of legs and a tapering tail', 'name': 'lizard'}, {'frequency': 'r', 'id': 665, 'synset': 'loafer.n.02', 'synonyms': ['Loafer_(type_of_shoe)'], 'def': 'a low leather step-in shoe', 'name': 'Loafer_(type_of_shoe)'}, {'frequency': 'f', 'id': 666, 'synset': 'log.n.01', 'synonyms': ['log'], 'def': 'a segment of the trunk of a tree when stripped of branches', 'name': 'log'}, {'frequency': 'c', 'id': 667, 'synset': 'lollipop.n.02', 'synonyms': ['lollipop'], 'def': 'hard candy on a stick', 'name': 'lollipop'}, {'frequency': 'c', 'id': 668, 'synset': 'lotion.n.01', 'synonyms': ['lotion'], 'def': 'any of various cosmetic preparations that are applied to the skin', 'name': 'lotion'}, {'frequency': 'f', 'id': 669, 'synset': 'loudspeaker.n.01', 'synonyms': ['speaker_(stero_equipment)'], 'def': 'electronic device that produces 
sound often as part of a stereo system', 'name': 'speaker_(stero_equipment)'}, {'frequency': 'c', 'id': 670, 'synset': 'love_seat.n.01', 'synonyms': ['loveseat'], 'def': 'small sofa that seats two people', 'name': 'loveseat'}, {'frequency': 'r', 'id': 671, 'synset': 'machine_gun.n.01', 'synonyms': ['machine_gun'], 'def': 'a rapidly firing automatic gun', 'name': 'machine_gun'}, {'frequency': 'f', 'id': 672, 'synset': 'magazine.n.02', 'synonyms': ['magazine'], 'def': 'a paperback periodic publication', 'name': 'magazine'}, {'frequency': 'f', 'id': 673, 'synset': 'magnet.n.01', 'synonyms': ['magnet'], 'def': 'a device that attracts iron and produces a magnetic field', 'name': 'magnet'}, {'frequency': 'r', 'id': 674, 'synset': 'mail_slot.n.01', 'synonyms': ['mail_slot'], 'def': 'a slot (usually in a door) through which mail can be delivered', 'name': 'mail_slot'}, {'frequency': 'c', 'id': 675, 'synset': 'mailbox.n.01', 'synonyms': ['mailbox_(at_home)', 'letter_box_(at_home)'], 'def': 'a private box for delivery of mail', 'name': 'mailbox_(at_home)'}, {'frequency': 'r', 'id': 676, 'synset': 'mallet.n.01', 'synonyms': ['mallet'], 'def': 'a sports implement with a long handle and a hammer-like head used to hit a ball', 'name': 'mallet'}, {'frequency': 'r', 'id': 677, 'synset': 'mammoth.n.01', 'synonyms': ['mammoth'], 'def': 'any of numerous extinct elephants widely distributed in the Pleistocene', 'name': 'mammoth'}, {'frequency': 'c', 'id': 678, 'synset': 'mandarin.n.05', 'synonyms': ['mandarin_orange'], 'def': 'a somewhat flat reddish-orange loose skinned citrus of China', 'name': 'mandarin_orange'}, {'frequency': 'c', 'id': 679, 'synset': 'manger.n.01', 'synonyms': ['manger', 'trough'], 'def': 'a container (usually in a barn or stable) from which cattle or horses feed', 'name': 'manger'}, {'frequency': 'f', 'id': 680, 'synset': 'manhole.n.01', 'synonyms': ['manhole'], 'def': 'a hole (usually with a flush cover) through which a person can gain access to an underground structure', 'name': 'manhole'}, {'frequency': 'c', 'id': 681, 'synset': 'map.n.01', 'synonyms': ['map'], 'def': "a diagrammatic representation of the earth's surface (or part of it)", 'name': 'map'}, {'frequency': 'c', 'id': 682, 'synset': 'marker.n.03', 'synonyms': ['marker'], 'def': 'a writing implement for making a mark', 'name': 'marker'}, {'frequency': 'r', 'id': 683, 'synset': 'martini.n.01', 'synonyms': ['martini'], 'def': 'a cocktail made of gin (or vodka) with dry vermouth', 'name': 'martini'}, {'frequency': 'r', 'id': 684, 'synset': 'mascot.n.01', 'synonyms': ['mascot'], 'def': 'a person or animal that is adopted by a team or other group as a symbolic figure', 'name': 'mascot'}, {'frequency': 'c', 'id': 685, 'synset': 'mashed_potato.n.01', 'synonyms': ['mashed_potato'], 'def': 'potato that has been peeled and boiled and then mashed', 'name': 'mashed_potato'}, {'frequency': 'r', 'id': 686, 'synset': 'masher.n.02', 'synonyms': ['masher'], 'def': 'a kitchen utensil used for mashing (e.g. 
potatoes)', 'name': 'masher'}, {'frequency': 'f', 'id': 687, 'synset': 'mask.n.04', 'synonyms': ['mask', 'facemask'], 'def': 'a protective covering worn over the face', 'name': 'mask'}, {'frequency': 'f', 'id': 688, 'synset': 'mast.n.01', 'synonyms': ['mast'], 'def': 'a vertical spar for supporting sails', 'name': 'mast'}, {'frequency': 'c', 'id': 689, 'synset': 'mat.n.03', 'synonyms': ['mat_(gym_equipment)', 'gym_mat'], 'def': 'sports equipment consisting of a piece of thick padding on the floor for gymnastics', 'name': 'mat_(gym_equipment)'}, {'frequency': 'r', 'id': 690, 'synset': 'matchbox.n.01', 'synonyms': ['matchbox'], 'def': 'a box for holding matches', 'name': 'matchbox'}, {'frequency': 'f', 'id': 691, 'synset': 'mattress.n.01', 'synonyms': ['mattress'], 'def': 'a thick pad filled with resilient material used as a bed or part of a bed', 'name': 'mattress'}, {'frequency': 'c', 'id': 692, 'synset': 'measuring_cup.n.01', 'synonyms': ['measuring_cup'], 'def': 'graduated cup used to measure liquid or granular ingredients', 'name': 'measuring_cup'}, {'frequency': 'c', 'id': 693, 'synset': 'measuring_stick.n.01', 'synonyms': ['measuring_stick', 'ruler_(measuring_stick)', 'measuring_rod'], 'def': 'measuring instrument having a sequence of marks at regular intervals', 'name': 'measuring_stick'}, {'frequency': 'c', 'id': 694, 'synset': 'meatball.n.01', 'synonyms': ['meatball'], 'def': 'ground meat formed into a ball and fried or simmered in broth', 'name': 'meatball'}, {'frequency': 'c', 'id': 695, 'synset': 'medicine.n.02', 'synonyms': ['medicine'], 'def': 'something that treats or prevents or alleviates the symptoms of disease', 'name': 'medicine'}, {'frequency': 'r', 'id': 696, 'synset': 'melon.n.01', 'synonyms': ['melon'], 'def': 'fruit of the gourd family having a hard rind and sweet juicy flesh', 'name': 'melon'}, {'frequency': 'f', 'id': 697, 'synset': 'microphone.n.01', 'synonyms': ['microphone'], 'def': 'device for converting sound waves into electrical energy', 'name': 'microphone'}, {'frequency': 'r', 'id': 698, 'synset': 'microscope.n.01', 'synonyms': ['microscope'], 'def': 'magnifier of the image of small objects', 'name': 'microscope'}, {'frequency': 'f', 'id': 699, 'synset': 'microwave.n.02', 'synonyms': ['microwave_oven'], 'def': 'kitchen appliance that cooks food by passing an electromagnetic wave through it', 'name': 'microwave_oven'}, {'frequency': 'r', 'id': 700, 'synset': 'milestone.n.01', 'synonyms': ['milestone', 'milepost'], 'def': 'stone post at side of a road to show distances', 'name': 'milestone'}, {'frequency': 'c', 'id': 701, 'synset': 'milk.n.01', 'synonyms': ['milk'], 'def': 'a white nutritious liquid secreted by mammals and used as food by human beings', 'name': 'milk'}, {'frequency': 'f', 'id': 702, 'synset': 'minivan.n.01', 'synonyms': ['minivan'], 'def': 'a small box-shaped passenger van', 'name': 'minivan'}, {'frequency': 'r', 'id': 703, 'synset': 'mint.n.05', 'synonyms': ['mint_candy'], 'def': 'a candy that is flavored with a mint oil', 'name': 'mint_candy'}, {'frequency': 'f', 'id': 704, 'synset': 'mirror.n.01', 'synonyms': ['mirror'], 'def': 'polished surface that forms images by reflecting light', 'name': 'mirror'}, {'frequency': 'c', 'id': 705, 'synset': 'mitten.n.01', 'synonyms': ['mitten'], 'def': 'glove that encases the thumb separately and the other four fingers together', 'name': 'mitten'}, {'frequency': 'c', 'id': 706, 'synset': 'mixer.n.04', 'synonyms': ['mixer_(kitchen_tool)', 'stand_mixer'], 'def': 'a kitchen utensil that is used for mixing 
foods', 'name': 'mixer_(kitchen_tool)'}, {'frequency': 'c', 'id': 707, 'synset': 'money.n.03', 'synonyms': ['money'], 'def': 'the official currency issued by a government or national bank', 'name': 'money'}, {'frequency': 'f', 'id': 708, 'synset': 'monitor.n.04', 'synonyms': ['monitor_(computer_equipment) computer_monitor'], 'def': 'a computer monitor', 'name': 'monitor_(computer_equipment) computer_monitor'}, {'frequency': 'c', 'id': 709, 'synset': 'monkey.n.01', 'synonyms': ['monkey'], 'def': 'any of various long-tailed primates', 'name': 'monkey'}, {'frequency': 'f', 'id': 710, 'synset': 'motor.n.01', 'synonyms': ['motor'], 'def': 'machine that converts other forms of energy into mechanical energy and so imparts motion', 'name': 'motor'}, {'frequency': 'f', 'id': 711, 'synset': 'motor_scooter.n.01', 'synonyms': ['motor_scooter', 'scooter'], 'def': 'a wheeled vehicle with small wheels and a low-powered engine', 'name': 'motor_scooter'}, {'frequency': 'r', 'id': 712, 'synset': 'motor_vehicle.n.01', 'synonyms': ['motor_vehicle', 'automotive_vehicle'], 'def': 'a self-propelled wheeled vehicle that does not run on rails', 'name': 'motor_vehicle'}, {'frequency': 'r', 'id': 713, 'synset': 'motorboat.n.01', 'synonyms': ['motorboat', 'powerboat'], 'def': 'a boat propelled by an internal-combustion engine', 'name': 'motorboat'}, {'frequency': 'f', 'id': 714, 'synset': 'motorcycle.n.01', 'synonyms': ['motorcycle'], 'def': 'a motor vehicle with two wheels and a strong frame', 'name': 'motorcycle'}, {'frequency': 'f', 'id': 715, 'synset': 'mound.n.01', 'synonyms': ['mound_(baseball)', "pitcher's_mound"], 'def': '(baseball) the slight elevation on which the pitcher stands', 'name': 'mound_(baseball)'}, {'frequency': 'r', 'id': 716, 'synset': 'mouse.n.01', 'synonyms': ['mouse_(animal_rodent)'], 'def': 'a small rodent with pointed snouts and small ears on elongated bodies with slender usually hairless tails', 'name': 'mouse_(animal_rodent)'}, {'frequency': 'f', 'id': 717, 'synset': 'mouse.n.04', 'synonyms': ['mouse_(computer_equipment)', 'computer_mouse'], 'def': 'a computer input device that controls an on-screen pointer', 'name': 'mouse_(computer_equipment)'}, {'frequency': 'f', 'id': 718, 'synset': 'mousepad.n.01', 'synonyms': ['mousepad'], 'def': 'a small portable pad that provides an operating surface for a computer mouse', 'name': 'mousepad'}, {'frequency': 'c', 'id': 719, 'synset': 'muffin.n.01', 'synonyms': ['muffin'], 'def': 'a sweet quick bread baked in a cup-shaped pan', 'name': 'muffin'}, {'frequency': 'f', 'id': 720, 'synset': 'mug.n.04', 'synonyms': ['mug'], 'def': 'with handle and usually cylindrical', 'name': 'mug'}, {'frequency': 'f', 'id': 721, 'synset': 'mushroom.n.02', 'synonyms': ['mushroom'], 'def': 'a common mushroom', 'name': 'mushroom'}, {'frequency': 'r', 'id': 722, 'synset': 'music_stool.n.01', 'synonyms': ['music_stool', 'piano_stool'], 'def': 'a stool for piano players; usually adjustable in height', 'name': 'music_stool'}, {'frequency': 'r', 'id': 723, 'synset': 'musical_instrument.n.01', 'synonyms': ['musical_instrument', 'instrument_(musical)'], 'def': 'any of various devices or contrivances that can be used to produce musical tones or sounds', 'name': 'musical_instrument'}, {'frequency': 'r', 'id': 724, 'synset': 'nailfile.n.01', 'synonyms': ['nailfile'], 'def': 'a small flat file for shaping the nails', 'name': 'nailfile'}, {'frequency': 'r', 'id': 725, 'synset': 'nameplate.n.01', 'synonyms': ['nameplate'], 'def': 'a plate bearing a name', 'name': 'nameplate'}, 
{'frequency': 'f', 'id': 726, 'synset': 'napkin.n.01', 'synonyms': ['napkin', 'table_napkin', 'serviette'], 'def': 'a small piece of table linen or paper that is used to wipe the mouth and to cover the lap in order to protect clothing', 'name': 'napkin'}, {'frequency': 'r', 'id': 727, 'synset': 'neckerchief.n.01', 'synonyms': ['neckerchief'], 'def': 'a kerchief worn around the neck', 'name': 'neckerchief'}, {'frequency': 'f', 'id': 728, 'synset': 'necklace.n.01', 'synonyms': ['necklace'], 'def': 'jewelry consisting of a cord or chain (often bearing gems) worn about the neck as an ornament', 'name': 'necklace'}, {'frequency': 'f', 'id': 729, 'synset': 'necktie.n.01', 'synonyms': ['necktie', 'tie_(necktie)'], 'def': 'neckwear consisting of a long narrow piece of material worn under a collar and tied in knot at the front', 'name': 'necktie'}, {'frequency': 'r', 'id': 730, 'synset': 'needle.n.03', 'synonyms': ['needle'], 'def': 'a sharp pointed implement (usually metal)', 'name': 'needle'}, {'frequency': 'c', 'id': 731, 'synset': 'nest.n.01', 'synonyms': ['nest'], 'def': 'a structure in which animals lay eggs or give birth to their young', 'name': 'nest'}, {'frequency': 'r', 'id': 732, 'synset': 'newsstand.n.01', 'synonyms': ['newsstand'], 'def': 'a stall where newspapers and other periodicals are sold', 'name': 'newsstand'}, {'frequency': 'c', 'id': 733, 'synset': 'nightwear.n.01', 'synonyms': ['nightshirt', 'nightwear', 'sleepwear', 'nightclothes'], 'def': 'garments designed to be worn in bed', 'name': 'nightshirt'}, {'frequency': 'r', 'id': 734, 'synset': 'nosebag.n.01', 'synonyms': ['nosebag_(for_animals)', 'feedbag'], 'def': 'a canvas bag that is used to feed an animal (such as a horse); covers the muzzle and fastens at the top of the head', 'name': 'nosebag_(for_animals)'}, {'frequency': 'r', 'id': 735, 'synset': 'noseband.n.01', 'synonyms': ['noseband_(for_animals)', 'nosepiece_(for_animals)'], 'def': "a strap that is the part of a bridle that goes over the animal's nose", 'name': 'noseband_(for_animals)'}, {'frequency': 'f', 'id': 736, 'synset': 'notebook.n.01', 'synonyms': ['notebook'], 'def': 'a book with blank pages for recording notes or memoranda', 'name': 'notebook'}, {'frequency': 'c', 'id': 737, 'synset': 'notepad.n.01', 'synonyms': ['notepad'], 'def': 'a pad of paper for keeping notes', 'name': 'notepad'}, {'frequency': 'c', 'id': 738, 'synset': 'nut.n.03', 'synonyms': ['nut'], 'def': 'a small metal block (usually square or hexagonal) with internal screw thread to be fitted onto a bolt', 'name': 'nut'}, {'frequency': 'r', 'id': 739, 'synset': 'nutcracker.n.01', 'synonyms': ['nutcracker'], 'def': 'a hand tool used to crack nuts open', 'name': 'nutcracker'}, {'frequency': 'c', 'id': 740, 'synset': 'oar.n.01', 'synonyms': ['oar'], 'def': 'an implement used to propel or steer a boat', 'name': 'oar'}, {'frequency': 'r', 'id': 741, 'synset': 'octopus.n.01', 'synonyms': ['octopus_(food)'], 'def': 'tentacles of octopus prepared as food', 'name': 'octopus_(food)'}, {'frequency': 'r', 'id': 742, 'synset': 'octopus.n.02', 'synonyms': ['octopus_(animal)'], 'def': 'bottom-living cephalopod having a soft oval body with eight long tentacles', 'name': 'octopus_(animal)'}, {'frequency': 'c', 'id': 743, 'synset': 'oil_lamp.n.01', 'synonyms': ['oil_lamp', 'kerosene_lamp', 'kerosine_lamp'], 'def': 'a lamp that burns oil (as kerosine) for light', 'name': 'oil_lamp'}, {'frequency': 'c', 'id': 744, 'synset': 'olive_oil.n.01', 'synonyms': ['olive_oil'], 'def': 'oil from olives', 'name': 'olive_oil'}, 
{'frequency': 'r', 'id': 745, 'synset': 'omelet.n.01', 'synonyms': ['omelet', 'omelette'], 'def': 'beaten eggs cooked until just set; may be folded around e.g. ham or cheese or jelly', 'name': 'omelet'}, {'frequency': 'f', 'id': 746, 'synset': 'onion.n.01', 'synonyms': ['onion'], 'def': 'the bulb of an onion plant', 'name': 'onion'}, {'frequency': 'f', 'id': 747, 'synset': 'orange.n.01', 'synonyms': ['orange_(fruit)'], 'def': 'orange (FRUIT of an orange tree)', 'name': 'orange_(fruit)'}, {'frequency': 'c', 'id': 748, 'synset': 'orange_juice.n.01', 'synonyms': ['orange_juice'], 'def': 'bottled or freshly squeezed juice of oranges', 'name': 'orange_juice'}, {'frequency': 'r', 'id': 749, 'synset': 'oregano.n.01', 'synonyms': ['oregano', 'marjoram'], 'def': 'aromatic Eurasian perennial herb used in cooking and baking', 'name': 'oregano'}, {'frequency': 'c', 'id': 750, 'synset': 'ostrich.n.02', 'synonyms': ['ostrich'], 'def': 'fast-running African flightless bird with two-toed feet; largest living bird', 'name': 'ostrich'}, {'frequency': 'c', 'id': 751, 'synset': 'ottoman.n.03', 'synonyms': ['ottoman', 'pouf', 'pouffe', 'hassock'], 'def': 'thick cushion used as a seat', 'name': 'ottoman'}, {'frequency': 'c', 'id': 752, 'synset': 'overall.n.01', 'synonyms': ['overalls_(clothing)'], 'def': 'work clothing consisting of denim trousers usually with a bib and shoulder straps', 'name': 'overalls_(clothing)'}, {'frequency': 'c', 'id': 753, 'synset': 'owl.n.01', 'synonyms': ['owl'], 'def': 'nocturnal bird of prey with hawk-like beak and claws and large head with front-facing eyes', 'name': 'owl'}, {'frequency': 'c', 'id': 754, 'synset': 'packet.n.03', 'synonyms': ['packet'], 'def': 'a small package or bundle', 'name': 'packet'}, {'frequency': 'r', 'id': 755, 'synset': 'pad.n.03', 'synonyms': ['inkpad', 'inking_pad', 'stamp_pad'], 'def': 'absorbent material saturated with ink used to transfer ink evenly to a rubber stamp', 'name': 'inkpad'}, {'frequency': 'c', 'id': 756, 'synset': 'pad.n.04', 'synonyms': ['pad'], 'def': 'a flat mass of soft material used for protection, stuffing, or comfort', 'name': 'pad'}, {'frequency': 'c', 'id': 757, 'synset': 'paddle.n.04', 'synonyms': ['paddle', 'boat_paddle'], 'def': 'a short light oar used without an oarlock to propel a canoe or small boat', 'name': 'paddle'}, {'frequency': 'c', 'id': 758, 'synset': 'padlock.n.01', 'synonyms': ['padlock'], 'def': 'a detachable, portable lock', 'name': 'padlock'}, {'frequency': 'r', 'id': 759, 'synset': 'paintbox.n.01', 'synonyms': ['paintbox'], 'def': "a box containing a collection of cubes or tubes of artists' paint", 'name': 'paintbox'}, {'frequency': 'c', 'id': 760, 'synset': 'paintbrush.n.01', 'synonyms': ['paintbrush'], 'def': 'a brush used as an applicator to apply paint', 'name': 'paintbrush'}, {'frequency': 'f', 'id': 761, 'synset': 'painting.n.01', 'synonyms': ['painting'], 'def': 'graphic art consisting of an artistic composition made by applying paints to a surface', 'name': 'painting'}, {'frequency': 'c', 'id': 762, 'synset': 'pajama.n.02', 'synonyms': ['pajamas', 'pyjamas'], 'def': 'loose-fitting nightclothes worn for sleeping or lounging', 'name': 'pajamas'}, {'frequency': 'c', 'id': 763, 'synset': 'palette.n.02', 'synonyms': ['palette', 'pallet'], 'def': 'board that provides a flat surface on which artists mix paints and the range of colors used', 'name': 'palette'}, {'frequency': 'f', 'id': 764, 'synset': 'pan.n.01', 'synonyms': ['pan_(for_cooking)', 'cooking_pan'], 'def': 'cooking utensil consisting of a wide 
metal vessel', 'name': 'pan_(for_cooking)'}, {'frequency': 'r', 'id': 765, 'synset': 'pan.n.03', 'synonyms': ['pan_(metal_container)'], 'def': 'shallow container made of metal', 'name': 'pan_(metal_container)'}, {'frequency': 'c', 'id': 766, 'synset': 'pancake.n.01', 'synonyms': ['pancake'], 'def': 'a flat cake of thin batter fried on both sides on a griddle', 'name': 'pancake'}, {'frequency': 'r', 'id': 767, 'synset': 'pantyhose.n.01', 'synonyms': ['pantyhose'], 'def': "a woman's tights consisting of underpants and stockings", 'name': 'pantyhose'}, {'frequency': 'r', 'id': 768, 'synset': 'papaya.n.02', 'synonyms': ['papaya'], 'def': 'large oval melon-like tropical fruit with yellowish flesh', 'name': 'papaya'}, {'frequency': 'r', 'id': 769, 'synset': 'paper_clip.n.01', 'synonyms': ['paperclip'], 'def': 'a wire or plastic clip for holding sheets of paper together', 'name': 'paperclip'}, {'frequency': 'f', 'id': 770, 'synset': 'paper_plate.n.01', 'synonyms': ['paper_plate'], 'def': 'a disposable plate made of cardboard', 'name': 'paper_plate'}, {'frequency': 'f', 'id': 771, 'synset': 'paper_towel.n.01', 'synonyms': ['paper_towel'], 'def': 'a disposable towel made of absorbent paper', 'name': 'paper_towel'}, {'frequency': 'r', 'id': 772, 'synset': 'paperback_book.n.01', 'synonyms': ['paperback_book', 'paper-back_book', 'softback_book', 'soft-cover_book'], 'def': 'a book with paper covers', 'name': 'paperback_book'}, {'frequency': 'r', 'id': 773, 'synset': 'paperweight.n.01', 'synonyms': ['paperweight'], 'def': 'a weight used to hold down a stack of papers', 'name': 'paperweight'}, {'frequency': 'c', 'id': 774, 'synset': 'parachute.n.01', 'synonyms': ['parachute'], 'def': 'rescue equipment consisting of a device that fills with air and retards your fall', 'name': 'parachute'}, {'frequency': 'r', 'id': 775, 'synset': 'parakeet.n.01', 'synonyms': ['parakeet', 'parrakeet', 'parroket', 'paraquet', 'paroquet', 'parroquet'], 'def': 'any of numerous small slender long-tailed parrots', 'name': 'parakeet'}, {'frequency': 'c', 'id': 776, 'synset': 'parasail.n.01', 'synonyms': ['parasail_(sports)'], 'def': 'parachute that will lift a person up into the air when it is towed by a motorboat or a car', 'name': 'parasail_(sports)'}, {'frequency': 'r', 'id': 777, 'synset': 'parchment.n.01', 'synonyms': ['parchment'], 'def': 'a superior paper resembling sheepskin', 'name': 'parchment'}, {'frequency': 'r', 'id': 778, 'synset': 'parka.n.01', 'synonyms': ['parka', 'anorak'], 'def': "a kind of heavy jacket (`windcheater' is a British term)", 'name': 'parka'}, {'frequency': 'f', 'id': 779, 'synset': 'parking_meter.n.01', 'synonyms': ['parking_meter'], 'def': 'a coin-operated timer located next to a parking space', 'name': 'parking_meter'}, {'frequency': 'c', 'id': 780, 'synset': 'parrot.n.01', 'synonyms': ['parrot'], 'def': 'usually brightly colored tropical birds with short hooked beaks and the ability to mimic sounds', 'name': 'parrot'}, {'frequency': 'c', 'id': 781, 'synset': 'passenger_car.n.01', 'synonyms': ['passenger_car_(part_of_a_train)', 'coach_(part_of_a_train)'], 'def': 'a railcar where passengers ride', 'name': 'passenger_car_(part_of_a_train)'}, {'frequency': 'r', 'id': 782, 'synset': 'passenger_ship.n.01', 'synonyms': ['passenger_ship'], 'def': 'a ship built to carry passengers', 'name': 'passenger_ship'}, {'frequency': 'r', 'id': 783, 'synset': 'passport.n.02', 'synonyms': ['passport'], 'def': 'a document issued by a country to a citizen allowing that person to travel abroad and re-enter the home 
country', 'name': 'passport'}, {'frequency': 'f', 'id': 784, 'synset': 'pastry.n.02', 'synonyms': ['pastry'], 'def': 'any of various baked foods made of dough or batter', 'name': 'pastry'}, {'frequency': 'r', 'id': 785, 'synset': 'patty.n.01', 'synonyms': ['patty_(food)'], 'def': 'small flat mass of chopped food', 'name': 'patty_(food)'}, {'frequency': 'c', 'id': 786, 'synset': 'pea.n.01', 'synonyms': ['pea_(food)'], 'def': 'seed of a pea plant used for food', 'name': 'pea_(food)'}, {'frequency': 'c', 'id': 787, 'synset': 'peach.n.03', 'synonyms': ['peach'], 'def': 'downy juicy fruit with sweet yellowish or whitish flesh', 'name': 'peach'}, {'frequency': 'c', 'id': 788, 'synset': 'peanut_butter.n.01', 'synonyms': ['peanut_butter'], 'def': 'a spread made from ground peanuts', 'name': 'peanut_butter'}, {'frequency': 'c', 'id': 789, 'synset': 'pear.n.01', 'synonyms': ['pear'], 'def': 'sweet juicy gritty-textured fruit available in many varieties', 'name': 'pear'}, {'frequency': 'r', 'id': 790, 'synset': 'peeler.n.03', 'synonyms': ['peeler_(tool_for_fruit_and_vegetables)'], 'def': 'a device for peeling vegetables or fruits', 'name': 'peeler_(tool_for_fruit_and_vegetables)'}, {'frequency': 'r', 'id': 791, 'synset': 'pegboard.n.01', 'synonyms': ['pegboard'], 'def': 'a board perforated with regularly spaced holes into which pegs can be fitted', 'name': 'pegboard'}, {'frequency': 'c', 'id': 792, 'synset': 'pelican.n.01', 'synonyms': ['pelican'], 'def': 'large long-winged warm-water seabird having a large bill with a distensible pouch for fish', 'name': 'pelican'}, {'frequency': 'f', 'id': 793, 'synset': 'pen.n.01', 'synonyms': ['pen'], 'def': 'a writing implement with a point from which ink flows', 'name': 'pen'}, {'frequency': 'c', 'id': 794, 'synset': 'pencil.n.01', 'synonyms': ['pencil'], 'def': 'a thin cylindrical pointed writing implement made of wood and graphite', 'name': 'pencil'}, {'frequency': 'r', 'id': 795, 'synset': 'pencil_box.n.01', 'synonyms': ['pencil_box', 'pencil_case'], 'def': 'a box for holding pencils', 'name': 'pencil_box'}, {'frequency': 'r', 'id': 796, 'synset': 'pencil_sharpener.n.01', 'synonyms': ['pencil_sharpener'], 'def': 'a rotary implement for sharpening the point on pencils', 'name': 'pencil_sharpener'}, {'frequency': 'r', 'id': 797, 'synset': 'pendulum.n.01', 'synonyms': ['pendulum'], 'def': 'an apparatus consisting of an object mounted so that it swings freely under the influence of gravity', 'name': 'pendulum'}, {'frequency': 'c', 'id': 798, 'synset': 'penguin.n.01', 'synonyms': ['penguin'], 'def': 'short-legged flightless birds of cold southern regions having webbed feet and wings modified as flippers', 'name': 'penguin'}, {'frequency': 'r', 'id': 799, 'synset': 'pennant.n.02', 'synonyms': ['pennant'], 'def': 'a flag longer than it is wide (and often tapering)', 'name': 'pennant'}, {'frequency': 'r', 'id': 800, 'synset': 'penny.n.02', 'synonyms': ['penny_(coin)'], 'def': 'a coin worth one-hundredth of the value of the basic unit', 'name': 'penny_(coin)'}, {'frequency': 'c', 'id': 801, 'synset': 'pepper.n.03', 'synonyms': ['pepper', 'peppercorn'], 'def': 'pungent seasoning from the berry of the common pepper plant; whole or ground', 'name': 'pepper'}, {'frequency': 'c', 'id': 802, 'synset': 'pepper_mill.n.01', 'synonyms': ['pepper_mill', 'pepper_grinder'], 'def': 'a mill for grinding pepper', 'name': 'pepper_mill'}, {'frequency': 'c', 'id': 803, 'synset': 'perfume.n.02', 'synonyms': ['perfume'], 'def': 'a toiletry that emits and diffuses a fragrant odor', 
'name': 'perfume'}, {'frequency': 'r', 'id': 804, 'synset': 'persimmon.n.02', 'synonyms': ['persimmon'], 'def': 'orange fruit resembling a plum; edible when fully ripe', 'name': 'persimmon'}, {'frequency': 'f', 'id': 805, 'synset': 'person.n.01', 'synonyms': ['baby', 'child', 'boy', 'girl', 'man', 'woman', 'person', 'human'], 'def': 'a human being', 'name': 'baby'}, {'frequency': 'r', 'id': 806, 'synset': 'pet.n.01', 'synonyms': ['pet'], 'def': 'a domesticated animal kept for companionship or amusement', 'name': 'pet'}, {'frequency': 'r', 'id': 807, 'synset': 'petfood.n.01', 'synonyms': ['petfood', 'pet-food'], 'def': 'food prepared for animal pets', 'name': 'petfood'}, {'frequency': 'r', 'id': 808, 'synset': 'pew.n.01', 'synonyms': ['pew_(church_bench)', 'church_bench'], 'def': 'long bench with backs; used in church by the congregation', 'name': 'pew_(church_bench)'}, {'frequency': 'r', 'id': 809, 'synset': 'phonebook.n.01', 'synonyms': ['phonebook', 'telephone_book', 'telephone_directory'], 'def': 'a directory containing an alphabetical list of telephone subscribers and their telephone numbers', 'name': 'phonebook'}, {'frequency': 'c', 'id': 810, 'synset': 'phonograph_record.n.01', 'synonyms': ['phonograph_record', 'phonograph_recording', 'record_(phonograph_recording)'], 'def': 'sound recording consisting of a typically black disk with a continuous groove', 'name': 'phonograph_record'}, {'frequency': 'c', 'id': 811, 'synset': 'piano.n.01', 'synonyms': ['piano'], 'def': 'a keyboard instrument that is played by depressing keys that cause hammers to strike tuned strings and produce sounds', 'name': 'piano'}, {'frequency': 'f', 'id': 812, 'synset': 'pickle.n.01', 'synonyms': ['pickle'], 'def': 'vegetables (especially cucumbers) preserved in brine or vinegar', 'name': 'pickle'}, {'frequency': 'f', 'id': 813, 'synset': 'pickup.n.01', 'synonyms': ['pickup_truck'], 'def': 'a light truck with an open body and low sides and a tailboard', 'name': 'pickup_truck'}, {'frequency': 'c', 'id': 814, 'synset': 'pie.n.01', 'synonyms': ['pie'], 'def': 'dish baked in pastry-lined pan often with a pastry top', 'name': 'pie'}, {'frequency': 'c', 'id': 815, 'synset': 'pigeon.n.01', 'synonyms': ['pigeon'], 'def': 'wild and domesticated birds having a heavy body and short legs', 'name': 'pigeon'}, {'frequency': 'r', 'id': 816, 'synset': 'piggy_bank.n.01', 'synonyms': ['piggy_bank', 'penny_bank'], 'def': "a child's coin bank (often shaped like a pig)", 'name': 'piggy_bank'}, {'frequency': 'f', 'id': 817, 'synset': 'pillow.n.01', 'synonyms': ['pillow'], 'def': 'a cushion to support the head of a sleeping person', 'name': 'pillow'}, {'frequency': 'r', 'id': 818, 'synset': 'pin.n.09', 'synonyms': ['pin_(non_jewelry)'], 'def': 'a small slender (often pointed) piece of wood or metal used to support or fasten or attach things', 'name': 'pin_(non_jewelry)'}, {'frequency': 'f', 'id': 819, 'synset': 'pineapple.n.02', 'synonyms': ['pineapple'], 'def': 'large sweet fleshy tropical fruit with a tuft of stiff leaves', 'name': 'pineapple'}, {'frequency': 'c', 'id': 820, 'synset': 'pinecone.n.01', 'synonyms': ['pinecone'], 'def': 'the seed-producing cone of a pine tree', 'name': 'pinecone'}, {'frequency': 'r', 'id': 821, 'synset': 'ping-pong_ball.n.01', 'synonyms': ['ping-pong_ball'], 'def': 'light hollow ball used in playing table tennis', 'name': 'ping-pong_ball'}, {'frequency': 'r', 'id': 822, 'synset': 'pinwheel.n.03', 'synonyms': ['pinwheel'], 'def': 'a toy consisting of vanes of colored paper or plastic that is pinned to a 
stick and spins when it is pointed into the wind', 'name': 'pinwheel'}, {'frequency': 'r', 'id': 823, 'synset': 'pipe.n.01', 'synonyms': ['tobacco_pipe'], 'def': 'a tube with a small bowl at one end; used for smoking tobacco', 'name': 'tobacco_pipe'}, {'frequency': 'f', 'id': 824, 'synset': 'pipe.n.02', 'synonyms': ['pipe', 'piping'], 'def': 'a long tube made of metal or plastic that is used to carry water or oil or gas etc.', 'name': 'pipe'}, {'frequency': 'r', 'id': 825, 'synset': 'pistol.n.01', 'synonyms': ['pistol', 'handgun'], 'def': 'a firearm that is held and fired with one hand', 'name': 'pistol'}, {'frequency': 'r', 'id': 826, 'synset': 'pita.n.01', 'synonyms': ['pita_(bread)', 'pocket_bread'], 'def': 'usually small round bread that can open into a pocket for filling', 'name': 'pita_(bread)'}, {'frequency': 'f', 'id': 827, 'synset': 'pitcher.n.02', 'synonyms': ['pitcher_(vessel_for_liquid)', 'ewer'], 'def': 'an open vessel with a handle and a spout for pouring', 'name': 'pitcher_(vessel_for_liquid)'}, {'frequency': 'r', 'id': 828, 'synset': 'pitchfork.n.01', 'synonyms': ['pitchfork'], 'def': 'a long-handled hand tool with sharp widely spaced prongs for lifting and pitching hay', 'name': 'pitchfork'}, {'frequency': 'f', 'id': 829, 'synset': 'pizza.n.01', 'synonyms': ['pizza'], 'def': 'Italian open pie made of thin bread dough spread with a spiced mixture of e.g. tomato sauce and cheese', 'name': 'pizza'}, {'frequency': 'f', 'id': 830, 'synset': 'place_mat.n.01', 'synonyms': ['place_mat'], 'def': 'a mat placed on a table for an individual place setting', 'name': 'place_mat'}, {'frequency': 'f', 'id': 831, 'synset': 'plate.n.04', 'synonyms': ['plate'], 'def': 'dish on which food is served or from which food is eaten', 'name': 'plate'}, {'frequency': 'c', 'id': 832, 'synset': 'platter.n.01', 'synonyms': ['platter'], 'def': 'a large shallow dish used for serving food', 'name': 'platter'}, {'frequency': 'r', 'id': 833, 'synset': 'playing_card.n.01', 'synonyms': ['playing_card'], 'def': 'one of a pack of cards that are used to play card games', 'name': 'playing_card'}, {'frequency': 'r', 'id': 834, 'synset': 'playpen.n.01', 'synonyms': ['playpen'], 'def': 'a portable enclosure in which babies may be left to play', 'name': 'playpen'}, {'frequency': 'c', 'id': 835, 'synset': 'pliers.n.01', 'synonyms': ['pliers', 'plyers'], 'def': 'a gripping hand tool with two hinged arms and (usually) serrated jaws', 'name': 'pliers'}, {'frequency': 'r', 'id': 836, 'synset': 'plow.n.01', 'synonyms': ['plow_(farm_equipment)', 'plough_(farm_equipment)'], 'def': 'a farm tool having one or more heavy blades to break the soil and cut a furrow prior to sowing', 'name': 'plow_(farm_equipment)'}, {'frequency': 'r', 'id': 837, 'synset': 'pocket_watch.n.01', 'synonyms': ['pocket_watch'], 'def': 'a watch that is carried in a small watch pocket', 'name': 'pocket_watch'}, {'frequency': 'c', 'id': 838, 'synset': 'pocketknife.n.01', 'synonyms': ['pocketknife'], 'def': 'a knife with a blade that folds into the handle; suitable for carrying in the pocket', 'name': 'pocketknife'}, {'frequency': 'c', 'id': 839, 'synset': 'poker.n.01', 'synonyms': ['poker_(fire_stirring_tool)', 'stove_poker', 'fire_hook'], 'def': 'fire iron consisting of a metal rod with a handle; used to stir a fire', 'name': 'poker_(fire_stirring_tool)'}, {'frequency': 'f', 'id': 840, 'synset': 'pole.n.01', 'synonyms': ['pole', 'post'], 'def': 'a long (usually round) rod of wood or metal or plastic', 'name': 'pole'}, {'frequency': 'r', 'id': 841, 'synset': 
'police_van.n.01', 'synonyms': ['police_van', 'police_wagon', 'paddy_wagon', 'patrol_wagon'], 'def': 'van used by police to transport prisoners', 'name': 'police_van'}, {'frequency': 'f', 'id': 842, 'synset': 'polo_shirt.n.01', 'synonyms': ['polo_shirt', 'sport_shirt'], 'def': 'a shirt with short sleeves designed for comfort and casual wear', 'name': 'polo_shirt'}, {'frequency': 'r', 'id': 843, 'synset': 'poncho.n.01', 'synonyms': ['poncho'], 'def': 'a blanket-like cloak with a hole in the center for the head', 'name': 'poncho'}, {'frequency': 'c', 'id': 844, 'synset': 'pony.n.05', 'synonyms': ['pony'], 'def': 'any of various breeds of small gentle horses usually less than five feet high at the shoulder', 'name': 'pony'}, {'frequency': 'r', 'id': 845, 'synset': 'pool_table.n.01', 'synonyms': ['pool_table', 'billiard_table', 'snooker_table'], 'def': 'game equipment consisting of a heavy table on which pool is played', 'name': 'pool_table'}, {'frequency': 'f', 'id': 846, 'synset': 'pop.n.02', 'synonyms': ['pop_(soda)', 'soda_(pop)', 'tonic', 'soft_drink'], 'def': 'a sweet drink containing carbonated water and flavoring', 'name': 'pop_(soda)'}, {'frequency': 'r', 'id': 847, 'synset': 'portrait.n.02', 'synonyms': ['portrait', 'portrayal'], 'def': 'any likeness of a person, in any medium', 'name': 'portrait'}, {'frequency': 'c', 'id': 848, 'synset': 'postbox.n.01', 'synonyms': ['postbox_(public)', 'mailbox_(public)'], 'def': 'public box for deposit of mail', 'name': 'postbox_(public)'}, {'frequency': 'c', 'id': 849, 'synset': 'postcard.n.01', 'synonyms': ['postcard', 'postal_card', 'mailing-card'], 'def': 'a card for sending messages by post without an envelope', 'name': 'postcard'}, {'frequency': 'f', 'id': 850, 'synset': 'poster.n.01', 'synonyms': ['poster', 'placard'], 'def': 'a sign posted in a public place as an advertisement', 'name': 'poster'}, {'frequency': 'f', 'id': 851, 'synset': 'pot.n.01', 'synonyms': ['pot'], 'def': 'metal or earthenware cooking vessel that is usually round and deep; often has a handle and lid', 'name': 'pot'}, {'frequency': 'f', 'id': 852, 'synset': 'pot.n.04', 'synonyms': ['flowerpot'], 'def': 'a container in which plants are cultivated', 'name': 'flowerpot'}, {'frequency': 'f', 'id': 853, 'synset': 'potato.n.01', 'synonyms': ['potato'], 'def': 'an edible tuber native to South America', 'name': 'potato'}, {'frequency': 'c', 'id': 854, 'synset': 'potholder.n.01', 'synonyms': ['potholder'], 'def': 'an insulated pad for holding hot pots', 'name': 'potholder'}, {'frequency': 'c', 'id': 855, 'synset': 'pottery.n.01', 'synonyms': ['pottery', 'clayware'], 'def': 'ceramic ware made from clay and baked in a kiln', 'name': 'pottery'}, {'frequency': 'c', 'id': 856, 'synset': 'pouch.n.01', 'synonyms': ['pouch'], 'def': 'a small or medium size container for holding or carrying things', 'name': 'pouch'}, {'frequency': 'r', 'id': 857, 'synset': 'power_shovel.n.01', 'synonyms': ['power_shovel', 'excavator', 'digger'], 'def': 'a machine for excavating', 'name': 'power_shovel'}, {'frequency': 'c', 'id': 858, 'synset': 'prawn.n.01', 'synonyms': ['prawn', 'shrimp'], 'def': 'any of various edible decapod crustaceans', 'name': 'prawn'}, {'frequency': 'f', 'id': 859, 'synset': 'printer.n.03', 'synonyms': ['printer', 'printing_machine'], 'def': 'a machine that prints', 'name': 'printer'}, {'frequency': 'c', 'id': 860, 'synset': 'projectile.n.01', 'synonyms': ['projectile_(weapon)', 'missile'], 'def': 'a weapon that is forcibly thrown or projected at a targets', 'name': 
'projectile_(weapon)'}, {'frequency': 'c', 'id': 861, 'synset': 'projector.n.02', 'synonyms': ['projector'], 'def': 'an optical instrument that projects an enlarged image onto a screen', 'name': 'projector'}, {'frequency': 'f', 'id': 862, 'synset': 'propeller.n.01', 'synonyms': ['propeller', 'propellor'], 'def': 'a mechanical device that rotates to push against air or water', 'name': 'propeller'}, {'frequency': 'r', 'id': 863, 'synset': 'prune.n.01', 'synonyms': ['prune'], 'def': 'dried plum', 'name': 'prune'}, {'frequency': 'r', 'id': 864, 'synset': 'pudding.n.01', 'synonyms': ['pudding'], 'def': 'any of various soft thick unsweetened baked dishes', 'name': 'pudding'}, {'frequency': 'r', 'id': 865, 'synset': 'puffer.n.02', 'synonyms': ['puffer_(fish)', 'pufferfish', 'blowfish', 'globefish'], 'def': 'fishes whose elongated spiny body can inflate itself with water or air to form a globe', 'name': 'puffer_(fish)'}, {'frequency': 'r', 'id': 866, 'synset': 'puffin.n.01', 'synonyms': ['puffin'], 'def': 'seabirds having short necks and brightly colored compressed bills', 'name': 'puffin'}, {'frequency': 'r', 'id': 867, 'synset': 'pug.n.01', 'synonyms': ['pug-dog'], 'def': 'small compact smooth-coated breed of Asiatic origin having a tightly curled tail and broad flat wrinkled muzzle', 'name': 'pug-dog'}, {'frequency': 'c', 'id': 868, 'synset': 'pumpkin.n.02', 'synonyms': ['pumpkin'], 'def': 'usually large pulpy deep-yellow round fruit of the squash family maturing in late summer or early autumn', 'name': 'pumpkin'}, {'frequency': 'r', 'id': 869, 'synset': 'punch.n.03', 'synonyms': ['puncher'], 'def': 'a tool for making holes or indentations', 'name': 'puncher'}, {'frequency': 'r', 'id': 870, 'synset': 'puppet.n.01', 'synonyms': ['puppet', 'marionette'], 'def': 'a small figure of a person operated from above with strings by a puppeteer', 'name': 'puppet'}, {'frequency': 'r', 'id': 871, 'synset': 'puppy.n.01', 'synonyms': ['puppy'], 'def': 'a young dog', 'name': 'puppy'}, {'frequency': 'r', 'id': 872, 'synset': 'quesadilla.n.01', 'synonyms': ['quesadilla'], 'def': 'a tortilla that is filled with cheese and heated', 'name': 'quesadilla'}, {'frequency': 'r', 'id': 873, 'synset': 'quiche.n.02', 'synonyms': ['quiche'], 'def': 'a tart filled with rich unsweetened custard; often contains other ingredients (as cheese or ham or seafood or vegetables)', 'name': 'quiche'}, {'frequency': 'f', 'id': 874, 'synset': 'quilt.n.01', 'synonyms': ['quilt', 'comforter'], 'def': 'bedding made of two layers of cloth filled with stuffing and stitched together', 'name': 'quilt'}, {'frequency': 'c', 'id': 875, 'synset': 'rabbit.n.01', 'synonyms': ['rabbit'], 'def': 'any of various burrowing animals of the family Leporidae having long ears and short tails', 'name': 'rabbit'}, {'frequency': 'r', 'id': 876, 'synset': 'racer.n.02', 'synonyms': ['race_car', 'racing_car'], 'def': 'a fast car that competes in races', 'name': 'race_car'}, {'frequency': 'c', 'id': 877, 'synset': 'racket.n.04', 'synonyms': ['racket', 'racquet'], 'def': 'a sports implement used to strike a ball in various games', 'name': 'racket'}, {'frequency': 'r', 'id': 878, 'synset': 'radar.n.01', 'synonyms': ['radar'], 'def': 'measuring instrument in which the echo of a pulse of microwave radiation is used to detect and locate distant objects', 'name': 'radar'}, {'frequency': 'c', 'id': 879, 'synset': 'radiator.n.03', 'synonyms': ['radiator'], 'def': 'a mechanism consisting of a metal honeycomb through which hot fluids circulate', 'name': 'radiator'}, 
{'frequency': 'c', 'id': 880, 'synset': 'radio_receiver.n.01', 'synonyms': ['radio_receiver', 'radio_set', 'radio', 'tuner_(radio)'], 'def': 'an electronic receiver that detects and demodulates and amplifies transmitted radio signals', 'name': 'radio_receiver'}, {'frequency': 'c', 'id': 881, 'synset': 'radish.n.03', 'synonyms': ['radish', 'daikon'], 'def': 'pungent edible root of any of various cultivated radish plants', 'name': 'radish'}, {'frequency': 'c', 'id': 882, 'synset': 'raft.n.01', 'synonyms': ['raft'], 'def': 'a flat float (usually made of logs or planks) that can be used for transport or as a platform for swimmers', 'name': 'raft'}, {'frequency': 'r', 'id': 883, 'synset': 'rag_doll.n.01', 'synonyms': ['rag_doll'], 'def': 'a cloth doll that is stuffed and (usually) painted', 'name': 'rag_doll'}, {'frequency': 'c', 'id': 884, 'synset': 'raincoat.n.01', 'synonyms': ['raincoat', 'waterproof_jacket'], 'def': 'a water-resistant coat', 'name': 'raincoat'}, {'frequency': 'c', 'id': 885, 'synset': 'ram.n.05', 'synonyms': ['ram_(animal)'], 'def': 'uncastrated adult male sheep', 'name': 'ram_(animal)'}, {'frequency': 'c', 'id': 886, 'synset': 'raspberry.n.02', 'synonyms': ['raspberry'], 'def': 'red or black edible aggregate berries usually smaller than the related blackberries', 'name': 'raspberry'}, {'frequency': 'r', 'id': 887, 'synset': 'rat.n.01', 'synonyms': ['rat'], 'def': 'any of various long-tailed rodents similar to but larger than a mouse', 'name': 'rat'}, {'frequency': 'c', 'id': 888, 'synset': 'razorblade.n.01', 'synonyms': ['razorblade'], 'def': 'a blade that has very sharp edge', 'name': 'razorblade'}, {'frequency': 'c', 'id': 889, 'synset': 'reamer.n.01', 'synonyms': ['reamer_(juicer)', 'juicer', 'juice_reamer'], 'def': 'a squeezer with a conical ridged center that is used for squeezing juice from citrus fruit', 'name': 'reamer_(juicer)'}, {'frequency': 'f', 'id': 890, 'synset': 'rearview_mirror.n.01', 'synonyms': ['rearview_mirror'], 'def': 'car mirror that reflects the view out of the rear window', 'name': 'rearview_mirror'}, {'frequency': 'c', 'id': 891, 'synset': 'receipt.n.02', 'synonyms': ['receipt'], 'def': 'an acknowledgment (usually tangible) that payment has been made', 'name': 'receipt'}, {'frequency': 'c', 'id': 892, 'synset': 'recliner.n.01', 'synonyms': ['recliner', 'reclining_chair', 'lounger_(chair)'], 'def': 'an armchair whose back can be lowered and foot can be raised to allow the sitter to recline in it', 'name': 'recliner'}, {'frequency': 'r', 'id': 893, 'synset': 'record_player.n.01', 'synonyms': ['record_player', 'phonograph_(record_player)', 'turntable'], 'def': 'machine in which rotating records cause a stylus to vibrate and the vibrations are amplified acoustically or electronically', 'name': 'record_player'}, {'frequency': 'r', 'id': 894, 'synset': 'red_cabbage.n.02', 'synonyms': ['red_cabbage'], 'def': 'compact head of purplish-red leaves', 'name': 'red_cabbage'}, {'frequency': 'f', 'id': 895, 'synset': 'reflector.n.01', 'synonyms': ['reflector'], 'def': 'device that reflects light, radiation, etc.', 'name': 'reflector'}, {'frequency': 'f', 'id': 896, 'synset': 'remote_control.n.01', 'synonyms': ['remote_control'], 'def': 'a device that can be used to control a machine or apparatus from a distance', 'name': 'remote_control'}, {'frequency': 'c', 'id': 897, 'synset': 'rhinoceros.n.01', 'synonyms': ['rhinoceros'], 'def': 'massive powerful herbivorous odd-toed ungulate of southeast Asia and Africa having very thick skin and one or two horns on the 
snout', 'name': 'rhinoceros'}, {'frequency': 'r', 'id': 898, 'synset': 'rib.n.03', 'synonyms': ['rib_(food)'], 'def': 'cut of meat including one or more ribs', 'name': 'rib_(food)'}, {'frequency': 'r', 'id': 899, 'synset': 'rifle.n.01', 'synonyms': ['rifle'], 'def': 'a shoulder firearm with a long barrel', 'name': 'rifle'}, {'frequency': 'f', 'id': 900, 'synset': 'ring.n.08', 'synonyms': ['ring'], 'def': 'jewelry consisting of a circlet of precious metal (often set with jewels) worn on the finger', 'name': 'ring'}, {'frequency': 'r', 'id': 901, 'synset': 'river_boat.n.01', 'synonyms': ['river_boat'], 'def': 'a boat used on rivers or to ply a river', 'name': 'river_boat'}, {'frequency': 'r', 'id': 902, 'synset': 'road_map.n.02', 'synonyms': ['road_map'], 'def': '(NOT A ROAD) a MAP showing roads (for automobile travel)', 'name': 'road_map'}, {'frequency': 'c', 'id': 903, 'synset': 'robe.n.01', 'synonyms': ['robe'], 'def': 'any loose flowing garment', 'name': 'robe'}, {'frequency': 'c', 'id': 904, 'synset': 'rocking_chair.n.01', 'synonyms': ['rocking_chair'], 'def': 'a chair mounted on rockers', 'name': 'rocking_chair'}, {'frequency': 'r', 'id': 905, 'synset': 'roller_skate.n.01', 'synonyms': ['roller_skate'], 'def': 'a shoe with pairs of rollers (small hard wheels) fixed to the sole', 'name': 'roller_skate'}, {'frequency': 'r', 'id': 906, 'synset': 'rollerblade.n.01', 'synonyms': ['Rollerblade'], 'def': 'an in-line variant of a roller skate', 'name': 'Rollerblade'}, {'frequency': 'c', 'id': 907, 'synset': 'rolling_pin.n.01', 'synonyms': ['rolling_pin'], 'def': 'utensil consisting of a cylinder (usually of wood) with a handle at each end; used to roll out dough', 'name': 'rolling_pin'}, {'frequency': 'r', 'id': 908, 'synset': 'root_beer.n.01', 'synonyms': ['root_beer'], 'def': 'carbonated drink containing extracts of roots and herbs', 'name': 'root_beer'}, {'frequency': 'c', 'id': 909, 'synset': 'router.n.02', 'synonyms': ['router_(computer_equipment)'], 'def': 'a device that forwards data packets between computer networks', 'name': 'router_(computer_equipment)'}, {'frequency': 'f', 'id': 910, 'synset': 'rubber_band.n.01', 'synonyms': ['rubber_band', 'elastic_band'], 'def': 'a narrow band of elastic rubber used to hold things (such as papers) together', 'name': 'rubber_band'}, {'frequency': 'c', 'id': 911, 'synset': 'runner.n.08', 'synonyms': ['runner_(carpet)'], 'def': 'a long narrow carpet', 'name': 'runner_(carpet)'}, {'frequency': 'f', 'id': 912, 'synset': 'sack.n.01', 'synonyms': ['plastic_bag', 'paper_bag'], 'def': "a bag made of paper or plastic for holding customer's purchases", 'name': 'plastic_bag'}, {'frequency': 'f', 'id': 913, 'synset': 'saddle.n.01', 'synonyms': ['saddle_(on_an_animal)'], 'def': 'a seat for the rider of a horse or camel', 'name': 'saddle_(on_an_animal)'}, {'frequency': 'f', 'id': 914, 'synset': 'saddle_blanket.n.01', 'synonyms': ['saddle_blanket', 'saddlecloth', 'horse_blanket'], 'def': 'stable gear consisting of a blanket placed under the saddle', 'name': 'saddle_blanket'}, {'frequency': 'c', 'id': 915, 'synset': 'saddlebag.n.01', 'synonyms': ['saddlebag'], 'def': 'a large bag (or pair of bags) hung over a saddle', 'name': 'saddlebag'}, {'frequency': 'r', 'id': 916, 'synset': 'safety_pin.n.01', 'synonyms': ['safety_pin'], 'def': 'a pin in the form of a clasp; has a guard so the point of the pin will not stick the user', 'name': 'safety_pin'}, {'frequency': 'c', 'id': 917, 'synset': 'sail.n.01', 'synonyms': ['sail'], 'def': 'a large piece of fabric by means of 
which wind is used to propel a sailing vessel', 'name': 'sail'}, {'frequency': 'c', 'id': 918, 'synset': 'salad.n.01', 'synonyms': ['salad'], 'def': 'food mixtures either arranged on a plate or tossed and served with a moist dressing; usually consisting of or including greens', 'name': 'salad'}, {'frequency': 'r', 'id': 919, 'synset': 'salad_plate.n.01', 'synonyms': ['salad_plate', 'salad_bowl'], 'def': 'a plate or bowl for individual servings of salad', 'name': 'salad_plate'}, {'frequency': 'r', 'id': 920, 'synset': 'salami.n.01', 'synonyms': ['salami'], 'def': 'highly seasoned fatty sausage of pork and beef usually dried', 'name': 'salami'}, {'frequency': 'r', 'id': 921, 'synset': 'salmon.n.01', 'synonyms': ['salmon_(fish)'], 'def': 'any of various large food and game fishes of northern waters', 'name': 'salmon_(fish)'}, {'frequency': 'r', 'id': 922, 'synset': 'salmon.n.03', 'synonyms': ['salmon_(food)'], 'def': 'flesh of any of various marine or freshwater fish of the family Salmonidae', 'name': 'salmon_(food)'}, {'frequency': 'r', 'id': 923, 'synset': 'salsa.n.01', 'synonyms': ['salsa'], 'def': 'spicy sauce of tomatoes and onions and chili peppers to accompany Mexican foods', 'name': 'salsa'}, {'frequency': 'f', 'id': 924, 'synset': 'saltshaker.n.01', 'synonyms': ['saltshaker'], 'def': 'a shaker with a perforated top for sprinkling salt', 'name': 'saltshaker'}, {'frequency': 'f', 'id': 925, 'synset': 'sandal.n.01', 'synonyms': ['sandal_(type_of_shoe)'], 'def': 'a shoe consisting of a sole fastened by straps to the foot', 'name': 'sandal_(type_of_shoe)'}, {'frequency': 'f', 'id': 926, 'synset': 'sandwich.n.01', 'synonyms': ['sandwich'], 'def': 'two (or more) slices of bread with a filling between them', 'name': 'sandwich'}, {'frequency': 'r', 'id': 927, 'synset': 'satchel.n.01', 'synonyms': ['satchel'], 'def': 'luggage consisting of a small case with a flat bottom and (usually) a shoulder strap', 'name': 'satchel'}, {'frequency': 'r', 'id': 928, 'synset': 'saucepan.n.01', 'synonyms': ['saucepan'], 'def': 'a deep pan with a handle; used for stewing or boiling', 'name': 'saucepan'}, {'frequency': 'f', 'id': 929, 'synset': 'saucer.n.02', 'synonyms': ['saucer'], 'def': 'a small shallow dish for holding a cup at the table', 'name': 'saucer'}, {'frequency': 'f', 'id': 930, 'synset': 'sausage.n.01', 'synonyms': ['sausage'], 'def': 'highly seasoned minced meat stuffed in casings', 'name': 'sausage'}, {'frequency': 'r', 'id': 931, 'synset': 'sawhorse.n.01', 'synonyms': ['sawhorse', 'sawbuck'], 'def': 'a framework for holding wood that is being sawed', 'name': 'sawhorse'}, {'frequency': 'r', 'id': 932, 'synset': 'sax.n.02', 'synonyms': ['saxophone'], 'def': "a wind instrument with a `J'-shaped form typically made of brass", 'name': 'saxophone'}, {'frequency': 'f', 'id': 933, 'synset': 'scale.n.07', 'synonyms': ['scale_(measuring_instrument)'], 'def': 'a measuring instrument for weighing; shows amount of mass', 'name': 'scale_(measuring_instrument)'}, {'frequency': 'r', 'id': 934, 'synset': 'scarecrow.n.01', 'synonyms': ['scarecrow', 'strawman'], 'def': 'an effigy in the shape of a man to frighten birds away from seeds', 'name': 'scarecrow'}, {'frequency': 'f', 'id': 935, 'synset': 'scarf.n.01', 'synonyms': ['scarf'], 'def': 'a garment worn around the head or neck or shoulders for warmth or decoration', 'name': 'scarf'}, {'frequency': 'c', 'id': 936, 'synset': 'school_bus.n.01', 'synonyms': ['school_bus'], 'def': 'a bus used to transport children to or from school', 'name': 'school_bus'}, 
{'frequency': 'f', 'id': 937, 'synset': 'scissors.n.01', 'synonyms': ['scissors'], 'def': 'a tool having two crossed pivoting blades with looped handles', 'name': 'scissors'}, {'frequency': 'c', 'id': 938, 'synset': 'scoreboard.n.01', 'synonyms': ['scoreboard'], 'def': 'a large board for displaying the score of a contest (and some other information)', 'name': 'scoreboard'}, {'frequency': 'c', 'id': 939, 'synset': 'scrambled_eggs.n.01', 'synonyms': ['scrambled_eggs'], 'def': 'eggs beaten and cooked to a soft firm consistency while stirring', 'name': 'scrambled_eggs'}, {'frequency': 'r', 'id': 940, 'synset': 'scraper.n.01', 'synonyms': ['scraper'], 'def': 'any of various hand tools for scraping', 'name': 'scraper'}, {'frequency': 'r', 'id': 941, 'synset': 'scratcher.n.03', 'synonyms': ['scratcher'], 'def': 'a device used for scratching', 'name': 'scratcher'}, {'frequency': 'c', 'id': 942, 'synset': 'screwdriver.n.01', 'synonyms': ['screwdriver'], 'def': 'a hand tool for driving screws; has a tip that fits into the head of a screw', 'name': 'screwdriver'}, {'frequency': 'c', 'id': 943, 'synset': 'scrub_brush.n.01', 'synonyms': ['scrubbing_brush'], 'def': 'a brush with short stiff bristles for heavy cleaning', 'name': 'scrubbing_brush'}, {'frequency': 'c', 'id': 944, 'synset': 'sculpture.n.01', 'synonyms': ['sculpture'], 'def': 'a three-dimensional work of art', 'name': 'sculpture'}, {'frequency': 'r', 'id': 945, 'synset': 'seabird.n.01', 'synonyms': ['seabird', 'seafowl'], 'def': 'a bird that frequents coastal waters and the open ocean: gulls; pelicans; gannets; cormorants; albatrosses; petrels; etc.', 'name': 'seabird'}, {'frequency': 'r', 'id': 946, 'synset': 'seahorse.n.02', 'synonyms': ['seahorse'], 'def': 'small fish with horse-like heads bent sharply downward and curled tails', 'name': 'seahorse'}, {'frequency': 'r', 'id': 947, 'synset': 'seaplane.n.01', 'synonyms': ['seaplane', 'hydroplane'], 'def': 'an airplane that can land on or take off from water', 'name': 'seaplane'}, {'frequency': 'c', 'id': 948, 'synset': 'seashell.n.01', 'synonyms': ['seashell'], 'def': 'the shell of a marine organism', 'name': 'seashell'}, {'frequency': 'r', 'id': 949, 'synset': 'seedling.n.01', 'synonyms': ['seedling'], 'def': 'young plant or tree grown from a seed', 'name': 'seedling'}, {'frequency': 'c', 'id': 950, 'synset': 'serving_dish.n.01', 'synonyms': ['serving_dish'], 'def': 'a dish used for serving food', 'name': 'serving_dish'}, {'frequency': 'r', 'id': 951, 'synset': 'sewing_machine.n.01', 'synonyms': ['sewing_machine'], 'def': 'a textile machine used as a home appliance for sewing', 'name': 'sewing_machine'}, {'frequency': 'r', 'id': 952, 'synset': 'shaker.n.03', 'synonyms': ['shaker'], 'def': 'a container in which something can be shaken', 'name': 'shaker'}, {'frequency': 'c', 'id': 953, 'synset': 'shampoo.n.01', 'synonyms': ['shampoo'], 'def': 'cleansing agent consisting of soaps or detergents used for washing the hair', 'name': 'shampoo'}, {'frequency': 'r', 'id': 954, 'synset': 'shark.n.01', 'synonyms': ['shark'], 'def': 'typically large carnivorous fishes with sharpe teeth', 'name': 'shark'}, {'frequency': 'r', 'id': 955, 'synset': 'sharpener.n.01', 'synonyms': ['sharpener'], 'def': 'any implement that is used to make something (an edge or a point) sharper', 'name': 'sharpener'}, {'frequency': 'r', 'id': 956, 'synset': 'sharpie.n.03', 'synonyms': ['Sharpie'], 'def': 'a pen with indelible ink that will write on any surface', 'name': 'Sharpie'}, {'frequency': 'r', 'id': 957, 'synset': 
'shaver.n.03', 'synonyms': ['shaver_(electric)', 'electric_shaver', 'electric_razor'], 'def': 'a razor powered by an electric motor', 'name': 'shaver_(electric)'}, {'frequency': 'c', 'id': 958, 'synset': 'shaving_cream.n.01', 'synonyms': ['shaving_cream', 'shaving_soap'], 'def': 'toiletry consisting that forms a rich lather for softening the beard before shaving', 'name': 'shaving_cream'}, {'frequency': 'r', 'id': 959, 'synset': 'shawl.n.01', 'synonyms': ['shawl'], 'def': 'cloak consisting of an oblong piece of cloth used to cover the head and shoulders', 'name': 'shawl'}, {'frequency': 'r', 'id': 960, 'synset': 'shears.n.01', 'synonyms': ['shears'], 'def': 'large scissors with strong blades', 'name': 'shears'}, {'frequency': 'f', 'id': 961, 'synset': 'sheep.n.01', 'synonyms': ['sheep'], 'def': 'woolly usually horned ruminant mammal related to the goat', 'name': 'sheep'}, {'frequency': 'r', 'id': 962, 'synset': 'shepherd_dog.n.01', 'synonyms': ['shepherd_dog', 'sheepdog'], 'def': 'any of various usually long-haired breeds of dog reared to herd and guard sheep', 'name': 'shepherd_dog'}, {'frequency': 'r', 'id': 963, 'synset': 'sherbert.n.01', 'synonyms': ['sherbert', 'sherbet'], 'def': 'a frozen dessert made primarily of fruit juice and sugar', 'name': 'sherbert'}, {'frequency': 'r', 'id': 964, 'synset': 'shield.n.02', 'synonyms': ['shield'], 'def': 'armor carried on the arm to intercept blows', 'name': 'shield'}, {'frequency': 'f', 'id': 965, 'synset': 'shirt.n.01', 'synonyms': ['shirt'], 'def': 'a garment worn on the upper half of the body', 'name': 'shirt'}, {'frequency': 'f', 'id': 966, 'synset': 'shoe.n.01', 'synonyms': ['shoe', 'sneaker_(type_of_shoe)', 'tennis_shoe'], 'def': 'common footwear covering the foot', 'name': 'shoe'}, {'frequency': 'c', 'id': 967, 'synset': 'shopping_bag.n.01', 'synonyms': ['shopping_bag'], 'def': 'a bag made of plastic or strong paper (often with handles); used to transport goods after shopping', 'name': 'shopping_bag'}, {'frequency': 'c', 'id': 968, 'synset': 'shopping_cart.n.01', 'synonyms': ['shopping_cart'], 'def': 'a handcart that holds groceries or other goods while shopping', 'name': 'shopping_cart'}, {'frequency': 'f', 'id': 969, 'synset': 'short_pants.n.01', 'synonyms': ['short_pants', 'shorts_(clothing)', 'trunks_(clothing)'], 'def': 'trousers that end at or above the knee', 'name': 'short_pants'}, {'frequency': 'r', 'id': 970, 'synset': 'shot_glass.n.01', 'synonyms': ['shot_glass'], 'def': 'a small glass adequate to hold a single swallow of whiskey', 'name': 'shot_glass'}, {'frequency': 'c', 'id': 971, 'synset': 'shoulder_bag.n.01', 'synonyms': ['shoulder_bag'], 'def': 'a large handbag that can be carried by a strap looped over the shoulder', 'name': 'shoulder_bag'}, {'frequency': 'c', 'id': 972, 'synset': 'shovel.n.01', 'synonyms': ['shovel'], 'def': 'a hand tool for lifting loose material such as snow, dirt, etc.', 'name': 'shovel'}, {'frequency': 'f', 'id': 973, 'synset': 'shower.n.01', 'synonyms': ['shower_head'], 'def': 'a plumbing fixture that sprays water over you', 'name': 'shower_head'}, {'frequency': 'f', 'id': 974, 'synset': 'shower_curtain.n.01', 'synonyms': ['shower_curtain'], 'def': 'a curtain that keeps water from splashing out of the shower area', 'name': 'shower_curtain'}, {'frequency': 'r', 'id': 975, 'synset': 'shredder.n.01', 'synonyms': ['shredder_(for_paper)'], 'def': 'a device that shreds documents', 'name': 'shredder_(for_paper)'}, {'frequency': 'r', 'id': 976, 'synset': 'sieve.n.01', 'synonyms': ['sieve', 
'screen_(sieve)'], 'def': 'a strainer for separating lumps from powdered material or grading particles', 'name': 'sieve'}, {'frequency': 'f', 'id': 977, 'synset': 'signboard.n.01', 'synonyms': ['signboard'], 'def': 'structure displaying a board on which advertisements can be posted', 'name': 'signboard'}, {'frequency': 'c', 'id': 978, 'synset': 'silo.n.01', 'synonyms': ['silo'], 'def': 'a cylindrical tower used for storing goods', 'name': 'silo'}, {'frequency': 'f', 'id': 979, 'synset': 'sink.n.01', 'synonyms': ['sink'], 'def': 'plumbing fixture consisting of a water basin fixed to a wall or floor and having a drainpipe', 'name': 'sink'}, {'frequency': 'f', 'id': 980, 'synset': 'skateboard.n.01', 'synonyms': ['skateboard'], 'def': 'a board with wheels that is ridden in a standing or crouching position and propelled by foot', 'name': 'skateboard'}, {'frequency': 'c', 'id': 981, 'synset': 'skewer.n.01', 'synonyms': ['skewer'], 'def': 'a long pin for holding meat in position while it is being roasted', 'name': 'skewer'}, {'frequency': 'f', 'id': 982, 'synset': 'ski.n.01', 'synonyms': ['ski'], 'def': 'sports equipment for skiing on snow', 'name': 'ski'}, {'frequency': 'f', 'id': 983, 'synset': 'ski_boot.n.01', 'synonyms': ['ski_boot'], 'def': 'a stiff boot that is fastened to a ski with a ski binding', 'name': 'ski_boot'}, {'frequency': 'f', 'id': 984, 'synset': 'ski_parka.n.01', 'synonyms': ['ski_parka', 'ski_jacket'], 'def': 'a parka to be worn while skiing', 'name': 'ski_parka'}, {'frequency': 'f', 'id': 985, 'synset': 'ski_pole.n.01', 'synonyms': ['ski_pole'], 'def': 'a pole with metal points used as an aid in skiing', 'name': 'ski_pole'}, {'frequency': 'f', 'id': 986, 'synset': 'skirt.n.02', 'synonyms': ['skirt'], 'def': 'a garment hanging from the waist; worn mainly by girls and women', 'name': 'skirt'}, {'frequency': 'c', 'id': 987, 'synset': 'sled.n.01', 'synonyms': ['sled', 'sledge', 'sleigh'], 'def': 'a vehicle or flat object for transportation over snow by sliding or pulled by dogs, etc.', 'name': 'sled'}, {'frequency': 'c', 'id': 988, 'synset': 'sleeping_bag.n.01', 'synonyms': ['sleeping_bag'], 'def': 'large padded bag designed to be slept in outdoors', 'name': 'sleeping_bag'}, {'frequency': 'r', 'id': 989, 'synset': 'sling.n.05', 'synonyms': ['sling_(bandage)', 'triangular_bandage'], 'def': 'bandage to support an injured forearm; slung over the shoulder or neck', 'name': 'sling_(bandage)'}, {'frequency': 'c', 'id': 990, 'synset': 'slipper.n.01', 'synonyms': ['slipper_(footwear)', 'carpet_slipper_(footwear)'], 'def': 'low footwear that can be slipped on and off easily; usually worn indoors', 'name': 'slipper_(footwear)'}, {'frequency': 'r', 'id': 991, 'synset': 'smoothie.n.02', 'synonyms': ['smoothie'], 'def': 'a thick smooth drink consisting of fresh fruit pureed with ice cream or yoghurt or milk', 'name': 'smoothie'}, {'frequency': 'r', 'id': 992, 'synset': 'snake.n.01', 'synonyms': ['snake', 'serpent'], 'def': 'limbless scaly elongate reptile; some are venomous', 'name': 'snake'}, {'frequency': 'f', 'id': 993, 'synset': 'snowboard.n.01', 'synonyms': ['snowboard'], 'def': 'a board that resembles a broad ski or a small surfboard; used in a standing position to slide down snow-covered slopes', 'name': 'snowboard'}, {'frequency': 'c', 'id': 994, 'synset': 'snowman.n.01', 'synonyms': ['snowman'], 'def': 'a figure of a person made of packed snow', 'name': 'snowman'}, {'frequency': 'c', 'id': 995, 'synset': 'snowmobile.n.01', 'synonyms': ['snowmobile'], 'def': 'tracked vehicle for 
travel on snow having skis in front', 'name': 'snowmobile'}, {'frequency': 'f', 'id': 996, 'synset': 'soap.n.01', 'synonyms': ['soap'], 'def': 'a cleansing agent made from the salts of vegetable or animal fats', 'name': 'soap'}, {'frequency': 'f', 'id': 997, 'synset': 'soccer_ball.n.01', 'synonyms': ['soccer_ball'], 'def': "an inflated ball used in playing soccer (called `football' outside of the United States)", 'name': 'soccer_ball'}, {'frequency': 'f', 'id': 998, 'synset': 'sock.n.01', 'synonyms': ['sock'], 'def': 'cloth covering for the foot; worn inside the shoe; reaches to between the ankle and the knee', 'name': 'sock'}, {'frequency': 'r', 'id': 999, 'synset': 'soda_fountain.n.02', 'synonyms': ['soda_fountain'], 'def': 'an apparatus for dispensing soda water', 'name': 'soda_fountain'}, {'frequency': 'r', 'id': 1000, 'synset': 'soda_water.n.01', 'synonyms': ['carbonated_water', 'club_soda', 'seltzer', 'sparkling_water'], 'def': 'effervescent beverage artificially charged with carbon dioxide', 'name': 'carbonated_water'}, {'frequency': 'f', 'id': 1001, 'synset': 'sofa.n.01', 'synonyms': ['sofa', 'couch', 'lounge'], 'def': 'an upholstered seat for more than one person', 'name': 'sofa'}, {'frequency': 'r', 'id': 1002, 'synset': 'softball.n.01', 'synonyms': ['softball'], 'def': 'ball used in playing softball', 'name': 'softball'}, {'frequency': 'c', 'id': 1003, 'synset': 'solar_array.n.01', 'synonyms': ['solar_array', 'solar_battery', 'solar_panel'], 'def': 'electrical device consisting of a large array of connected solar cells', 'name': 'solar_array'}, {'frequency': 'r', 'id': 1004, 'synset': 'sombrero.n.02', 'synonyms': ['sombrero'], 'def': 'a straw hat with a tall crown and broad brim; worn in American southwest and in Mexico', 'name': 'sombrero'}, {'frequency': 'c', 'id': 1005, 'synset': 'soup.n.01', 'synonyms': ['soup'], 'def': 'liquid food especially of meat or fish or vegetable stock often containing pieces of solid food', 'name': 'soup'}, {'frequency': 'r', 'id': 1006, 'synset': 'soup_bowl.n.01', 'synonyms': ['soup_bowl'], 'def': 'a bowl for serving soup', 'name': 'soup_bowl'}, {'frequency': 'c', 'id': 1007, 'synset': 'soupspoon.n.01', 'synonyms': ['soupspoon'], 'def': 'a spoon with a rounded bowl for eating soup', 'name': 'soupspoon'}, {'frequency': 'c', 'id': 1008, 'synset': 'sour_cream.n.01', 'synonyms': ['sour_cream', 'soured_cream'], 'def': 'soured light cream', 'name': 'sour_cream'}, {'frequency': 'r', 'id': 1009, 'synset': 'soya_milk.n.01', 'synonyms': ['soya_milk', 'soybean_milk', 'soymilk'], 'def': 'a milk substitute containing soybean flour and water; used in some infant formulas and in making tofu', 'name': 'soya_milk'}, {'frequency': 'r', 'id': 1010, 'synset': 'space_shuttle.n.01', 'synonyms': ['space_shuttle'], 'def': "a reusable spacecraft with wings for a controlled descent through the Earth's atmosphere", 'name': 'space_shuttle'}, {'frequency': 'r', 'id': 1011, 'synset': 'sparkler.n.02', 'synonyms': ['sparkler_(fireworks)'], 'def': 'a firework that burns slowly and throws out a shower of sparks', 'name': 'sparkler_(fireworks)'}, {'frequency': 'f', 'id': 1012, 'synset': 'spatula.n.02', 'synonyms': ['spatula'], 'def': 'a hand tool with a thin flexible blade used to mix or spread soft substances', 'name': 'spatula'}, {'frequency': 'r', 'id': 1013, 'synset': 'spear.n.01', 'synonyms': ['spear', 'lance'], 'def': 'a long pointed rod used as a tool or weapon', 'name': 'spear'}, {'frequency': 'f', 'id': 1014, 'synset': 'spectacles.n.01', 'synonyms': ['spectacles', 'specs', 
'eyeglasses', 'glasses'], 'def': 'optical instrument consisting of a frame that holds a pair of lenses for correcting defective vision', 'name': 'spectacles'}, {'frequency': 'c', 'id': 1015, 'synset': 'spice_rack.n.01', 'synonyms': ['spice_rack'], 'def': 'a rack for displaying containers filled with spices', 'name': 'spice_rack'}, {'frequency': 'r', 'id': 1016, 'synset': 'spider.n.01', 'synonyms': ['spider'], 'def': 'predatory arachnid with eight legs, two poison fangs, two feelers, and usually two silk-spinning organs at the back end of the body', 'name': 'spider'}, {'frequency': 'c', 'id': 1017, 'synset': 'sponge.n.01', 'synonyms': ['sponge'], 'def': 'a porous mass usable to absorb water typically used for cleaning', 'name': 'sponge'}, {'frequency': 'f', 'id': 1018, 'synset': 'spoon.n.01', 'synonyms': ['spoon'], 'def': 'a piece of cutlery with a shallow bowl-shaped container and a handle', 'name': 'spoon'}, {'frequency': 'c', 'id': 1019, 'synset': 'sportswear.n.01', 'synonyms': ['sportswear', 'athletic_wear', 'activewear'], 'def': 'attire worn for sport or for casual wear', 'name': 'sportswear'}, {'frequency': 'c', 'id': 1020, 'synset': 'spotlight.n.02', 'synonyms': ['spotlight'], 'def': 'a lamp that produces a strong beam of light to illuminate a restricted area; used to focus attention of a stage performer', 'name': 'spotlight'}, {'frequency': 'r', 'id': 1021, 'synset': 'squirrel.n.01', 'synonyms': ['squirrel'], 'def': 'a kind of arboreal rodent having a long bushy tail', 'name': 'squirrel'}, {'frequency': 'c', 'id': 1022, 'synset': 'stapler.n.01', 'synonyms': ['stapler_(stapling_machine)'], 'def': 'a machine that inserts staples into sheets of paper in order to fasten them together', 'name': 'stapler_(stapling_machine)'}, {'frequency': 'r', 'id': 1023, 'synset': 'starfish.n.01', 'synonyms': ['starfish', 'sea_star'], 'def': 'echinoderms characterized by five arms extending from a central disk', 'name': 'starfish'}, {'frequency': 'f', 'id': 1024, 'synset': 'statue.n.01', 'synonyms': ['statue_(sculpture)'], 'def': 'a sculpture representing a human or animal', 'name': 'statue_(sculpture)'}, {'frequency': 'c', 'id': 1025, 'synset': 'steak.n.01', 'synonyms': ['steak_(food)'], 'def': 'a slice of meat cut from the fleshy part of an animal or large fish', 'name': 'steak_(food)'}, {'frequency': 'r', 'id': 1026, 'synset': 'steak_knife.n.01', 'synonyms': ['steak_knife'], 'def': 'a sharp table knife used in eating steak', 'name': 'steak_knife'}, {'frequency': 'r', 'id': 1027, 'synset': 'steamer.n.02', 'synonyms': ['steamer_(kitchen_appliance)'], 'def': 'a cooking utensil that can be used to cook food by steaming it', 'name': 'steamer_(kitchen_appliance)'}, {'frequency': 'f', 'id': 1028, 'synset': 'steering_wheel.n.01', 'synonyms': ['steering_wheel'], 'def': 'a handwheel that is used for steering', 'name': 'steering_wheel'}, {'frequency': 'r', 'id': 1029, 'synset': 'stencil.n.01', 'synonyms': ['stencil'], 'def': 'a sheet of material (metal, plastic, etc.) 
that has been perforated with a pattern; ink or paint can pass through the perforations to create the printed pattern on the surface below', 'name': 'stencil'}, {'frequency': 'r', 'id': 1030, 'synset': 'step_ladder.n.01', 'synonyms': ['stepladder'], 'def': 'a folding portable ladder hinged at the top', 'name': 'stepladder'}, {'frequency': 'c', 'id': 1031, 'synset': 'step_stool.n.01', 'synonyms': ['step_stool'], 'def': 'a stool that has one or two steps that fold under the seat', 'name': 'step_stool'}, {'frequency': 'c', 'id': 1032, 'synset': 'stereo.n.01', 'synonyms': ['stereo_(sound_system)'], 'def': 'electronic device for playing audio', 'name': 'stereo_(sound_system)'}, {'frequency': 'r', 'id': 1033, 'synset': 'stew.n.02', 'synonyms': ['stew'], 'def': 'food prepared by stewing especially meat or fish with vegetables', 'name': 'stew'}, {'frequency': 'r', 'id': 1034, 'synset': 'stirrer.n.02', 'synonyms': ['stirrer'], 'def': 'an implement used for stirring', 'name': 'stirrer'}, {'frequency': 'f', 'id': 1035, 'synset': 'stirrup.n.01', 'synonyms': ['stirrup'], 'def': "support consisting of metal loops into which rider's feet go", 'name': 'stirrup'}, {'frequency': 'c', 'id': 1036, 'synset': 'stocking.n.01', 'synonyms': ['stockings_(leg_wear)'], 'def': 'close-fitting hosiery to cover the foot and leg; come in matched pairs', 'name': 'stockings_(leg_wear)'}, {'frequency': 'f', 'id': 1037, 'synset': 'stool.n.01', 'synonyms': ['stool'], 'def': 'a simple seat without a back or arms', 'name': 'stool'}, {'frequency': 'f', 'id': 1038, 'synset': 'stop_sign.n.01', 'synonyms': ['stop_sign'], 'def': 'a traffic sign to notify drivers that they must come to a complete stop', 'name': 'stop_sign'}, {'frequency': 'f', 'id': 1039, 'synset': 'stoplight.n.01', 'synonyms': ['brake_light'], 'def': 'a red light on the rear of a motor vehicle that signals when the brakes are applied', 'name': 'brake_light'}, {'frequency': 'f', 'id': 1040, 'synset': 'stove.n.01', 'synonyms': ['stove', 'kitchen_stove', 'range_(kitchen_appliance)', 'kitchen_range', 'cooking_stove'], 'def': 'a kitchen appliance used for cooking food', 'name': 'stove'}, {'frequency': 'c', 'id': 1041, 'synset': 'strainer.n.01', 'synonyms': ['strainer'], 'def': 'a filter to retain larger pieces while smaller pieces and liquids pass through', 'name': 'strainer'}, {'frequency': 'f', 'id': 1042, 'synset': 'strap.n.01', 'synonyms': ['strap'], 'def': 'an elongated strip of material for binding things together or holding', 'name': 'strap'}, {'frequency': 'f', 'id': 1043, 'synset': 'straw.n.04', 'synonyms': ['straw_(for_drinking)', 'drinking_straw'], 'def': 'a thin paper or plastic tube used to suck liquids into the mouth', 'name': 'straw_(for_drinking)'}, {'frequency': 'f', 'id': 1044, 'synset': 'strawberry.n.01', 'synonyms': ['strawberry'], 'def': 'sweet fleshy red fruit', 'name': 'strawberry'}, {'frequency': 'f', 'id': 1045, 'synset': 'street_sign.n.01', 'synonyms': ['street_sign'], 'def': 'a sign visible from the street', 'name': 'street_sign'}, {'frequency': 'f', 'id': 1046, 'synset': 'streetlight.n.01', 'synonyms': ['streetlight', 'street_lamp'], 'def': 'a lamp supported on a lamppost; for illuminating a street', 'name': 'streetlight'}, {'frequency': 'r', 'id': 1047, 'synset': 'string_cheese.n.01', 'synonyms': ['string_cheese'], 'def': 'cheese formed in long strings twisted together', 'name': 'string_cheese'}, {'frequency': 'r', 'id': 1048, 'synset': 'stylus.n.02', 'synonyms': ['stylus'], 'def': 'a pointed tool for writing or drawing or engraving', 'name': 
'stylus'}, {'frequency': 'r', 'id': 1049, 'synset': 'subwoofer.n.01', 'synonyms': ['subwoofer'], 'def': 'a loudspeaker that is designed to reproduce very low bass frequencies', 'name': 'subwoofer'}, {'frequency': 'r', 'id': 1050, 'synset': 'sugar_bowl.n.01', 'synonyms': ['sugar_bowl'], 'def': 'a dish in which sugar is served', 'name': 'sugar_bowl'}, {'frequency': 'r', 'id': 1051, 'synset': 'sugarcane.n.01', 'synonyms': ['sugarcane_(plant)'], 'def': 'juicy canes whose sap is a source of molasses and commercial sugar; fresh canes are sometimes chewed for the juice', 'name': 'sugarcane_(plant)'}, {'frequency': 'c', 'id': 1052, 'synset': 'suit.n.01', 'synonyms': ['suit_(clothing)'], 'def': 'a set of garments (usually including a jacket and trousers or skirt) for outerwear all of the same fabric and color', 'name': 'suit_(clothing)'}, {'frequency': 'c', 'id': 1053, 'synset': 'sunflower.n.01', 'synonyms': ['sunflower'], 'def': 'any plant of the genus Helianthus having large flower heads with dark disk florets and showy yellow rays', 'name': 'sunflower'}, {'frequency': 'f', 'id': 1054, 'synset': 'sunglasses.n.01', 'synonyms': ['sunglasses'], 'def': 'spectacles that are darkened or polarized to protect the eyes from the glare of the sun', 'name': 'sunglasses'}, {'frequency': 'c', 'id': 1055, 'synset': 'sunhat.n.01', 'synonyms': ['sunhat'], 'def': 'a hat with a broad brim that protects the face from direct exposure to the sun', 'name': 'sunhat'}, {'frequency': 'r', 'id': 1056, 'synset': 'sunscreen.n.01', 'synonyms': ['sunscreen', 'sunblock'], 'def': 'a cream spread on the skin; contains a chemical to filter out ultraviolet light and so protect from sunburn', 'name': 'sunscreen'}, {'frequency': 'f', 'id': 1057, 'synset': 'surfboard.n.01', 'synonyms': ['surfboard'], 'def': 'a narrow buoyant board for riding surf', 'name': 'surfboard'}, {'frequency': 'c', 'id': 1058, 'synset': 'sushi.n.01', 'synonyms': ['sushi'], 'def': 'rice (with raw fish) wrapped in seaweed', 'name': 'sushi'}, {'frequency': 'c', 'id': 1059, 'synset': 'swab.n.02', 'synonyms': ['mop'], 'def': 'cleaning implement consisting of absorbent material fastened to a handle; for cleaning floors', 'name': 'mop'}, {'frequency': 'c', 'id': 1060, 'synset': 'sweat_pants.n.01', 'synonyms': ['sweat_pants'], 'def': 'loose-fitting trousers with elastic cuffs; worn by athletes', 'name': 'sweat_pants'}, {'frequency': 'c', 'id': 1061, 'synset': 'sweatband.n.02', 'synonyms': ['sweatband'], 'def': 'a band of material tied around the forehead or wrist to absorb sweat', 'name': 'sweatband'}, {'frequency': 'f', 'id': 1062, 'synset': 'sweater.n.01', 'synonyms': ['sweater'], 'def': 'a crocheted or knitted garment covering the upper part of the body', 'name': 'sweater'}, {'frequency': 'f', 'id': 1063, 'synset': 'sweatshirt.n.01', 'synonyms': ['sweatshirt'], 'def': 'cotton knit pullover with long sleeves worn during athletic activity', 'name': 'sweatshirt'}, {'frequency': 'c', 'id': 1064, 'synset': 'sweet_potato.n.02', 'synonyms': ['sweet_potato'], 'def': 'the edible tuberous root of the sweet potato vine', 'name': 'sweet_potato'}, {'frequency': 'f', 'id': 1065, 'synset': 'swimsuit.n.01', 'synonyms': ['swimsuit', 'swimwear', 'bathing_suit', 'swimming_costume', 'bathing_costume', 'swimming_trunks', 'bathing_trunks'], 'def': 'garment worn for swimming', 'name': 'swimsuit'}, {'frequency': 'c', 'id': 1066, 'synset': 'sword.n.01', 'synonyms': ['sword'], 'def': 'a cutting or thrusting weapon that has a long metal blade', 'name': 'sword'}, {'frequency': 'r', 'id': 1067, 
'synset': 'syringe.n.01', 'synonyms': ['syringe'], 'def': 'a medical instrument used to inject or withdraw fluids', 'name': 'syringe'}, {'frequency': 'r', 'id': 1068, 'synset': 'tabasco.n.02', 'synonyms': ['Tabasco_sauce'], 'def': 'very spicy sauce (trade name Tabasco) made from fully-aged red peppers', 'name': 'Tabasco_sauce'}, {'frequency': 'r', 'id': 1069, 'synset': 'table-tennis_table.n.01', 'synonyms': ['table-tennis_table', 'ping-pong_table'], 'def': 'a table used for playing table tennis', 'name': 'table-tennis_table'}, {'frequency': 'f', 'id': 1070, 'synset': 'table.n.02', 'synonyms': ['table'], 'def': 'a piece of furniture having a smooth flat top that is usually supported by one or more vertical legs', 'name': 'table'}, {'frequency': 'c', 'id': 1071, 'synset': 'table_lamp.n.01', 'synonyms': ['table_lamp'], 'def': 'a lamp that sits on a table', 'name': 'table_lamp'}, {'frequency': 'f', 'id': 1072, 'synset': 'tablecloth.n.01', 'synonyms': ['tablecloth'], 'def': 'a covering spread over a dining table', 'name': 'tablecloth'}, {'frequency': 'r', 'id': 1073, 'synset': 'tachometer.n.01', 'synonyms': ['tachometer'], 'def': 'measuring instrument for indicating speed of rotation', 'name': 'tachometer'}, {'frequency': 'r', 'id': 1074, 'synset': 'taco.n.02', 'synonyms': ['taco'], 'def': 'a small tortilla cupped around a filling', 'name': 'taco'}, {'frequency': 'f', 'id': 1075, 'synset': 'tag.n.02', 'synonyms': ['tag'], 'def': 'a label associated with something for the purpose of identification or information', 'name': 'tag'}, {'frequency': 'f', 'id': 1076, 'synset': 'taillight.n.01', 'synonyms': ['taillight', 'rear_light'], 'def': 'lamp (usually red) mounted at the rear of a motor vehicle', 'name': 'taillight'}, {'frequency': 'r', 'id': 1077, 'synset': 'tambourine.n.01', 'synonyms': ['tambourine'], 'def': 'a shallow drum with a single drumhead and with metallic disks in the sides', 'name': 'tambourine'}, {'frequency': 'r', 'id': 1078, 'synset': 'tank.n.01', 'synonyms': ['army_tank', 'armored_combat_vehicle', 'armoured_combat_vehicle'], 'def': 'an enclosed armored military vehicle; has a cannon and moves on caterpillar treads', 'name': 'army_tank'}, {'frequency': 'c', 'id': 1079, 'synset': 'tank.n.02', 'synonyms': ['tank_(storage_vessel)', 'storage_tank'], 'def': 'a large (usually metallic) vessel for holding gases or liquids', 'name': 'tank_(storage_vessel)'}, {'frequency': 'f', 'id': 1080, 'synset': 'tank_top.n.01', 'synonyms': ['tank_top_(clothing)'], 'def': 'a tight-fitting sleeveless shirt with wide shoulder straps and low neck and no front opening', 'name': 'tank_top_(clothing)'}, {'frequency': 'c', 'id': 1081, 'synset': 'tape.n.01', 'synonyms': ['tape_(sticky_cloth_or_paper)'], 'def': 'a long thin piece of cloth or paper as used for binding or fastening', 'name': 'tape_(sticky_cloth_or_paper)'}, {'frequency': 'c', 'id': 1082, 'synset': 'tape.n.04', 'synonyms': ['tape_measure', 'measuring_tape'], 'def': 'measuring instrument consisting of a narrow strip (cloth or metal) marked in inches or centimeters and used for measuring lengths', 'name': 'tape_measure'}, {'frequency': 'c', 'id': 1083, 'synset': 'tapestry.n.02', 'synonyms': ['tapestry'], 'def': 'a heavy textile with a woven design; used for curtains and upholstery', 'name': 'tapestry'}, {'frequency': 'f', 'id': 1084, 'synset': 'tarpaulin.n.01', 'synonyms': ['tarp'], 'def': 'waterproofed canvas', 'name': 'tarp'}, {'frequency': 'c', 'id': 1085, 'synset': 'tartan.n.01', 'synonyms': ['tartan', 'plaid'], 'def': 'a cloth having a 
crisscross design', 'name': 'tartan'}, {'frequency': 'c', 'id': 1086, 'synset': 'tassel.n.01', 'synonyms': ['tassel'], 'def': 'adornment consisting of a bunch of cords fastened at one end', 'name': 'tassel'}, {'frequency': 'r', 'id': 1087, 'synset': 'tea_bag.n.01', 'synonyms': ['tea_bag'], 'def': 'a measured amount of tea in a bag for an individual serving of tea', 'name': 'tea_bag'}, {'frequency': 'c', 'id': 1088, 'synset': 'teacup.n.02', 'synonyms': ['teacup'], 'def': 'a cup from which tea is drunk', 'name': 'teacup'}, {'frequency': 'c', 'id': 1089, 'synset': 'teakettle.n.01', 'synonyms': ['teakettle'], 'def': 'kettle for boiling water to make tea', 'name': 'teakettle'}, {'frequency': 'c', 'id': 1090, 'synset': 'teapot.n.01', 'synonyms': ['teapot'], 'def': 'pot for brewing tea; usually has a spout and handle', 'name': 'teapot'}, {'frequency': 'f', 'id': 1091, 'synset': 'teddy.n.01', 'synonyms': ['teddy_bear'], 'def': "plaything consisting of a child's toy bear (usually plush and stuffed with soft materials)", 'name': 'teddy_bear'}, {'frequency': 'f', 'id': 1092, 'synset': 'telephone.n.01', 'synonyms': ['telephone', 'phone', 'telephone_set'], 'def': 'electronic device for communicating by voice over long distances', 'name': 'telephone'}, {'frequency': 'c', 'id': 1093, 'synset': 'telephone_booth.n.01', 'synonyms': ['telephone_booth', 'phone_booth', 'call_box', 'telephone_box', 'telephone_kiosk'], 'def': 'booth for using a telephone', 'name': 'telephone_booth'}, {'frequency': 'f', 'id': 1094, 'synset': 'telephone_pole.n.01', 'synonyms': ['telephone_pole', 'telegraph_pole', 'telegraph_post'], 'def': 'tall pole supporting telephone wires', 'name': 'telephone_pole'}, {'frequency': 'r', 'id': 1095, 'synset': 'telephoto_lens.n.01', 'synonyms': ['telephoto_lens', 'zoom_lens'], 'def': 'a camera lens that magnifies the image', 'name': 'telephoto_lens'}, {'frequency': 'c', 'id': 1096, 'synset': 'television_camera.n.01', 'synonyms': ['television_camera', 'tv_camera'], 'def': 'television equipment for capturing and recording video', 'name': 'television_camera'}, {'frequency': 'f', 'id': 1097, 'synset': 'television_receiver.n.01', 'synonyms': ['television_set', 'tv', 'tv_set'], 'def': 'an electronic device that receives television signals and displays them on a screen', 'name': 'television_set'}, {'frequency': 'f', 'id': 1098, 'synset': 'tennis_ball.n.01', 'synonyms': ['tennis_ball'], 'def': 'ball about the size of a fist used in playing tennis', 'name': 'tennis_ball'}, {'frequency': 'f', 'id': 1099, 'synset': 'tennis_racket.n.01', 'synonyms': ['tennis_racket'], 'def': 'a racket used to play tennis', 'name': 'tennis_racket'}, {'frequency': 'r', 'id': 1100, 'synset': 'tequila.n.01', 'synonyms': ['tequila'], 'def': 'Mexican liquor made from fermented juices of an agave plant', 'name': 'tequila'}, {'frequency': 'c', 'id': 1101, 'synset': 'thermometer.n.01', 'synonyms': ['thermometer'], 'def': 'measuring instrument for measuring temperature', 'name': 'thermometer'}, {'frequency': 'c', 'id': 1102, 'synset': 'thermos.n.01', 'synonyms': ['thermos_bottle'], 'def': 'vacuum flask that preserves temperature of hot or cold drinks', 'name': 'thermos_bottle'}, {'frequency': 'c', 'id': 1103, 'synset': 'thermostat.n.01', 'synonyms': ['thermostat'], 'def': 'a regulator for automatically regulating temperature by starting or stopping the supply of heat', 'name': 'thermostat'}, {'frequency': 'r', 'id': 1104, 'synset': 'thimble.n.02', 'synonyms': ['thimble'], 'def': 'a small metal cap to protect the finger while sewing; 
can be used as a small container', 'name': 'thimble'}, {'frequency': 'c', 'id': 1105, 'synset': 'thread.n.01', 'synonyms': ['thread', 'yarn'], 'def': 'a fine cord of twisted fibers (of cotton or silk or wool or nylon etc.) used in sewing and weaving', 'name': 'thread'}, {'frequency': 'c', 'id': 1106, 'synset': 'thumbtack.n.01', 'synonyms': ['thumbtack', 'drawing_pin', 'pushpin'], 'def': 'a tack for attaching papers to a bulletin board or drawing board', 'name': 'thumbtack'}, {'frequency': 'c', 'id': 1107, 'synset': 'tiara.n.01', 'synonyms': ['tiara'], 'def': 'a jeweled headdress worn by women on formal occasions', 'name': 'tiara'}, {'frequency': 'c', 'id': 1108, 'synset': 'tiger.n.02', 'synonyms': ['tiger'], 'def': 'large feline of forests in most of Asia having a tawny coat with black stripes', 'name': 'tiger'}, {'frequency': 'c', 'id': 1109, 'synset': 'tights.n.01', 'synonyms': ['tights_(clothing)', 'leotards'], 'def': 'skintight knit hose covering the body from the waist to the feet worn by acrobats and dancers and as stockings by women and girls', 'name': 'tights_(clothing)'}, {'frequency': 'c', 'id': 1110, 'synset': 'timer.n.01', 'synonyms': ['timer', 'stopwatch'], 'def': 'a timepiece that measures a time interval and signals its end', 'name': 'timer'}, {'frequency': 'f', 'id': 1111, 'synset': 'tinfoil.n.01', 'synonyms': ['tinfoil'], 'def': 'foil made of tin or an alloy of tin and lead', 'name': 'tinfoil'}, {'frequency': 'r', 'id': 1112, 'synset': 'tinsel.n.01', 'synonyms': ['tinsel'], 'def': 'a showy decoration that is basically valueless', 'name': 'tinsel'}, {'frequency': 'f', 'id': 1113, 'synset': 'tissue.n.02', 'synonyms': ['tissue_paper'], 'def': 'a soft thin (usually translucent) paper', 'name': 'tissue_paper'}, {'frequency': 'c', 'id': 1114, 'synset': 'toast.n.01', 'synonyms': ['toast_(food)'], 'def': 'slice of bread that has been toasted', 'name': 'toast_(food)'}, {'frequency': 'f', 'id': 1115, 'synset': 'toaster.n.02', 'synonyms': ['toaster'], 'def': 'a kitchen appliance (usually electric) for toasting bread', 'name': 'toaster'}, {'frequency': 'c', 'id': 1116, 'synset': 'toaster_oven.n.01', 'synonyms': ['toaster_oven'], 'def': 'kitchen appliance consisting of a small electric oven for toasting or warming food', 'name': 'toaster_oven'}, {'frequency': 'f', 'id': 1117, 'synset': 'toilet.n.02', 'synonyms': ['toilet'], 'def': 'a plumbing fixture for defecation and urination', 'name': 'toilet'}, {'frequency': 'f', 'id': 1118, 'synset': 'toilet_tissue.n.01', 'synonyms': ['toilet_tissue', 'toilet_paper', 'bathroom_tissue'], 'def': 'a soft thin absorbent paper for use in toilets', 'name': 'toilet_tissue'}, {'frequency': 'f', 'id': 1119, 'synset': 'tomato.n.01', 'synonyms': ['tomato'], 'def': 'mildly acid red or yellow pulpy fruit eaten as a vegetable', 'name': 'tomato'}, {'frequency': 'c', 'id': 1120, 'synset': 'tongs.n.01', 'synonyms': ['tongs'], 'def': 'any of various devices for taking hold of objects; usually have two hinged legs with handles above and pointed hooks below', 'name': 'tongs'}, {'frequency': 'c', 'id': 1121, 'synset': 'toolbox.n.01', 'synonyms': ['toolbox'], 'def': 'a box or chest or cabinet for holding hand tools', 'name': 'toolbox'}, {'frequency': 'f', 'id': 1122, 'synset': 'toothbrush.n.01', 'synonyms': ['toothbrush'], 'def': 'small brush; has long handle; used to clean teeth', 'name': 'toothbrush'}, {'frequency': 'f', 'id': 1123, 'synset': 'toothpaste.n.01', 'synonyms': ['toothpaste'], 'def': 'a dentifrice in the form of a paste', 'name': 'toothpaste'}, 
{'frequency': 'c', 'id': 1124, 'synset': 'toothpick.n.01', 'synonyms': ['toothpick'], 'def': 'pick consisting of a small strip of wood or plastic; used to pick food from between the teeth', 'name': 'toothpick'}, {'frequency': 'c', 'id': 1125, 'synset': 'top.n.09', 'synonyms': ['cover'], 'def': 'covering for a hole (especially a hole in the top of a container)', 'name': 'cover'}, {'frequency': 'c', 'id': 1126, 'synset': 'tortilla.n.01', 'synonyms': ['tortilla'], 'def': 'thin unleavened pancake made from cornmeal or wheat flour', 'name': 'tortilla'}, {'frequency': 'c', 'id': 1127, 'synset': 'tow_truck.n.01', 'synonyms': ['tow_truck'], 'def': 'a truck equipped to hoist and pull wrecked cars (or to remove cars from no-parking zones)', 'name': 'tow_truck'}, {'frequency': 'f', 'id': 1128, 'synset': 'towel.n.01', 'synonyms': ['towel'], 'def': 'a rectangular piece of absorbent cloth (or paper) for drying or wiping', 'name': 'towel'}, {'frequency': 'f', 'id': 1129, 'synset': 'towel_rack.n.01', 'synonyms': ['towel_rack', 'towel_rail', 'towel_bar'], 'def': 'a rack consisting of one or more bars on which towels can be hung', 'name': 'towel_rack'}, {'frequency': 'f', 'id': 1130, 'synset': 'toy.n.03', 'synonyms': ['toy'], 'def': 'a device regarded as providing amusement', 'name': 'toy'}, {'frequency': 'c', 'id': 1131, 'synset': 'tractor.n.01', 'synonyms': ['tractor_(farm_equipment)'], 'def': 'a wheeled vehicle with large wheels; used in farming and other applications', 'name': 'tractor_(farm_equipment)'}, {'frequency': 'f', 'id': 1132, 'synset': 'traffic_light.n.01', 'synonyms': ['traffic_light'], 'def': 'a device to control vehicle traffic often consisting of three or more lights', 'name': 'traffic_light'}, {'frequency': 'r', 'id': 1133, 'synset': 'trail_bike.n.01', 'synonyms': ['dirt_bike'], 'def': 'a lightweight motorcycle equipped with rugged tires and suspension for off-road use', 'name': 'dirt_bike'}, {'frequency': 'c', 'id': 1134, 'synset': 'trailer_truck.n.01', 'synonyms': ['trailer_truck', 'tractor_trailer', 'trucking_rig', 'articulated_lorry', 'semi_truck'], 'def': 'a truck consisting of a tractor and trailer together', 'name': 'trailer_truck'}, {'frequency': 'f', 'id': 1135, 'synset': 'train.n.01', 'synonyms': ['train_(railroad_vehicle)', 'railroad_train'], 'def': 'public or private transport provided by a line of railway cars coupled together and drawn by a locomotive', 'name': 'train_(railroad_vehicle)'}, {'frequency': 'r', 'id': 1136, 'synset': 'trampoline.n.01', 'synonyms': ['trampoline'], 'def': 'gymnastic apparatus consisting of a strong canvas sheet attached with springs to a metal frame', 'name': 'trampoline'}, {'frequency': 'f', 'id': 1137, 'synset': 'tray.n.01', 'synonyms': ['tray'], 'def': 'an open receptacle for holding or displaying or serving articles or food', 'name': 'tray'}, {'frequency': 'r', 'id': 1138, 'synset': 'tree_house.n.01', 'synonyms': ['tree_house'], 'def': '(NOT A TREE) a PLAYHOUSE built in the branches of a tree', 'name': 'tree_house'}, {'frequency': 'r', 'id': 1139, 'synset': 'trench_coat.n.01', 'synonyms': ['trench_coat'], 'def': 'a military style raincoat; belted with deep pockets', 'name': 'trench_coat'}, {'frequency': 'r', 'id': 1140, 'synset': 'triangle.n.05', 'synonyms': ['triangle_(musical_instrument)'], 'def': 'a percussion instrument consisting of a metal bar bent in the shape of an open triangle', 'name': 'triangle_(musical_instrument)'}, {'frequency': 'r', 'id': 1141, 'synset': 'tricycle.n.01', 'synonyms': ['tricycle'], 'def': 'a vehicle with three 
wheels that is moved by foot pedals', 'name': 'tricycle'}, {'frequency': 'c', 'id': 1142, 'synset': 'tripod.n.01', 'synonyms': ['tripod'], 'def': 'a three-legged rack used for support', 'name': 'tripod'}, {'frequency': 'f', 'id': 1143, 'synset': 'trouser.n.01', 'synonyms': ['trousers', 'pants_(clothing)'], 'def': 'a garment extending from the waist to the knee or ankle, covering each leg separately', 'name': 'trousers'}, {'frequency': 'f', 'id': 1144, 'synset': 'truck.n.01', 'synonyms': ['truck'], 'def': 'an automotive vehicle suitable for hauling', 'name': 'truck'}, {'frequency': 'r', 'id': 1145, 'synset': 'truffle.n.03', 'synonyms': ['truffle_(chocolate)', 'chocolate_truffle'], 'def': 'creamy chocolate candy', 'name': 'truffle_(chocolate)'}, {'frequency': 'c', 'id': 1146, 'synset': 'trunk.n.02', 'synonyms': ['trunk'], 'def': 'luggage consisting of a large strong case used when traveling or for storage', 'name': 'trunk'}, {'frequency': 'r', 'id': 1147, 'synset': 'tub.n.02', 'synonyms': ['vat'], 'def': 'a large open vessel for holding or storing liquids', 'name': 'vat'}, {'frequency': 'c', 'id': 1148, 'synset': 'turban.n.01', 'synonyms': ['turban'], 'def': 'a traditional headdress consisting of a long scarf wrapped around the head', 'name': 'turban'}, {'frequency': 'r', 'id': 1149, 'synset': 'turkey.n.01', 'synonyms': ['turkey_(bird)'], 'def': 'large gallinaceous bird with fan-shaped tail; widely domesticated for food', 'name': 'turkey_(bird)'}, {'frequency': 'c', 'id': 1150, 'synset': 'turkey.n.04', 'synonyms': ['turkey_(food)'], 'def': 'flesh of large domesticated fowl usually roasted', 'name': 'turkey_(food)'}, {'frequency': 'r', 'id': 1151, 'synset': 'turnip.n.01', 'synonyms': ['turnip'], 'def': 'widely cultivated plant having a large fleshy edible white or yellow root', 'name': 'turnip'}, {'frequency': 'c', 'id': 1152, 'synset': 'turtle.n.02', 'synonyms': ['turtle'], 'def': 'any of various aquatic and land reptiles having a bony shell and flipper-like limbs for swimming', 'name': 'turtle'}, {'frequency': 'r', 'id': 1153, 'synset': 'turtleneck.n.01', 'synonyms': ['turtleneck_(clothing)', 'polo-neck'], 'def': 'a sweater or jersey with a high close-fitting collar', 'name': 'turtleneck_(clothing)'}, {'frequency': 'r', 'id': 1154, 'synset': 'typewriter.n.01', 'synonyms': ['typewriter'], 'def': 'hand-operated character printer for printing written messages one character at a time', 'name': 'typewriter'}, {'frequency': 'f', 'id': 1155, 'synset': 'umbrella.n.01', 'synonyms': ['umbrella'], 'def': 'a lightweight handheld collapsible canopy', 'name': 'umbrella'}, {'frequency': 'c', 'id': 1156, 'synset': 'underwear.n.01', 'synonyms': ['underwear', 'underclothes', 'underclothing', 'underpants'], 'def': 'undergarment worn next to the skin and under the outer garments', 'name': 'underwear'}, {'frequency': 'r', 'id': 1157, 'synset': 'unicycle.n.01', 'synonyms': ['unicycle'], 'def': 'a vehicle with a single wheel that is driven by pedals', 'name': 'unicycle'}, {'frequency': 'c', 'id': 1158, 'synset': 'urinal.n.01', 'synonyms': ['urinal'], 'def': 'a plumbing fixture (usually attached to the wall) used by men to urinate', 'name': 'urinal'}, {'frequency': 'r', 'id': 1159, 'synset': 'urn.n.01', 'synonyms': ['urn'], 'def': 'a large vase that usually has a pedestal or feet', 'name': 'urn'}, {'frequency': 'c', 'id': 1160, 'synset': 'vacuum.n.04', 'synonyms': ['vacuum_cleaner'], 'def': 'an electrical home appliance that cleans by suction', 'name': 'vacuum_cleaner'}, {'frequency': 'c', 'id': 1161, 'synset': 
'valve.n.03', 'synonyms': ['valve'], 'def': 'control consisting of a mechanical device for controlling the flow of a fluid', 'name': 'valve'}, {'frequency': 'f', 'id': 1162, 'synset': 'vase.n.01', 'synonyms': ['vase'], 'def': 'an open jar of glass or porcelain used as an ornament or to hold flowers', 'name': 'vase'}, {'frequency': 'c', 'id': 1163, 'synset': 'vending_machine.n.01', 'synonyms': ['vending_machine'], 'def': 'a slot machine for selling goods', 'name': 'vending_machine'}, {'frequency': 'f', 'id': 1164, 'synset': 'vent.n.01', 'synonyms': ['vent', 'blowhole', 'air_vent'], 'def': 'a hole for the escape of gas or air', 'name': 'vent'}, {'frequency': 'c', 'id': 1165, 'synset': 'videotape.n.01', 'synonyms': ['videotape'], 'def': 'a video recording made on magnetic tape', 'name': 'videotape'}, {'frequency': 'r', 'id': 1166, 'synset': 'vinegar.n.01', 'synonyms': ['vinegar'], 'def': 'sour-tasting liquid produced usually by oxidation of the alcohol in wine or cider and used as a condiment or food preservative', 'name': 'vinegar'}, {'frequency': 'r', 'id': 1167, 'synset': 'violin.n.01', 'synonyms': ['violin', 'fiddle'], 'def': 'bowed stringed instrument that is the highest member of the violin family', 'name': 'violin'}, {'frequency': 'r', 'id': 1168, 'synset': 'vodka.n.01', 'synonyms': ['vodka'], 'def': 'unaged colorless liquor originating in Russia', 'name': 'vodka'}, {'frequency': 'r', 'id': 1169, 'synset': 'volleyball.n.02', 'synonyms': ['volleyball'], 'def': 'an inflated ball used in playing volleyball', 'name': 'volleyball'}, {'frequency': 'r', 'id': 1170, 'synset': 'vulture.n.01', 'synonyms': ['vulture'], 'def': 'any of various large birds of prey having naked heads and weak claws and feeding chiefly on carrion', 'name': 'vulture'}, {'frequency': 'c', 'id': 1171, 'synset': 'waffle.n.01', 'synonyms': ['waffle'], 'def': 'pancake batter baked in a waffle iron', 'name': 'waffle'}, {'frequency': 'r', 'id': 1172, 'synset': 'waffle_iron.n.01', 'synonyms': ['waffle_iron'], 'def': 'a kitchen appliance for baking waffles', 'name': 'waffle_iron'}, {'frequency': 'c', 'id': 1173, 'synset': 'wagon.n.01', 'synonyms': ['wagon'], 'def': 'any of various kinds of wheeled vehicles drawn by an animal or a tractor', 'name': 'wagon'}, {'frequency': 'c', 'id': 1174, 'synset': 'wagon_wheel.n.01', 'synonyms': ['wagon_wheel'], 'def': 'a wheel of a wagon', 'name': 'wagon_wheel'}, {'frequency': 'c', 'id': 1175, 'synset': 'walking_stick.n.01', 'synonyms': ['walking_stick'], 'def': 'a stick carried in the hand for support in walking', 'name': 'walking_stick'}, {'frequency': 'c', 'id': 1176, 'synset': 'wall_clock.n.01', 'synonyms': ['wall_clock'], 'def': 'a clock mounted on a wall', 'name': 'wall_clock'}, {'frequency': 'f', 'id': 1177, 'synset': 'wall_socket.n.01', 'synonyms': ['wall_socket', 'wall_plug', 'electric_outlet', 'electrical_outlet', 'outlet', 'electric_receptacle'], 'def': 'receptacle providing a place in a wiring system where current can be taken to run electrical devices', 'name': 'wall_socket'}, {'frequency': 'c', 'id': 1178, 'synset': 'wallet.n.01', 'synonyms': ['wallet', 'billfold'], 'def': 'a pocket-size case for holding papers and paper money', 'name': 'wallet'}, {'frequency': 'r', 'id': 1179, 'synset': 'walrus.n.01', 'synonyms': ['walrus'], 'def': 'either of two large northern marine mammals having ivory tusks and tough hide over thick blubber', 'name': 'walrus'}, {'frequency': 'r', 'id': 1180, 'synset': 'wardrobe.n.01', 'synonyms': ['wardrobe'], 'def': 'a tall piece of furniture that provides 
storage space for clothes; has a door and rails or hooks for hanging clothes', 'name': 'wardrobe'}, {'frequency': 'r', 'id': 1181, 'synset': 'wasabi.n.02', 'synonyms': ['wasabi'], 'def': 'the thick green root of the wasabi plant that the Japanese use in cooking and that tastes like strong horseradish', 'name': 'wasabi'}, {'frequency': 'c', 'id': 1182, 'synset': 'washer.n.03', 'synonyms': ['automatic_washer', 'washing_machine'], 'def': 'a home appliance for washing clothes and linens automatically', 'name': 'automatic_washer'}, {'frequency': 'f', 'id': 1183, 'synset': 'watch.n.01', 'synonyms': ['watch', 'wristwatch'], 'def': 'a small, portable timepiece', 'name': 'watch'}, {'frequency': 'f', 'id': 1184, 'synset': 'water_bottle.n.01', 'synonyms': ['water_bottle'], 'def': 'a bottle for holding water', 'name': 'water_bottle'}, {'frequency': 'c', 'id': 1185, 'synset': 'water_cooler.n.01', 'synonyms': ['water_cooler'], 'def': 'a device for cooling and dispensing drinking water', 'name': 'water_cooler'}, {'frequency': 'c', 'id': 1186, 'synset': 'water_faucet.n.01', 'synonyms': ['water_faucet', 'water_tap', 'tap_(water_faucet)'], 'def': 'a faucet for drawing water from a pipe or cask', 'name': 'water_faucet'}, {'frequency': 'r', 'id': 1187, 'synset': 'water_filter.n.01', 'synonyms': ['water_filter'], 'def': 'a filter to remove impurities from the water supply', 'name': 'water_filter'}, {'frequency': 'r', 'id': 1188, 'synset': 'water_heater.n.01', 'synonyms': ['water_heater', 'hot-water_heater'], 'def': 'a heater and storage tank to supply heated water', 'name': 'water_heater'}, {'frequency': 'r', 'id': 1189, 'synset': 'water_jug.n.01', 'synonyms': ['water_jug'], 'def': 'a jug that holds water', 'name': 'water_jug'}, {'frequency': 'r', 'id': 1190, 'synset': 'water_pistol.n.01', 'synonyms': ['water_gun', 'squirt_gun'], 'def': 'plaything consisting of a toy pistol that squirts water', 'name': 'water_gun'}, {'frequency': 'c', 'id': 1191, 'synset': 'water_scooter.n.01', 'synonyms': ['water_scooter', 'sea_scooter', 'jet_ski'], 'def': 'a motorboat resembling a motor scooter (NOT A SURFBOARD OR WATER SKI)', 'name': 'water_scooter'}, {'frequency': 'c', 'id': 1192, 'synset': 'water_ski.n.01', 'synonyms': ['water_ski'], 'def': 'broad ski for skimming over water towed by a speedboat (DO NOT MARK WATER)', 'name': 'water_ski'}, {'frequency': 'c', 'id': 1193, 'synset': 'water_tower.n.01', 'synonyms': ['water_tower'], 'def': 'a large reservoir for water', 'name': 'water_tower'}, {'frequency': 'c', 'id': 1194, 'synset': 'watering_can.n.01', 'synonyms': ['watering_can'], 'def': 'a container with a handle and a spout with a perforated nozzle; used to sprinkle water over plants', 'name': 'watering_can'}, {'frequency': 'c', 'id': 1195, 'synset': 'watermelon.n.02', 'synonyms': ['watermelon'], 'def': 'large oblong or roundish melon with a hard green rind and sweet watery red or occasionally yellowish pulp', 'name': 'watermelon'}, {'frequency': 'f', 'id': 1196, 'synset': 'weathervane.n.01', 'synonyms': ['weathervane', 'vane_(weathervane)', 'wind_vane'], 'def': 'mechanical device attached to an elevated structure; rotates freely to show the direction of the wind', 'name': 'weathervane'}, {'frequency': 'c', 'id': 1197, 'synset': 'webcam.n.01', 'synonyms': ['webcam'], 'def': 'a digital camera designed to take digital photographs and transmit them over the internet', 'name': 'webcam'}, {'frequency': 'c', 'id': 1198, 'synset': 'wedding_cake.n.01', 'synonyms': ['wedding_cake', 'bridecake'], 'def': 'a rich cake with two or more 
tiers and covered with frosting and decorations; served at a wedding reception', 'name': 'wedding_cake'}, {'frequency': 'c', 'id': 1199, 'synset': 'wedding_ring.n.01', 'synonyms': ['wedding_ring', 'wedding_band'], 'def': 'a ring given to the bride and/or groom at the wedding', 'name': 'wedding_ring'}, {'frequency': 'f', 'id': 1200, 'synset': 'wet_suit.n.01', 'synonyms': ['wet_suit'], 'def': 'a close-fitting garment made of a permeable material; worn in cold water to retain body heat', 'name': 'wet_suit'}, {'frequency': 'f', 'id': 1201, 'synset': 'wheel.n.01', 'synonyms': ['wheel'], 'def': 'a circular frame with spokes (or a solid disc) that can rotate on a shaft or axle', 'name': 'wheel'}, {'frequency': 'c', 'id': 1202, 'synset': 'wheelchair.n.01', 'synonyms': ['wheelchair'], 'def': 'a movable chair mounted on large wheels', 'name': 'wheelchair'}, {'frequency': 'c', 'id': 1203, 'synset': 'whipped_cream.n.01', 'synonyms': ['whipped_cream'], 'def': 'cream that has been beaten until light and fluffy', 'name': 'whipped_cream'}, {'frequency': 'r', 'id': 1204, 'synset': 'whiskey.n.01', 'synonyms': ['whiskey'], 'def': 'a liquor made from fermented mash of grain', 'name': 'whiskey'}, {'frequency': 'r', 'id': 1205, 'synset': 'whistle.n.03', 'synonyms': ['whistle'], 'def': 'a small wind instrument that produces a whistling sound by blowing into it', 'name': 'whistle'}, {'frequency': 'r', 'id': 1206, 'synset': 'wick.n.02', 'synonyms': ['wick'], 'def': 'a loosely woven cord in a candle or oil lamp that is lit on fire', 'name': 'wick'}, {'frequency': 'c', 'id': 1207, 'synset': 'wig.n.01', 'synonyms': ['wig'], 'def': 'hairpiece covering the head and made of real or synthetic hair', 'name': 'wig'}, {'frequency': 'c', 'id': 1208, 'synset': 'wind_chime.n.01', 'synonyms': ['wind_chime'], 'def': 'a decorative arrangement of pieces of metal or glass or pottery that hang together loosely so the wind can cause them to tinkle', 'name': 'wind_chime'}, {'frequency': 'c', 'id': 1209, 'synset': 'windmill.n.01', 'synonyms': ['windmill'], 'def': 'a mill that is powered by the wind', 'name': 'windmill'}, {'frequency': 'c', 'id': 1210, 'synset': 'window_box.n.01', 'synonyms': ['window_box_(for_plants)'], 'def': 'a container for growing plants on a windowsill', 'name': 'window_box_(for_plants)'}, {'frequency': 'f', 'id': 1211, 'synset': 'windshield_wiper.n.01', 'synonyms': ['windshield_wiper', 'windscreen_wiper', 'wiper_(for_windshield/screen)'], 'def': 'a mechanical device that cleans the windshield', 'name': 'windshield_wiper'}, {'frequency': 'c', 'id': 1212, 'synset': 'windsock.n.01', 'synonyms': ['windsock', 'air_sock', 'air-sleeve', 'wind_sleeve', 'wind_cone'], 'def': 'a truncated cloth cone mounted on a mast/pole; shows wind direction', 'name': 'windsock'}, {'frequency': 'f', 'id': 1213, 'synset': 'wine_bottle.n.01', 'synonyms': ['wine_bottle'], 'def': 'a bottle for holding wine', 'name': 'wine_bottle'}, {'frequency': 'r', 'id': 1214, 'synset': 'wine_bucket.n.01', 'synonyms': ['wine_bucket', 'wine_cooler'], 'def': 'a bucket of ice used to chill a bottle of wine', 'name': 'wine_bucket'}, {'frequency': 'f', 'id': 1215, 'synset': 'wineglass.n.01', 'synonyms': ['wineglass'], 'def': 'a glass that has a stem and in which wine is served', 'name': 'wineglass'}, {'frequency': 'r', 'id': 1216, 'synset': 'wing_chair.n.01', 'synonyms': ['wing_chair'], 'def': 'easy chair having wings on each side of a high back', 'name': 'wing_chair'}, {'frequency': 'c', 'id': 1217, 'synset': 'winker.n.02', 'synonyms': ['blinder_(for_horses)'], 
'def': 'blinds that prevent a horse from seeing something on either side', 'name': 'blinder_(for_horses)'}, {'frequency': 'c', 'id': 1218, 'synset': 'wok.n.01', 'synonyms': ['wok'], 'def': 'pan with a convex bottom; used for frying in Chinese cooking', 'name': 'wok'}, {'frequency': 'r', 'id': 1219, 'synset': 'wolf.n.01', 'synonyms': ['wolf'], 'def': 'a wild carnivorous mammal of the dog family, living and hunting in packs', 'name': 'wolf'}, {'frequency': 'c', 'id': 1220, 'synset': 'wooden_spoon.n.02', 'synonyms': ['wooden_spoon'], 'def': 'a spoon made of wood', 'name': 'wooden_spoon'}, {'frequency': 'c', 'id': 1221, 'synset': 'wreath.n.01', 'synonyms': ['wreath'], 'def': 'an arrangement of flowers, leaves, or stems fastened in a ring', 'name': 'wreath'}, {'frequency': 'c', 'id': 1222, 'synset': 'wrench.n.03', 'synonyms': ['wrench', 'spanner'], 'def': 'a hand tool that is used to hold or twist a nut or bolt', 'name': 'wrench'}, {'frequency': 'c', 'id': 1223, 'synset': 'wristband.n.01', 'synonyms': ['wristband'], 'def': 'band consisting of a part of a sleeve that covers the wrist', 'name': 'wristband'}, {'frequency': 'f', 'id': 1224, 'synset': 'wristlet.n.01', 'synonyms': ['wristlet', 'wrist_band'], 'def': 'a band or bracelet worn around the wrist', 'name': 'wristlet'}, {'frequency': 'r', 'id': 1225, 'synset': 'yacht.n.01', 'synonyms': ['yacht'], 'def': 'an expensive vessel propelled by sail or power and used for cruising or racing', 'name': 'yacht'}, {'frequency': 'r', 'id': 1226, 'synset': 'yak.n.02', 'synonyms': ['yak'], 'def': 'large long-haired wild ox of Tibet often domesticated', 'name': 'yak'}, {'frequency': 'c', 'id': 1227, 'synset': 'yogurt.n.01', 'synonyms': ['yogurt', 'yoghurt', 'yoghourt'], 'def': 'a custard-like food made from curdled milk', 'name': 'yogurt'}, {'frequency': 'r', 'id': 1228, 'synset': 'yoke.n.07', 'synonyms': ['yoke_(animal_equipment)'], 'def': 'gear joining two animals at the neck; NOT egg yolk', 'name': 'yoke_(animal_equipment)'}, {'frequency': 'f', 'id': 1229, 'synset': 'zebra.n.01', 'synonyms': ['zebra'], 'def': 'any of several fleet black-and-white striped African equines', 'name': 'zebra'}, {'frequency': 'c', 'id': 1230, 'synset': 'zucchini.n.02', 'synonyms': ['zucchini', 'courgette'], 'def': 'small cucumber-shaped vegetable marrow; typically dark green', 'name': 'zucchini'}] # noqa +# fmt: on diff --git a/detectron2/data/datasets/pascal_voc.py b/detectron2/data/datasets/pascal_voc.py new file mode 100644 index 0000000000000000000000000000000000000000..5872d96575b428e90b29a7759a2f7b32dcc15d25 --- /dev/null +++ b/detectron2/data/datasets/pascal_voc.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import numpy as np +import os +import xml.etree.ElementTree as ET +from fvcore.common.file_io import PathManager + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures import BoxMode + +__all__ = ["register_pascal_voc"] + + +# fmt: off +CLASS_NAMES = [ + "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", + "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", + "pottedplant", "sheep", "sofa", "train", "tvmonitor", +] +# fmt: on + + +def load_voc_instances(dirname: str, split: str): + """ + Load Pascal VOC detection annotations to Detectron2 format. 
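+
+    A minimal usage sketch; the directory below is an illustrative placeholder
+    for a standard VOC download, not a path shipped with detectron2::
+
+        dicts = load_voc_instances("datasets/VOC2012", "trainval")
+        # each dict has "file_name", "image_id", "height", "width", "annotations"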
+ + Args: + dirname: Contain "Annotations", "ImageSets", "JPEGImages" + split (str): one of "train", "test", "val", "trainval" + """ + with PathManager.open(os.path.join(dirname, "ImageSets", "Main", split + ".txt")) as f: + fileids = np.loadtxt(f, dtype=np.str) + + # Needs to read many small annotation files. Makes sense at local + annotation_dirname = PathManager.get_local_path(os.path.join(dirname, "Annotations/")) + dicts = [] + for fileid in fileids: + anno_file = os.path.join(annotation_dirname, fileid + ".xml") + jpeg_file = os.path.join(dirname, "JPEGImages", fileid + ".jpg") + + with PathManager.open(anno_file) as f: + tree = ET.parse(f) + + r = { + "file_name": jpeg_file, + "image_id": fileid, + "height": int(tree.findall("./size/height")[0].text), + "width": int(tree.findall("./size/width")[0].text), + } + instances = [] + + for obj in tree.findall("object"): + cls = obj.find("name").text + # We include "difficult" samples in training. + # Based on limited experiments, they don't hurt accuracy. + # difficult = int(obj.find("difficult").text) + # if difficult == 1: + # continue + bbox = obj.find("bndbox") + bbox = [float(bbox.find(x).text) for x in ["xmin", "ymin", "xmax", "ymax"]] + # Original annotations are integers in the range [1, W or H] + # Assuming they mean 1-based pixel indices (inclusive), + # a box with annotation (xmin=1, xmax=W) covers the whole image. + # In coordinate space this is represented by (xmin=0, xmax=W) + bbox[0] -= 1.0 + bbox[1] -= 1.0 + instances.append( + {"category_id": CLASS_NAMES.index(cls), "bbox": bbox, "bbox_mode": BoxMode.XYXY_ABS} + ) + r["annotations"] = instances + dicts.append(r) + return dicts + + +def register_pascal_voc(name, dirname, split, year): + DatasetCatalog.register(name, lambda: load_voc_instances(dirname, split)) + MetadataCatalog.get(name).set( + thing_classes=CLASS_NAMES, dirname=dirname, year=year, split=split + ) diff --git a/detectron2/data/datasets/register_coco.py b/detectron2/data/datasets/register_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..fe92216a2489dcddf8a8f96b010320a35f6b4a3b --- /dev/null +++ b/detectron2/data/datasets/register_coco.py @@ -0,0 +1,129 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog + +from .coco import load_coco_json, load_sem_seg + +""" +This file contains functions to register a COCO-format dataset to the DatasetCatalog. +""" + +__all__ = ["register_coco_instances", "register_coco_panoptic_separated"] + + +def register_coco_instances(name, metadata, json_file, image_root): + """ + Register a dataset in COCO's json annotation format for + instance detection, instance segmentation and keypoint detection. + (i.e., Type 1 and 2 in http://cocodataset.org/#format-data. + `instances*.json` and `person_keypoints*.json` in the dataset). + + This is an example of how to register a new dataset. + You can do something similar to this function, to register new datasets. + + Args: + name (str): the name that identifies a dataset, e.g. "coco_2014_train". + metadata (dict): extra metadata associated with this dataset. You can + leave it as an empty dict. + json_file (str): path to the json instance annotation file. + image_root (str or path-like): directory which contains all the images. + """ + assert isinstance(name, str), name + assert isinstance(json_file, (str, os.PathLike)), json_file + assert isinstance(image_root, (str, os.PathLike)), image_root + # 1. 
register a function which returns dicts + DatasetCatalog.register(name, lambda: load_coco_json(json_file, image_root, name)) + + # 2. Optionally, add metadata about this dataset, + # since they might be useful in evaluation, visualization or logging + MetadataCatalog.get(name).set( + json_file=json_file, image_root=image_root, evaluator_type="coco", **metadata + ) + + +def register_coco_panoptic_separated( + name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json +): + """ + Register a COCO panoptic segmentation dataset named `name`. + The annotations in this registered dataset will contain both instance annotations and + semantic annotations, each with its own contiguous ids. Hence it's called "separated". + + It follows the setting used by the PanopticFPN paper: + + 1. The instance annotations directly come from polygons in the COCO + instances annotation task, rather than from the masks in the COCO panoptic annotations. + + The two format have small differences: + Polygons in the instance annotations may have overlaps. + The mask annotations are produced by labeling the overlapped polygons + with depth ordering. + + 2. The semantic annotations are converted from panoptic annotations, where + all "things" are assigned a semantic id of 0. + All semantic categories will therefore have ids in contiguous + range [1, #stuff_categories]. + + This function will also register a pure semantic segmentation dataset + named ``name + '_stuffonly'``. + + Args: + name (str): the name that identifies a dataset, + e.g. "coco_2017_train_panoptic" + metadata (dict): extra metadata associated with this dataset. + image_root (str): directory which contains all the images + panoptic_root (str): directory which contains panoptic annotation images + panoptic_json (str): path to the json panoptic annotation file + sem_seg_root (str): directory which contains all the ground truth segmentation annotations. + instances_json (str): path to the json instance annotation file + """ + panoptic_name = name + "_separated" + DatasetCatalog.register( + panoptic_name, + lambda: merge_to_panoptic( + load_coco_json(instances_json, image_root, panoptic_name), + load_sem_seg(sem_seg_root, image_root), + ), + ) + MetadataCatalog.get(panoptic_name).set( + panoptic_root=panoptic_root, + image_root=image_root, + panoptic_json=panoptic_json, + sem_seg_root=sem_seg_root, + json_file=instances_json, # TODO rename + evaluator_type="coco_panoptic_seg", + **metadata + ) + + semantic_name = name + "_stuffonly" + DatasetCatalog.register(semantic_name, lambda: load_sem_seg(sem_seg_root, image_root)) + MetadataCatalog.get(semantic_name).set( + sem_seg_root=sem_seg_root, image_root=image_root, evaluator_type="sem_seg", **metadata + ) + + +def merge_to_panoptic(detection_dicts, sem_seg_dicts): + """ + Create dataset dicts for panoptic segmentation, by + merging two dicts using "file_name" field to match their entries. + + Args: + detection_dicts (list[dict]): lists of dicts for object detection or instance segmentation. + sem_seg_dicts (list[dict]): lists of dicts for semantic segmentation. + + Returns: + list[dict] (one per input image): Each dict contains all (key, value) pairs from dicts in + both detection_dicts and sem_seg_dicts that correspond to the same image. + The function assumes that the same key in different dicts has the same value. 
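+
+    A minimal sketch of how the two inputs are typically produced and merged;
+    the file paths and dataset name are illustrative placeholders::
+
+        det_dicts = load_coco_json("instances.json", "images/", "my_dataset")
+        sem_dicts = load_sem_seg("sem_seg_gt/", "images/")
+        panoptic_dicts = merge_to_panoptic(det_dicts, sem_dicts)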
+ """ + results = [] + sem_seg_file_to_entry = {x["file_name"]: x for x in sem_seg_dicts} + assert len(sem_seg_file_to_entry) > 0 + + for det_dict in detection_dicts: + dic = copy.copy(det_dict) + dic.update(sem_seg_file_to_entry[dic["file_name"]]) + results.append(dic) + return results diff --git a/detectron2/data/detection_utils.py b/detectron2/data/detection_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3384e6c5e922ad42601d91fc3739b7289605a553 --- /dev/null +++ b/detectron2/data/detection_utils.py @@ -0,0 +1,516 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +""" +Common data processing utilities that are used in a +typical object detection data pipeline. +""" +import logging +import numpy as np +import pycocotools.mask as mask_util +import torch +from fvcore.common.file_io import PathManager +from PIL import Image, ImageOps + +from detectron2.structures import ( + BitMasks, + Boxes, + BoxMode, + Instances, + Keypoints, + PolygonMasks, + RotatedBoxes, + polygons_to_bitmask, +) + +from . import transforms as T +from .catalog import MetadataCatalog + + +class SizeMismatchError(ValueError): + """ + When loaded image has difference width/height compared with annotation. + """ + + +# https://en.wikipedia.org/wiki/YUV#SDTV_with_BT.601 +_M_RGB2YUV = [[0.299, 0.587, 0.114], [-0.14713, -0.28886, 0.436], [0.615, -0.51499, -0.10001]] +_M_YUV2RGB = [[1.0, 0.0, 1.13983], [1.0, -0.39465, -0.58060], [1.0, 2.03211, 0.0]] + + +def convert_PIL_to_numpy(image, format): + """ + Convert PIL image to numpy array of target format. + + Args: + image (PIL.Image): a PIL image + format (str): the format of output image + + Returns: + (np.ndarray): also see `read_image` + """ + if format is not None: + # PIL only supports RGB, so convert to RGB and flip channels over below + conversion_format = format + if format in ["BGR", "YUV-BT.601"]: + conversion_format = "RGB" + image = image.convert(conversion_format) + image = np.asarray(image) + # PIL squeezes out the channel dimension for "L", so make it HWC + if format == "L": + image = np.expand_dims(image, -1) + + # handle formats not supported by PIL + elif format == "BGR": + # flip channels if needed + image = image[:, :, ::-1] + elif format == "YUV-BT.601": + image = image / 255.0 + image = np.dot(image, np.array(_M_RGB2YUV).T) + + return image + + +def convert_image_to_rgb(image, format): + """ + Convert numpy image from given format to RGB. + + Args: + image (np.ndarray): a numpy image + format (str): the format of input image, also see `read_image` + + Returns: + (np.ndarray): HWC RGB image in 0-255 range, can be either float or uint8 + """ + if format == "BGR": + image = image[:, :, [2, 1, 0]] + elif format == "YUV-BT.601": + image = np.dot(image, np.array(_M_YUV2RGB).T) + image = image * 255.0 + else: + if format == "L": + image = image[:, :, 0] + image = image.astype(np.uint8) + image = np.asarray(Image.fromarray(image, mode=format).convert("RGB")) + return image + + +def read_image(file_name, format=None): + """ + Read an image into the given format. + Will apply rotation and flipping if the image has such exif information. + + Args: + file_name (str): image file path + format (str): one of the supported image modes in PIL, or "BGR" or "YUV-BT.601" + + Returns: + image (np.ndarray): an HWC image in the given format, which is 0-255, uint8 for + supported image modes in PIL or "BGR"; float (0-1 for Y) for YUV-BT.601. 
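+
+    A minimal sketch; the path is an illustrative placeholder::
+
+        img = read_image("datasets/coco/val2017/000000000139.jpg", format="BGR")
+        # img is an HWC uint8 array with channels in BGR order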
+ """ + with PathManager.open(file_name, "rb") as f: + image = Image.open(f) + + # capture and ignore this bug: https://github.com/python-pillow/Pillow/issues/3973 + try: + image = ImageOps.exif_transpose(image) + except Exception: + pass + + return convert_PIL_to_numpy(image, format) + + +def check_image_size(dataset_dict, image): + """ + Raise an error if the image does not match the size specified in the dict. + """ + if "width" in dataset_dict or "height" in dataset_dict: + image_wh = (image.shape[1], image.shape[0]) + expected_wh = (dataset_dict["width"], dataset_dict["height"]) + if not image_wh == expected_wh: + raise SizeMismatchError( + "Mismatched (W,H){}, got {}, expect {}".format( + " for image " + dataset_dict["file_name"] + if "file_name" in dataset_dict + else "", + image_wh, + expected_wh, + ) + ) + + # To ensure bbox always remap to original image size + if "width" not in dataset_dict: + dataset_dict["width"] = image.shape[1] + if "height" not in dataset_dict: + dataset_dict["height"] = image.shape[0] + + +def transform_proposals(dataset_dict, image_shape, transforms, min_box_side_len, proposal_topk): + """ + Apply transformations to the proposals in dataset_dict, if any. + + Args: + dataset_dict (dict): a dict read from the dataset, possibly + contains fields "proposal_boxes", "proposal_objectness_logits", "proposal_bbox_mode" + image_shape (tuple): height, width + transforms (TransformList): + min_box_side_len (int): keep proposals with at least this size + proposal_topk (int): only keep top-K scoring proposals + + The input dict is modified in-place, with abovementioned keys removed. A new + key "proposals" will be added. Its value is an `Instances` + object which contains the transformed proposals in its field + "proposal_boxes" and "objectness_logits". + """ + if "proposal_boxes" in dataset_dict: + # Transform proposal boxes + boxes = transforms.apply_box( + BoxMode.convert( + dataset_dict.pop("proposal_boxes"), + dataset_dict.pop("proposal_bbox_mode"), + BoxMode.XYXY_ABS, + ) + ) + boxes = Boxes(boxes) + objectness_logits = torch.as_tensor( + dataset_dict.pop("proposal_objectness_logits").astype("float32") + ) + + boxes.clip(image_shape) + keep = boxes.nonempty(threshold=min_box_side_len) + boxes = boxes[keep] + objectness_logits = objectness_logits[keep] + + proposals = Instances(image_shape) + proposals.proposal_boxes = boxes[:proposal_topk] + proposals.objectness_logits = objectness_logits[:proposal_topk] + dataset_dict["proposals"] = proposals + + +def transform_instance_annotations( + annotation, transforms, image_size, *, keypoint_hflip_indices=None +): + """ + Apply transforms to box, segmentation and keypoints annotations of a single instance. + + It will use `transforms.apply_box` for the box, and + `transforms.apply_coords` for segmentation polygons & keypoints. + If you need anything more specially designed for each data structure, + you'll need to implement your own version of this function or the transforms. + + Args: + annotation (dict): dict of instance annotations for a single instance. + It will be modified in-place. + transforms (TransformList): + image_size (tuple): the height, width of the transformed image + keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. + + Returns: + dict: + the same input dict with fields "bbox", "segmentation", "keypoints" + transformed according to `transforms`. + The "bbox_mode" field will be set to XYXY_ABS. 
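+
+    A minimal sketch, assuming ``anno`` is a single box/segmentation annotation
+    (no keypoints), ``image`` is the corresponding HWC array, and
+    ``T.apply_transform_gens`` is available from this package::
+
+        image, tfms = T.apply_transform_gens([T.RandomFlip()], image)
+        anno = transform_instance_annotations(anno, tfms, image.shape[:2])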
+ """ + bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) + # Note that bbox is 1d (per-instance bounding box) + annotation["bbox"] = transforms.apply_box([bbox])[0] + annotation["bbox_mode"] = BoxMode.XYXY_ABS + + if "segmentation" in annotation: + # each instance contains 1 or more polygons + segm = annotation["segmentation"] + if isinstance(segm, list): + # polygons + polygons = [np.asarray(p).reshape(-1, 2) for p in segm] + annotation["segmentation"] = [ + p.reshape(-1) for p in transforms.apply_polygons(polygons) + ] + elif isinstance(segm, dict): + # RLE + mask = mask_util.decode(segm) + mask = transforms.apply_segmentation(mask) + assert tuple(mask.shape[:2]) == image_size + annotation["segmentation"] = mask + else: + raise ValueError( + "Cannot transform segmentation of type '{}'!" + "Supported types are: polygons as list[list[float] or ndarray]," + " COCO-style RLE as a dict.".format(type(segm)) + ) + + if "keypoints" in annotation: + keypoints = transform_keypoint_annotations( + annotation["keypoints"], transforms, image_size, keypoint_hflip_indices + ) + annotation["keypoints"] = keypoints + + return annotation + + +def transform_keypoint_annotations(keypoints, transforms, image_size, keypoint_hflip_indices=None): + """ + Transform keypoint annotations of an image. + + Args: + keypoints (list[float]): Nx3 float in Detectron2 Dataset format. + transforms (TransformList): + image_size (tuple): the height, width of the transformed image + keypoint_hflip_indices (ndarray[int]): see `create_keypoint_hflip_indices`. + """ + # (N*3,) -> (N, 3) + keypoints = np.asarray(keypoints, dtype="float64").reshape(-1, 3) + keypoints[:, :2] = transforms.apply_coords(keypoints[:, :2]) + + # This assumes that HorizFlipTransform is the only one that does flip + do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1 + + # Alternative way: check if probe points was horizontally flipped. + # probe = np.asarray([[0.0, 0.0], [image_width, 0.0]]) + # probe_aug = transforms.apply_coords(probe.copy()) + # do_hflip = np.sign(probe[1][0] - probe[0][0]) != np.sign(probe_aug[1][0] - probe_aug[0][0]) # noqa + + # If flipped, swap each keypoint with its opposite-handed equivalent + if do_hflip: + assert keypoint_hflip_indices is not None + keypoints = keypoints[keypoint_hflip_indices, :] + + # Maintain COCO convention that if visibility == 0, then x, y = 0 + # TODO may need to reset visibility for cropped keypoints, + # but it does not matter for our existing algorithms + keypoints[keypoints[:, 2] == 0] = 0 + return keypoints + + +def annotations_to_instances(annos, image_size, mask_format="polygon"): + """ + Create an :class:`Instances` object used by the models, + from instance annotations in the dataset dict. + + Args: + annos (list[dict]): a list of instance annotations in one image, each + element for one instance. + image_size (tuple): height, width + + Returns: + Instances: + It will contain fields "gt_boxes", "gt_classes", + "gt_masks", "gt_keypoints", if they can be obtained from `annos`. + This is the format that builtin models expect. 
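+
+    A minimal sketch, assuming ``annos`` holds the already-transformed
+    annotation dicts for one image of height ``h`` and width ``w``::
+
+        instances = annotations_to_instances(annos, (h, w))
+        instances = filter_empty_instances(instances)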
+ """ + boxes = [BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos] + target = Instances(image_size) + boxes = target.gt_boxes = Boxes(boxes) + boxes.clip(image_size) + + classes = [obj["category_id"] for obj in annos] + classes = torch.tensor(classes, dtype=torch.int64) + target.gt_classes = classes + + if len(annos) and "segmentation" in annos[0]: + segms = [obj["segmentation"] for obj in annos] + if mask_format == "polygon": + masks = PolygonMasks(segms) + else: + assert mask_format == "bitmask", mask_format + masks = [] + for segm in segms: + if isinstance(segm, list): + # polygon + masks.append(polygons_to_bitmask(segm, *image_size)) + elif isinstance(segm, dict): + # COCO RLE + masks.append(mask_util.decode(segm)) + elif isinstance(segm, np.ndarray): + assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format( + segm.ndim + ) + # mask array + masks.append(segm) + else: + raise ValueError( + "Cannot convert segmentation of type '{}' to BitMasks!" + "Supported types are: polygons as list[list[float] or ndarray]," + " COCO-style RLE as a dict, or a full-image segmentation mask " + "as a 2D ndarray.".format(type(segm)) + ) + # torch.from_numpy does not support array with negative stride. + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x)) for x in masks]) + ) + target.gt_masks = masks + + if len(annos) and "keypoints" in annos[0]: + kpts = [obj.get("keypoints", []) for obj in annos] + target.gt_keypoints = Keypoints(kpts) + + return target + + +def annotations_to_instances_rotated(annos, image_size): + """ + Create an :class:`Instances` object used by the models, + from instance annotations in the dataset dict. + Compared to `annotations_to_instances`, this function is for rotated boxes only + + Args: + annos (list[dict]): a list of instance annotations in one image, each + element for one instance. + image_size (tuple): height, width + + Returns: + Instances: + Containing fields "gt_boxes", "gt_classes", + if they can be obtained from `annos`. + This is the format that builtin models expect. + """ + boxes = [obj["bbox"] for obj in annos] + target = Instances(image_size) + boxes = target.gt_boxes = RotatedBoxes(boxes) + boxes.clip(image_size) + + classes = [obj["category_id"] for obj in annos] + classes = torch.tensor(classes, dtype=torch.int64) + target.gt_classes = classes + + return target + + +def filter_empty_instances(instances, by_box=True, by_mask=True, box_threshold=1e-5): + """ + Filter out empty instances in an `Instances` object. + + Args: + instances (Instances): + by_box (bool): whether to filter out instances with empty boxes + by_mask (bool): whether to filter out instances with empty masks + box_threshold (float): minimum width and height to be considered non-empty + + Returns: + Instances: the filtered instances. + """ + assert by_box or by_mask + r = [] + if by_box: + r.append(instances.gt_boxes.nonempty(threshold=box_threshold)) + if instances.has("gt_masks") and by_mask: + r.append(instances.gt_masks.nonempty()) + + # TODO: can also filter visible keypoints + + if not r: + return instances + m = r[0] + for x in r[1:]: + m = m & x + return instances[m] + + +def create_keypoint_hflip_indices(dataset_names): + """ + Args: + dataset_names (list[str]): list of dataset names + Returns: + ndarray[int]: a vector of size=#keypoints, storing the + horizontally-flipped keypoint indices. 
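+
+    A minimal sketch; the dataset name is a placeholder and must have
+    "keypoint_names" and "keypoint_flip_map" registered in its metadata::
+
+        hflip_indices = create_keypoint_hflip_indices(["keypoints_coco_2017_train"])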
+ """ + + check_metadata_consistency("keypoint_names", dataset_names) + check_metadata_consistency("keypoint_flip_map", dataset_names) + + meta = MetadataCatalog.get(dataset_names[0]) + names = meta.keypoint_names + # TODO flip -> hflip + flip_map = dict(meta.keypoint_flip_map) + flip_map.update({v: k for k, v in flip_map.items()}) + flipped_names = [i if i not in flip_map else flip_map[i] for i in names] + flip_indices = [names.index(i) for i in flipped_names] + return np.asarray(flip_indices) + + +def gen_crop_transform_with_instance(crop_size, image_size, instance): + """ + Generate a CropTransform so that the cropping region contains + the center of the given instance. + + Args: + crop_size (tuple): h, w in pixels + image_size (tuple): h, w + instance (dict): an annotation dict of one instance, in Detectron2's + dataset format. + """ + crop_size = np.asarray(crop_size, dtype=np.int32) + bbox = BoxMode.convert(instance["bbox"], instance["bbox_mode"], BoxMode.XYXY_ABS) + center_yx = (bbox[1] + bbox[3]) * 0.5, (bbox[0] + bbox[2]) * 0.5 + assert ( + image_size[0] >= center_yx[0] and image_size[1] >= center_yx[1] + ), "The annotation bounding box is outside of the image!" + assert ( + image_size[0] >= crop_size[0] and image_size[1] >= crop_size[1] + ), "Crop size is larger than image size!" + + min_yx = np.maximum(np.floor(center_yx).astype(np.int32) - crop_size, 0) + max_yx = np.maximum(np.asarray(image_size, dtype=np.int32) - crop_size, 0) + max_yx = np.minimum(max_yx, np.ceil(center_yx).astype(np.int32)) + + y0 = np.random.randint(min_yx[0], max_yx[0] + 1) + x0 = np.random.randint(min_yx[1], max_yx[1] + 1) + return T.CropTransform(x0, y0, crop_size[1], crop_size[0]) + + +def check_metadata_consistency(key, dataset_names): + """ + Check that the datasets have consistent metadata. + + Args: + key (str): a metadata key + dataset_names (list[str]): a list of dataset names + + Raises: + AttributeError: if the key does not exist in the metadata + ValueError: if the given datasets do not have the same metadata values defined by key + """ + if len(dataset_names) == 0: + return + logger = logging.getLogger(__name__) + entries_per_dataset = [getattr(MetadataCatalog.get(d), key) for d in dataset_names] + for idx, entry in enumerate(entries_per_dataset): + if entry != entries_per_dataset[0]: + logger.error( + "Metadata '{}' for dataset '{}' is '{}'".format(key, dataset_names[idx], str(entry)) + ) + logger.error( + "Metadata '{}' for dataset '{}' is '{}'".format( + key, dataset_names[0], str(entries_per_dataset[0]) + ) + ) + raise ValueError("Datasets have different metadata '{}'!".format(key)) + + +def build_transform_gen(cfg, is_train): + """ + Create a list of :class:`TransformGen` from config. + Now it includes resizing and flipping. 
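+
+    A minimal sketch, assuming ``cfg`` is a detectron2 config object
+    (e.g. obtained from ``detectron2.config.get_cfg()``)::
+
+        tfm_gens = build_transform_gen(cfg, is_train=True)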
+ + Returns: + list[TransformGen] + """ + if is_train: + min_size = cfg.INPUT.MIN_SIZE_TRAIN + max_size = cfg.INPUT.MAX_SIZE_TRAIN + sample_style = cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + sample_style = "choice" + if sample_style == "range": + assert len(min_size) == 2, "more than 2 ({}) min_size(s) are provided for ranges".format( + len(min_size) + ) + + logger = logging.getLogger(__name__) + tfm_gens = [] + tfm_gens.append(T.ResizeShortestEdge(min_size, max_size, sample_style)) + if is_train: + tfm_gens.append(T.RandomFlip()) + logger.info("TransformGens used in training: " + str(tfm_gens)) + return tfm_gens diff --git a/detectron2/data/samplers/__init__.py b/detectron2/data/samplers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9cfa8a65259a850b8259016d482a0eac1bbafb38 --- /dev/null +++ b/detectron2/data/samplers/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .distributed_sampler import InferenceSampler, RepeatFactorTrainingSampler, TrainingSampler +from .grouped_batch_sampler import GroupedBatchSampler + +__all__ = [ + "GroupedBatchSampler", + "TrainingSampler", + "InferenceSampler", + "RepeatFactorTrainingSampler", +] diff --git a/detectron2/data/samplers/distributed_sampler.py b/detectron2/data/samplers/distributed_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..b98361b2d3a019c25cbd214db5c5930b1ca640e3 --- /dev/null +++ b/detectron2/data/samplers/distributed_sampler.py @@ -0,0 +1,199 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import itertools +import math +from collections import defaultdict +from typing import Optional +import torch +from torch.utils.data.sampler import Sampler + +from detectron2.utils import comm + + +class TrainingSampler(Sampler): + """ + In training, we only care about the "infinite stream" of training data. + So this sampler produces an infinite stream of indices and + all workers cooperate to correctly shuffle the indices and sample different indices. + + The samplers in each worker effectively produces `indices[worker_id::num_workers]` + where `indices` is an infinite stream of indices consisting of + `shuffle(range(size)) + shuffle(range(size)) + ...` (if shuffle is True) + or `range(size) + range(size) + ...` (if shuffle is False) + """ + + def __init__(self, size: int, shuffle: bool = True, seed: Optional[int] = None): + """ + Args: + size (int): the total number of data of the underlying dataset to sample from + shuffle (bool): whether to shuffle the indices or not + seed (int): the initial seed of the shuffle. Must be the same + across all workers. If None, will use a random seed shared + among workers (require synchronization among all workers). 
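+
+        A usage sketch (how the infinite index stream is typically consumed;
+        the dataset and batch size are placeholders):
+
+        .. code-block:: python
+
+            sampler = TrainingSampler(len(dataset), shuffle=True)
+            batch_sampler = torch.utils.data.sampler.BatchSampler(
+                sampler, batch_size=2, drop_last=True
+            )  # feed batch_sampler to a DataLoader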
+ """ + self._size = size + assert size > 0 + self._shuffle = shuffle + if seed is None: + seed = comm.shared_random_seed() + self._seed = int(seed) + + self._rank = comm.get_rank() + self._world_size = comm.get_world_size() + + def __iter__(self): + start = self._rank + yield from itertools.islice(self._infinite_indices(), start, None, self._world_size) + + def _infinite_indices(self): + g = torch.Generator() + g.manual_seed(self._seed) + while True: + if self._shuffle: + yield from torch.randperm(self._size, generator=g) + else: + yield from torch.arange(self._size) + + +class RepeatFactorTrainingSampler(Sampler): + """ + Similar to TrainingSampler, but suitable for training on class imbalanced datasets + like LVIS. In each epoch, an image may appear multiple times based on its "repeat + factor". The repeat factor for an image is a function of the frequency the rarest + category labeled in that image. The "frequency of category c" in [0, 1] is defined + as the fraction of images in the training set (without repeats) in which category c + appears. + + See :paper:`lvis` (>= v2) Appendix B.2. + """ + + def __init__(self, dataset_dicts, repeat_thresh, shuffle=True, seed=None): + """ + Args: + dataset_dicts (list[dict]): annotations in Detectron2 dataset format. + repeat_thresh (float): frequency threshold below which data is repeated. + shuffle (bool): whether to shuffle the indices or not + seed (int): the initial seed of the shuffle. Must be the same + across all workers. If None, will use a random seed shared + among workers (require synchronization among all workers). + """ + self._shuffle = shuffle + if seed is None: + seed = comm.shared_random_seed() + self._seed = int(seed) + + self._rank = comm.get_rank() + self._world_size = comm.get_world_size() + + # Get fractional repeat factors and split into whole number (_int_part) + # and fractional (_frac_part) parts. + rep_factors = self._get_repeat_factors(dataset_dicts, repeat_thresh) + self._int_part = torch.trunc(rep_factors) + self._frac_part = rep_factors - self._int_part + + def _get_repeat_factors(self, dataset_dicts, repeat_thresh): + """ + Compute (fractional) per-image repeat factors. + + Args: + See __init__. + + Returns: + torch.Tensor: the i-th element is the repeat factor for the dataset image + at index i. + """ + # 1. For each category c, compute the fraction of images that contain it: f(c) + category_freq = defaultdict(int) + for dataset_dict in dataset_dicts: # For each image (without repeats) + cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} + for cat_id in cat_ids: + category_freq[cat_id] += 1 + num_images = len(dataset_dicts) + for k, v in category_freq.items(): + category_freq[k] = v / num_images + + # 2. For each category c, compute the category-level repeat factor: + # r(c) = max(1, sqrt(t / f(c))) + category_rep = { + cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) + for cat_id, cat_freq in category_freq.items() + } + + # 3. For each image I, compute the image-level repeat factor: + # r(I) = max_{c in I} r(c) + rep_factors = [] + for dataset_dict in dataset_dicts: + cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} + rep_factor = max({category_rep[cat_id] for cat_id in cat_ids}) + rep_factors.append(rep_factor) + + return torch.tensor(rep_factors, dtype=torch.float32) + + def _get_epoch_indices(self, generator): + """ + Create a list of dataset indices (with repeats) to use for one epoch. 
+ + Args: + generator (torch.Generator): pseudo random number generator used for + stochastic rounding. + + Returns: + torch.Tensor: list of dataset indices to use in one epoch. Each index + is repeated based on its calculated repeat factor. + """ + # Since repeat factors are fractional, we use stochastic rounding so + # that the target repeat factor is achieved in expectation over the + # course of training + rands = torch.rand(len(self._frac_part), generator=generator) + rep_factors = self._int_part + (rands < self._frac_part).float() + # Construct a list of indices in which we repeat images as specified + indices = [] + for dataset_index, rep_factor in enumerate(rep_factors): + indices.extend([dataset_index] * int(rep_factor.item())) + return torch.tensor(indices, dtype=torch.int64) + + def __iter__(self): + start = self._rank + yield from itertools.islice(self._infinite_indices(), start, None, self._world_size) + + def _infinite_indices(self): + g = torch.Generator() + g.manual_seed(self._seed) + while True: + # Sample indices with repeats determined by stochastic rounding; each + # "epoch" may have a slightly different size due to the rounding. + indices = self._get_epoch_indices(g) + if self._shuffle: + randperm = torch.randperm(len(indices), generator=g) + yield from indices[randperm] + else: + yield from indices + + +class InferenceSampler(Sampler): + """ + Produce indices for inference. + Inference needs to run on the __exact__ set of samples, + therefore when the total number of samples is not divisible by the number of workers, + this sampler produces different number of samples on different workers. + """ + + def __init__(self, size: int): + """ + Args: + size (int): the total number of data of the underlying dataset to sample from + """ + self._size = size + assert size > 0 + self._rank = comm.get_rank() + self._world_size = comm.get_world_size() + + shard_size = (self._size - 1) // self._world_size + 1 + begin = shard_size * self._rank + end = min(shard_size * (self._rank + 1), self._size) + self._local_indices = range(begin, end) + + def __iter__(self): + yield from self._local_indices + + def __len__(self): + return len(self._local_indices) diff --git a/detectron2/data/samplers/grouped_batch_sampler.py b/detectron2/data/samplers/grouped_batch_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..138e106136083383d9f8729f1da930804463b297 --- /dev/null +++ b/detectron2/data/samplers/grouped_batch_sampler.py @@ -0,0 +1,47 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +from torch.utils.data.sampler import BatchSampler, Sampler + + +class GroupedBatchSampler(BatchSampler): + """ + Wraps another sampler to yield a mini-batch of indices. + It enforces that the batch only contain elements from the same group. + It also tries to provide mini-batches which follows an ordering which is + as close as possible to the ordering from the original sampler. + """ + + def __init__(self, sampler, group_ids, batch_size): + """ + Args: + sampler (Sampler): Base sampler. + group_ids (list[int]): If the sampler produces indices in range [0, N), + `group_ids` must be a list of `N` ints which contains the group id of each sample. + The group ids must be a set of integers in the range [0, num_groups). + batch_size (int): Size of mini-batch. 
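+
+        Example (a sketch; the group ids are made up and encode a common use
+        case, grouping images by aspect ratio):
+
+        .. code-block:: python
+
+            sampler = TrainingSampler(4)
+            group_ids = [0, 1, 0, 1]  # e.g. 0: wide images, 1: tall images
+            batch_sampler = GroupedBatchSampler(sampler, group_ids, batch_size=2)
+            # every yielded batch contains indices from one group only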
+ """ + if not isinstance(sampler, Sampler): + raise ValueError( + "sampler should be an instance of " + "torch.utils.data.Sampler, but got sampler={}".format(sampler) + ) + self.sampler = sampler + self.group_ids = np.asarray(group_ids) + assert self.group_ids.ndim == 1 + self.batch_size = batch_size + groups = np.unique(self.group_ids).tolist() + + # buffer the indices of each group until batch size is reached + self.buffer_per_group = {k: [] for k in groups} + + def __iter__(self): + for idx in self.sampler: + group_id = self.group_ids[idx] + group_buffer = self.buffer_per_group[group_id] + group_buffer.append(idx) + if len(group_buffer) == self.batch_size: + yield group_buffer[:] # yield a copy of the list + del group_buffer[:] + + def __len__(self): + raise NotImplementedError("len() of GroupedBatchSampler is not well-defined.") diff --git a/detectron2/data/transforms/__init__.py b/detectron2/data/transforms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f7638bb58009ff3e00eb1373f2faa5dc2f30100d --- /dev/null +++ b/detectron2/data/transforms/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .transform import * +from fvcore.transforms.transform import * +from .transform_gen import * + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/detectron2/data/transforms/transform.py b/detectron2/data/transforms/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..9723034e446ac2f57ffcf9209f431c7aed2c4505 --- /dev/null +++ b/detectron2/data/transforms/transform.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# File: transform.py + +import numpy as np +import torch +import torch.nn.functional as F +from fvcore.transforms.transform import HFlipTransform, NoOpTransform, Transform +from PIL import Image + +try: + import cv2 # noqa +except ImportError: + # OpenCV is an optional dependency at the moment + pass + +__all__ = ["ExtentTransform", "ResizeTransform", "RotationTransform"] + + +class ExtentTransform(Transform): + """ + Extracts a subregion from the source image and scales it to the output size. + + The fill color is used to map pixels from the source rect that fall outside + the source image. + + See: https://pillow.readthedocs.io/en/latest/PIL.html#PIL.ImageTransform.ExtentTransform + """ + + def __init__(self, src_rect, output_size, interp=Image.LINEAR, fill=0): + """ + Args: + src_rect (x0, y0, x1, y1): src coordinates + output_size (h, w): dst image size + interp: PIL interpolation methods + fill: Fill color used when src_rect extends outside image + """ + super().__init__() + self._set_attributes(locals()) + + def apply_image(self, img, interp=None): + h, w = self.output_size + ret = Image.fromarray(img).transform( + size=(w, h), + method=Image.EXTENT, + data=self.src_rect, + resample=interp if interp else self.interp, + fill=self.fill, + ) + return np.asarray(ret) + + def apply_coords(self, coords): + # Transform image center from source coordinates into output coordinates + # and then map the new origin to the corner of the output image. 
+ h, w = self.output_size + x0, y0, x1, y1 = self.src_rect + new_coords = coords.astype(np.float32) + new_coords[:, 0] -= 0.5 * (x0 + x1) + new_coords[:, 1] -= 0.5 * (y0 + y1) + new_coords[:, 0] *= w / (x1 - x0) + new_coords[:, 1] *= h / (y1 - y0) + new_coords[:, 0] += 0.5 * w + new_coords[:, 1] += 0.5 * h + return new_coords + + def apply_segmentation(self, segmentation): + segmentation = self.apply_image(segmentation, interp=Image.NEAREST) + return segmentation + + +class ResizeTransform(Transform): + """ + Resize the image to a target size. + """ + + def __init__(self, h, w, new_h, new_w, interp=None): + """ + Args: + h, w (int): original image size + new_h, new_w (int): new image size + interp: PIL interpolation methods, defaults to bilinear. + """ + # TODO decide on PIL vs opencv + super().__init__() + if interp is None: + interp = Image.BILINEAR + self._set_attributes(locals()) + + def apply_image(self, img, interp=None): + assert img.shape[:2] == (self.h, self.w) + assert len(img.shape) <= 4 + + if img.dtype == np.uint8: + pil_image = Image.fromarray(img) + interp_method = interp if interp is not None else self.interp + pil_image = pil_image.resize((self.new_w, self.new_h), interp_method) + ret = np.asarray(pil_image) + else: + # PIL only supports uint8 + img = torch.from_numpy(img) + shape = list(img.shape) + shape_4d = shape[:2] + [1] * (4 - len(shape)) + shape[2:] + img = img.view(shape_4d).permute(2, 3, 0, 1) # hw(c) -> nchw + _PIL_RESIZE_TO_INTERPOLATE_MODE = {Image.BILINEAR: "bilinear", Image.BICUBIC: "bicubic"} + mode = _PIL_RESIZE_TO_INTERPOLATE_MODE[self.interp] + img = F.interpolate(img, (self.new_h, self.new_w), mode=mode, align_corners=False) + shape[:2] = (self.new_h, self.new_w) + ret = img.permute(2, 3, 0, 1).view(shape).numpy() # nchw -> hw(c) + + return ret + + def apply_coords(self, coords): + coords[:, 0] = coords[:, 0] * (self.new_w * 1.0 / self.w) + coords[:, 1] = coords[:, 1] * (self.new_h * 1.0 / self.h) + return coords + + def apply_segmentation(self, segmentation): + segmentation = self.apply_image(segmentation, interp=Image.NEAREST) + return segmentation + + def inverse(self): + return ResizeTransform(self.new_h, self.new_w, self.h, self.w, self.interp) + + +class RotationTransform(Transform): + """ + This method returns a copy of this image, rotated the given + number of degrees counter clockwise around its center. 
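+
+    Example (a sketch; the input image is synthetic, and OpenCV must be
+    installed since the warping is done with cv2):
+
+    .. code-block:: python
+
+        import numpy as np
+
+        img = np.zeros((480, 640, 3), dtype=np.uint8)
+        rot = RotationTransform(480, 640, angle=30, expand=True)
+        rotated_img = rot.apply_image(img)  # larger than 480x640 since expand=True
+        rotated_pts = rot.apply_coords(np.array([[320.0, 240.0]]))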
+ """ + + def __init__(self, h, w, angle, expand=True, center=None, interp=None): + """ + Args: + h, w (int): original image size + angle (float): degrees for rotation + expand (bool): choose if the image should be resized to fit the whole + rotated image (default), or simply cropped + center (tuple (width, height)): coordinates of the rotation center + if left to None, the center will be fit to the center of each image + center has no effect if expand=True because it only affects shifting + interp: cv2 interpolation method, default cv2.INTER_LINEAR + """ + super().__init__() + image_center = np.array((w / 2, h / 2)) + if center is None: + center = image_center + if interp is None: + interp = cv2.INTER_LINEAR + abs_cos, abs_sin = abs(np.cos(np.deg2rad(angle))), abs(np.sin(np.deg2rad(angle))) + if expand: + # find the new width and height bounds + bound_w, bound_h = np.rint( + [h * abs_sin + w * abs_cos, h * abs_cos + w * abs_sin] + ).astype(int) + else: + bound_w, bound_h = w, h + + self._set_attributes(locals()) + self.rm_coords = self.create_rotation_matrix() + # Needed because of this problem https://github.com/opencv/opencv/issues/11784 + self.rm_image = self.create_rotation_matrix(offset=-0.5) + + def apply_image(self, img, interp=None): + """ + img should be a numpy array, formatted as Height * Width * Nchannels + """ + if len(img) == 0 or self.angle % 360 == 0: + return img + assert img.shape[:2] == (self.h, self.w) + interp = interp if interp is not None else self.interp + return cv2.warpAffine(img, self.rm_image, (self.bound_w, self.bound_h), flags=interp) + + def apply_coords(self, coords): + """ + coords should be a N * 2 array-like, containing N couples of (x, y) points + """ + coords = np.asarray(coords, dtype=float) + if len(coords) == 0 or self.angle % 360 == 0: + return coords + return cv2.transform(coords[:, np.newaxis, :], self.rm_coords)[:, 0, :] + + def apply_segmentation(self, segmentation): + segmentation = self.apply_image(segmentation, interp=cv2.INTER_NEAREST) + return segmentation + + def create_rotation_matrix(self, offset=0): + center = (self.center[0] + offset, self.center[1] + offset) + rm = cv2.getRotationMatrix2D(tuple(center), self.angle, 1) + if self.expand: + # Find the coordinates of the center of rotation in the new image + # The only point for which we know the future coordinates is the center of the image + rot_im_center = cv2.transform(self.image_center[None, None, :] + offset, rm)[0, 0, :] + new_center = np.array([self.bound_w / 2, self.bound_h / 2]) + offset - rot_im_center + # shift the rotation center to the new coordinates + rm[:, 2] += new_center + return rm + + +def HFlip_rotated_box(transform, rotated_boxes): + """ + Apply the horizontal flip transform on rotated boxes. + + Args: + rotated_boxes (ndarray): Nx5 floating point array of + (x_center, y_center, width, height, angle_degrees) format + in absolute coordinates. + """ + # Transform x_center + rotated_boxes[:, 0] = transform.width - rotated_boxes[:, 0] + # Transform angle + rotated_boxes[:, 4] = -rotated_boxes[:, 4] + return rotated_boxes + + +def Resize_rotated_box(transform, rotated_boxes): + """ + Apply the resizing transform on rotated boxes. For details of how these (approximation) + formulas are derived, please refer to :meth:`RotatedBoxes.scale`. + + Args: + rotated_boxes (ndarray): Nx5 floating point array of + (x_center, y_center, width, height, angle_degrees) format + in absolute coordinates. 
+ """ + scale_factor_x = transform.new_w * 1.0 / transform.w + scale_factor_y = transform.new_h * 1.0 / transform.h + rotated_boxes[:, 0] *= scale_factor_x + rotated_boxes[:, 1] *= scale_factor_y + theta = rotated_boxes[:, 4] * np.pi / 180.0 + c = np.cos(theta) + s = np.sin(theta) + rotated_boxes[:, 2] *= np.sqrt(np.square(scale_factor_x * c) + np.square(scale_factor_y * s)) + rotated_boxes[:, 3] *= np.sqrt(np.square(scale_factor_x * s) + np.square(scale_factor_y * c)) + rotated_boxes[:, 4] = np.arctan2(scale_factor_x * s, scale_factor_y * c) * 180 / np.pi + + return rotated_boxes + + +HFlipTransform.register_type("rotated_box", HFlip_rotated_box) +NoOpTransform.register_type("rotated_box", lambda t, x: x) +ResizeTransform.register_type("rotated_box", Resize_rotated_box) diff --git a/detectron2/data/transforms/transform_gen.py b/detectron2/data/transforms/transform_gen.py new file mode 100644 index 0000000000000000000000000000000000000000..197a0ebf6750a7ea459aa7e14413b4a41adcd42e --- /dev/null +++ b/detectron2/data/transforms/transform_gen.py @@ -0,0 +1,534 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# File: transformer.py + +import inspect +import numpy as np +import pprint +import sys +from abc import ABCMeta, abstractmethod +from fvcore.transforms.transform import ( + BlendTransform, + CropTransform, + HFlipTransform, + NoOpTransform, + Transform, + TransformList, + VFlipTransform, +) +from PIL import Image + +from .transform import ExtentTransform, ResizeTransform, RotationTransform + +__all__ = [ + "RandomApply", + "RandomBrightness", + "RandomContrast", + "RandomCrop", + "RandomExtent", + "RandomFlip", + "RandomSaturation", + "RandomLighting", + "RandomRotation", + "Resize", + "ResizeShortestEdge", + "TransformGen", + "apply_transform_gens", +] + + +def check_dtype(img): + assert isinstance(img, np.ndarray), "[TransformGen] Needs an numpy array, but got a {}!".format( + type(img) + ) + assert not isinstance(img.dtype, np.integer) or ( + img.dtype == np.uint8 + ), "[TransformGen] Got image of type {}, use uint8 or floating points instead!".format( + img.dtype + ) + assert img.ndim in [2, 3], img.ndim + + +class TransformGen(metaclass=ABCMeta): + """ + TransformGen takes an image of type uint8 in range [0, 255], or + floating point in range [0, 1] or [0, 255] as input. + + It creates a :class:`Transform` based on the given image, sometimes with randomness. + The transform can then be used to transform images + or other data (boxes, points, annotations, etc.) associated with it. + + The assumption made in this class + is that the image itself is sufficient to instantiate a transform. + When this assumption is not true, you need to create the transforms by your own. + + A list of `TransformGen` can be applied with :func:`apply_transform_gens`. + """ + + def _init(self, params=None): + if params: + for k, v in params.items(): + if k != "self" and not k.startswith("_"): + setattr(self, k, v) + + @abstractmethod + def get_transform(self, img): + pass + + def _rand_range(self, low=1.0, high=None, size=None): + """ + Uniform float random number between low and high. 
+ """ + if high is None: + low, high = 0, low + if size is None: + size = [] + return np.random.uniform(low, high, size) + + def __repr__(self): + """ + Produce something like: + "MyTransformGen(field1={self.field1}, field2={self.field2})" + """ + try: + sig = inspect.signature(self.__init__) + classname = type(self).__name__ + argstr = [] + for name, param in sig.parameters.items(): + assert ( + param.kind != param.VAR_POSITIONAL and param.kind != param.VAR_KEYWORD + ), "The default __repr__ doesn't support *args or **kwargs" + assert hasattr(self, name), ( + "Attribute {} not found! " + "Default __repr__ only works if attributes match the constructor.".format(name) + ) + attr = getattr(self, name) + default = param.default + if default is attr: + continue + argstr.append("{}={}".format(name, pprint.pformat(attr))) + return "{}({})".format(classname, ", ".join(argstr)) + except AssertionError: + return super().__repr__() + + __str__ = __repr__ + + +class RandomApply(TransformGen): + """ + Randomly apply the wrapper transformation with a given probability. + """ + + def __init__(self, transform, prob=0.5): + """ + Args: + transform (Transform, TransformGen): the transform to be wrapped + by the `RandomApply`. The `transform` can either be a + `Transform` or `TransformGen` instance. + prob (float): probability between 0.0 and 1.0 that + the wrapper transformation is applied + """ + super().__init__() + assert isinstance(transform, (Transform, TransformGen)), ( + f"The given transform must either be a Transform or TransformGen instance. " + f"Not {type(transform)}" + ) + assert 0.0 <= prob <= 1.0, f"Probablity must be between 0.0 and 1.0 (given: {prob})" + self.prob = prob + self.transform = transform + + def get_transform(self, img): + do = self._rand_range() < self.prob + if do: + if isinstance(self.transform, TransformGen): + return self.transform.get_transform(img) + else: + return self.transform + else: + return NoOpTransform() + + +class RandomFlip(TransformGen): + """ + Flip the image horizontally or vertically with the given probability. + """ + + def __init__(self, prob=0.5, *, horizontal=True, vertical=False): + """ + Args: + prob (float): probability of flip. + horizontal (boolean): whether to apply horizontal flipping + vertical (boolean): whether to apply vertical flipping + """ + super().__init__() + + if horizontal and vertical: + raise ValueError("Cannot do both horiz and vert. Please use two Flip instead.") + if not horizontal and not vertical: + raise ValueError("At least one of horiz or vert has to be True!") + self._init(locals()) + + def get_transform(self, img): + h, w = img.shape[:2] + do = self._rand_range() < self.prob + if do: + if self.horizontal: + return HFlipTransform(w) + elif self.vertical: + return VFlipTransform(h) + else: + return NoOpTransform() + + +class Resize(TransformGen): + """ Resize image to a target size""" + + def __init__(self, shape, interp=Image.BILINEAR): + """ + Args: + shape: (h, w) tuple or a int + interp: PIL interpolation method + """ + if isinstance(shape, int): + shape = (shape, shape) + shape = tuple(shape) + self._init(locals()) + + def get_transform(self, img): + return ResizeTransform( + img.shape[0], img.shape[1], self.shape[0], self.shape[1], self.interp + ) + + +class ResizeShortestEdge(TransformGen): + """ + Scale the shorter edge to the given size, with a limit of `max_size` on the longer edge. + If `max_size` is reached, then downscale so that the longer edge does not exceed max_size. 
+ """ + + def __init__( + self, short_edge_length, max_size=sys.maxsize, sample_style="range", interp=Image.BILINEAR + ): + """ + Args: + short_edge_length (list[int]): If ``sample_style=="range"``, + a [min, max] interval from which to sample the shortest edge length. + If ``sample_style=="choice"``, a list of shortest edge lengths to sample from. + max_size (int): maximum allowed longest edge length. + sample_style (str): either "range" or "choice". + """ + super().__init__() + assert sample_style in ["range", "choice"], sample_style + + self.is_range = sample_style == "range" + if isinstance(short_edge_length, int): + short_edge_length = (short_edge_length, short_edge_length) + self._init(locals()) + + def get_transform(self, img): + h, w = img.shape[:2] + + if self.is_range: + size = np.random.randint(self.short_edge_length[0], self.short_edge_length[1] + 1) + else: + size = np.random.choice(self.short_edge_length) + if size == 0: + return NoOpTransform() + + scale = size * 1.0 / min(h, w) + if h < w: + newh, neww = size, scale * w + else: + newh, neww = scale * h, size + if max(newh, neww) > self.max_size: + scale = self.max_size * 1.0 / max(newh, neww) + newh = newh * scale + neww = neww * scale + neww = int(neww + 0.5) + newh = int(newh + 0.5) + return ResizeTransform(h, w, newh, neww, self.interp) + + +class RandomRotation(TransformGen): + """ + This method returns a copy of this image, rotated the given + number of degrees counter clockwise around the given center. + """ + + def __init__(self, angle, expand=True, center=None, sample_style="range", interp=None): + """ + Args: + angle (list[float]): If ``sample_style=="range"``, + a [min, max] interval from which to sample the angle (in degrees). + If ``sample_style=="choice"``, a list of angles to sample from + expand (bool): choose if the image should be resized to fit the whole + rotated image (default), or simply cropped + center (list[[float, float]]): If ``sample_style=="range"``, + a [[minx, miny], [maxx, maxy]] relative interval from which to sample the center, + [0, 0] being the top left of the image and [1, 1] the bottom right. + If ``sample_style=="choice"``, a list of centers to sample from + Default: None, which means that the center of rotation is the center of the image + center has no effect if expand=True because it only affects shifting + """ + super().__init__() + assert sample_style in ["range", "choice"], sample_style + self.is_range = sample_style == "range" + if isinstance(angle, (float, int)): + angle = (angle, angle) + if center is not None and isinstance(center[0], (float, int)): + center = (center, center) + self._init(locals()) + + def get_transform(self, img): + h, w = img.shape[:2] + center = None + if self.is_range: + angle = np.random.uniform(self.angle[0], self.angle[1]) + if self.center is not None: + center = ( + np.random.uniform(self.center[0][0], self.center[1][0]), + np.random.uniform(self.center[0][1], self.center[1][1]), + ) + else: + angle = np.random.choice(self.angle) + if self.center is not None: + center = np.random.choice(self.center) + + if center is not None: + center = (w * center[0], h * center[1]) # Convert to absolute coordinates + + return RotationTransform(h, w, angle, expand=self.expand, center=center, interp=self.interp) + + +class RandomCrop(TransformGen): + """ + Randomly crop a subimage out of an image. + """ + + def __init__(self, crop_type: str, crop_size): + """ + Args: + crop_type (str): one of "relative_range", "relative", "absolute". 
+ See `config/defaults.py` for explanation. + crop_size (tuple[float]): the relative ratio or absolute pixels of + height and width + """ + super().__init__() + assert crop_type in ["relative_range", "relative", "absolute"] + self._init(locals()) + + def get_transform(self, img): + h, w = img.shape[:2] + croph, cropw = self.get_crop_size((h, w)) + assert h >= croph and w >= cropw, "Shape computation in {} has bugs.".format(self) + h0 = np.random.randint(h - croph + 1) + w0 = np.random.randint(w - cropw + 1) + return CropTransform(w0, h0, cropw, croph) + + def get_crop_size(self, image_size): + """ + Args: + image_size (tuple): height, width + + Returns: + crop_size (tuple): height, width in absolute pixels + """ + h, w = image_size + if self.crop_type == "relative": + ch, cw = self.crop_size + return int(h * ch + 0.5), int(w * cw + 0.5) + elif self.crop_type == "relative_range": + crop_size = np.asarray(self.crop_size, dtype=np.float32) + ch, cw = crop_size + np.random.rand(2) * (1 - crop_size) + return int(h * ch + 0.5), int(w * cw + 0.5) + elif self.crop_type == "absolute": + return (min(self.crop_size[0], h), min(self.crop_size[1], w)) + else: + NotImplementedError("Unknown crop type {}".format(self.crop_type)) + + +class RandomExtent(TransformGen): + """ + Outputs an image by cropping a random "subrect" of the source image. + + The subrect can be parameterized to include pixels outside the source image, + in which case they will be set to zeros (i.e. black). The size of the output + image will vary with the size of the random subrect. + """ + + def __init__(self, scale_range, shift_range): + """ + Args: + output_size (h, w): Dimensions of output image + scale_range (l, h): Range of input-to-output size scaling factor + shift_range (x, y): Range of shifts of the cropped subrect. The rect + is shifted by [w / 2 * Uniform(-x, x), h / 2 * Uniform(-y, y)], + where (w, h) is the (width, height) of the input image. Set each + component to zero to crop at the image's center. + """ + super().__init__() + self._init(locals()) + + def get_transform(self, img): + img_h, img_w = img.shape[:2] + + # Initialize src_rect to fit the input image. + src_rect = np.array([-0.5 * img_w, -0.5 * img_h, 0.5 * img_w, 0.5 * img_h]) + + # Apply a random scaling to the src_rect. + src_rect *= np.random.uniform(self.scale_range[0], self.scale_range[1]) + + # Apply a random shift to the coordinates origin. + src_rect[0::2] += self.shift_range[0] * img_w * (np.random.rand() - 0.5) + src_rect[1::2] += self.shift_range[1] * img_h * (np.random.rand() - 0.5) + + # Map src_rect coordinates into image coordinates (center at corner). + src_rect[0::2] += 0.5 * img_w + src_rect[1::2] += 0.5 * img_h + + return ExtentTransform( + src_rect=(src_rect[0], src_rect[1], src_rect[2], src_rect[3]), + output_size=(int(src_rect[3] - src_rect[1]), int(src_rect[2] - src_rect[0])), + ) + + +class RandomContrast(TransformGen): + """ + Randomly transforms image contrast. + + Contrast intensity is uniformly sampled in (intensity_min, intensity_max). 
+ - intensity < 1 will reduce contrast + - intensity = 1 will preserve the input image + - intensity > 1 will increase contrast + + See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html + """ + + def __init__(self, intensity_min, intensity_max): + """ + Args: + intensity_min (float): Minimum augmentation + intensity_max (float): Maximum augmentation + """ + super().__init__() + self._init(locals()) + + def get_transform(self, img): + w = np.random.uniform(self.intensity_min, self.intensity_max) + return BlendTransform(src_image=img.mean(), src_weight=1 - w, dst_weight=w) + + +class RandomBrightness(TransformGen): + """ + Randomly transforms image brightness. + + Brightness intensity is uniformly sampled in (intensity_min, intensity_max). + - intensity < 1 will reduce brightness + - intensity = 1 will preserve the input image + - intensity > 1 will increase brightness + + See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html + """ + + def __init__(self, intensity_min, intensity_max): + """ + Args: + intensity_min (float): Minimum augmentation + intensity_max (float): Maximum augmentation + """ + super().__init__() + self._init(locals()) + + def get_transform(self, img): + w = np.random.uniform(self.intensity_min, self.intensity_max) + return BlendTransform(src_image=0, src_weight=1 - w, dst_weight=w) + + +class RandomSaturation(TransformGen): + """ + Randomly transforms image saturation. + + Saturation intensity is uniformly sampled in (intensity_min, intensity_max). + - intensity < 1 will reduce saturation (make the image more grayscale) + - intensity = 1 will preserve the input image + - intensity > 1 will increase saturation + + See: https://pillow.readthedocs.io/en/3.0.x/reference/ImageEnhance.html + """ + + def __init__(self, intensity_min, intensity_max): + """ + Args: + intensity_min (float): Minimum augmentation (1 preserves input). + intensity_max (float): Maximum augmentation (1 preserves input). + """ + super().__init__() + self._init(locals()) + + def get_transform(self, img): + assert img.shape[-1] == 3, "Saturation only works on RGB images" + w = np.random.uniform(self.intensity_min, self.intensity_max) + grayscale = img.dot([0.299, 0.587, 0.114])[:, :, np.newaxis] + return BlendTransform(src_image=grayscale, src_weight=1 - w, dst_weight=w) + + +class RandomLighting(TransformGen): + """ + Randomly transforms image color using fixed PCA over ImageNet. + + The degree of color jittering is randomly sampled via a normal distribution, + with standard deviation given by the scale parameter. + """ + + def __init__(self, scale): + """ + Args: + scale (float): Standard deviation of principal component weighting. + """ + super().__init__() + self._init(locals()) + self.eigen_vecs = np.array( + [[-0.5675, 0.7192, 0.4009], [-0.5808, -0.0045, -0.8140], [-0.5836, -0.6948, 0.4203]] + ) + self.eigen_vals = np.array([0.2175, 0.0188, 0.0045]) + + def get_transform(self, img): + assert img.shape[-1] == 3, "Saturation only works on RGB images" + weights = np.random.normal(scale=self.scale, size=3) + return BlendTransform( + src_image=self.eigen_vecs.dot(weights * self.eigen_vals), src_weight=1.0, dst_weight=1.0 + ) + + +def apply_transform_gens(transform_gens, img): + """ + Apply a list of :class:`TransformGen` or :class:`Transform` on the input image, and + returns the transformed image and a list of transforms. 
+ + We cannot simply create and return all transforms without + applying it to the image, because a subsequent transform may + need the output of the previous one. + + Args: + transform_gens (list): list of :class:`TransformGen` or :class:`Transform` instance to + be applied. + img (ndarray): uint8 or floating point images with 1 or 3 channels. + + Returns: + ndarray: the transformed image + TransformList: contain the transforms that's used. + """ + for g in transform_gens: + assert isinstance(g, (Transform, TransformGen)), g + + check_dtype(img) + + tfms = [] + for g in transform_gens: + tfm = g.get_transform(img) if isinstance(g, TransformGen) else g + assert isinstance( + tfm, Transform + ), "TransformGen {} must return an instance of Transform! Got {} instead".format(g, tfm) + img = tfm.apply_image(img) + tfms.append(tfm) + return img, TransformList(tfms) diff --git a/detectron2/engine/__init__.py b/detectron2/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6a4538da3e66593e4ef8916cd9cbca3c83b8c14e --- /dev/null +++ b/detectron2/engine/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +from .launch import * +from .train_loop import * + +__all__ = [k for k in globals().keys() if not k.startswith("_")] + + +# prefer to let hooks and defaults live in separate namespaces (therefore not in __all__) +# but still make them available here +from .hooks import * +from .defaults import * diff --git a/detectron2/engine/defaults.py b/detectron2/engine/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..db9ab68f21d77b9e3be730c4784abe665df3d96a --- /dev/null +++ b/detectron2/engine/defaults.py @@ -0,0 +1,531 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +""" +This file contains components with some default boilerplate logic user may need +in training / testing. They will not work for everyone, but many users may find them useful. + +The behavior of functions/classes in this file is subject to change, +since they are meant to represent the "common default behavior" people need in their projects. +""" + +import argparse +import logging +import os +import sys +from collections import OrderedDict +import torch +from fvcore.common.file_io import PathManager +from fvcore.nn.precise_bn import get_bn_modules +from torch.nn.parallel import DistributedDataParallel + +import detectron2.data.transforms as T +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.data import ( + MetadataCatalog, + build_detection_test_loader, + build_detection_train_loader, +) +from detectron2.evaluation import ( + DatasetEvaluator, + inference_on_dataset, + print_csv_format, + verify_results, +) +from detectron2.modeling import build_model +from detectron2.solver import build_lr_scheduler, build_optimizer +from detectron2.utils import comm +from detectron2.utils.collect_env import collect_env_info +from detectron2.utils.env import seed_all_rng +from detectron2.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter +from detectron2.utils.logger import setup_logger + +from . import hooks +from .train_loop import SimpleTrainer + +__all__ = ["default_argument_parser", "default_setup", "DefaultPredictor", "DefaultTrainer"] + + +def default_argument_parser(epilog=None): + """ + Create a parser with some common arguments used by detectron2 users. + + Args: + epilog (str): epilog passed to ArgumentParser describing the usage. 
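+
+    A typical usage sketch (``main`` is the caller's own entry point and
+    ``launch`` is the launcher exported by ``detectron2.engine``):
+
+    .. code-block:: python
+
+        args = default_argument_parser().parse_args()
+        launch(
+            main,
+            args.num_gpus,
+            num_machines=args.num_machines,
+            machine_rank=args.machine_rank,
+            dist_url=args.dist_url,
+            args=(args,),
+        )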
+ + Returns: + argparse.ArgumentParser: + """ + parser = argparse.ArgumentParser( + epilog=epilog + or f""" +Examples: + +Run on single machine: + $ {sys.argv[0]} --num-gpus 8 --config-file cfg.yaml MODEL.WEIGHTS /path/to/weight.pth + +Run on multiple machines: + (machine0)$ {sys.argv[0]} --machine-rank 0 --num-machines 2 --dist-url [--other-flags] + (machine1)$ {sys.argv[0]} --machine-rank 1 --num-machines 2 --dist-url [--other-flags] +""", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") + parser.add_argument( + "--resume", + action="store_true", + help="whether to attempt to resume from the checkpoint directory", + ) + parser.add_argument("--eval-only", action="store_true", help="perform evaluation only") + parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") + parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") + parser.add_argument( + "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" + ) + + # PyTorch still may leave orphan processes in multi-gpu training. + # Therefore we use a deterministic way to obtain port, + # so that users are aware of orphan processes by seeing the port occupied. + port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 + parser.add_argument( + "--dist-url", + default="tcp://127.0.0.1:{}".format(port), + help="initialization URL for pytorch distributed backend. See " + "https://pytorch.org/docs/stable/distributed.html for details.", + ) + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + return parser + + +def default_setup(cfg, args): + """ + Perform some basic common setups at the beginning of a job, including: + + 1. Set up the detectron2 logger + 2. Log basic information about environment, cmdline arguments, and config + 3. Backup the config to the output directory + + Args: + cfg (CfgNode): the full config to be used + args (argparse.NameSpace): the command line arguments to be logged + """ + output_dir = cfg.OUTPUT_DIR + if comm.is_main_process() and output_dir: + PathManager.mkdirs(output_dir) + + rank = comm.get_rank() + setup_logger(output_dir, distributed_rank=rank, name="fvcore") + logger = setup_logger(output_dir, distributed_rank=rank) + + logger.info("Rank of current process: {}. World size: {}".format(rank, comm.get_world_size())) + logger.info("Environment info:\n" + collect_env_info()) + + logger.info("Command line arguments: " + str(args)) + if hasattr(args, "config_file") and args.config_file != "": + logger.info( + "Contents of args.config_file={}:\n{}".format( + args.config_file, PathManager.open(args.config_file, "r").read() + ) + ) + + logger.info("Running with full config:\n{}".format(cfg)) + if comm.is_main_process() and output_dir: + # Note: some of our scripts may expect the existence of + # config.yaml in output directory + path = os.path.join(output_dir, "config.yaml") + with PathManager.open(path, "w") as f: + f.write(cfg.dump()) + logger.info("Full config saved to {}".format(path)) + + # make sure each worker has a different, yet deterministic seed if specified + seed_all_rng(None if cfg.SEED < 0 else cfg.SEED + rank) + + # cudnn benchmark has large overhead. It shouldn't be used considering the small size of + # typical validation set. 
+ if not (hasattr(args, "eval_only") and args.eval_only): + torch.backends.cudnn.benchmark = cfg.CUDNN_BENCHMARK + + +class DefaultPredictor: + """ + Create a simple end-to-end predictor with the given config that runs on + single device for a single input image. + + Compared to using the model directly, this class does the following additions: + + 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. + 2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`. + 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. + 4. Take one input image and produce a single output, instead of a batch. + + If you'd like to do anything more fancy, please refer to its source code + as examples to build and use the model manually. + + Attributes: + metadata (Metadata): the metadata of the underlying dataset, obtained from + cfg.DATASETS.TEST. + + Examples: + + .. code-block:: python + + pred = DefaultPredictor(cfg) + inputs = cv2.imread("input.jpg") + outputs = pred(inputs) + """ + + def __init__(self, cfg): + self.cfg = cfg.clone() # cfg can be modified by model + self.model = build_model(self.cfg) + self.model.eval() + self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + + checkpointer = DetectionCheckpointer(self.model) + checkpointer.load(cfg.MODEL.WEIGHTS) + + self.transform_gen = T.ResizeShortestEdge( + [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST + ) + + self.input_format = cfg.INPUT.FORMAT + assert self.input_format in ["RGB", "BGR"], self.input_format + + def __call__(self, original_image): + """ + Args: + original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). + + Returns: + predictions (dict): + the output of the model for one image only. + See :doc:`/tutorials/models` for details about the format. + """ + with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 + # Apply pre-processing to image. + if self.input_format == "RGB": + # whether the model expects BGR inputs or RGB + original_image = original_image[:, :, ::-1] + height, width = original_image.shape[:2] + image = self.transform_gen.get_transform(original_image).apply_image(original_image) + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + + inputs = {"image": image, "height": height, "width": width} + predictions = self.model([inputs])[0] + return predictions + + +class DefaultTrainer(SimpleTrainer): + """ + A trainer with default training logic. Compared to `SimpleTrainer`, it + contains the following logic in addition: + + 1. Create model, optimizer, scheduler, dataloader from the given config. + 2. Load a checkpoint or `cfg.MODEL.WEIGHTS`, if exists, when + `resume_or_load` is called. + 3. Register a few common hooks. + + It is created to simplify the **standard model training workflow** and reduce code boilerplate + for users who only need the standard training workflow, with standard features. + It means this class makes *many assumptions* about your training logic that + may easily become invalid in a new research. In fact, any assumptions beyond those made in the + :class:`SimpleTrainer` are too much for research. + + The code of this class has been annotated about restrictive assumptions it mades. + When they do not work for you, you're encouraged to: + + 1. Overwrite methods of this class, OR: + 2. Use :class:`SimpleTrainer`, which only does minimal SGD training and + nothing else. You can then add your own hooks if needed. OR: + 3. Write your own training loop similar to `tools/plain_train_net.py`. 
+ + Also note that the behavior of this class, like other functions/classes in + this file, is not stable, since it is meant to represent the "common default behavior". + It is only guaranteed to work well with the standard models and training workflow in detectron2. + To obtain more stable behavior, write your own training logic with other public APIs. + + Examples: + + .. code-block:: python + + trainer = DefaultTrainer(cfg) + trainer.resume_or_load() # load last checkpoint or MODEL.WEIGHTS + trainer.train() + + Attributes: + scheduler: + checkpointer (DetectionCheckpointer): + cfg (CfgNode): + """ + + def __init__(self, cfg): + """ + Args: + cfg (CfgNode): + """ + logger = logging.getLogger("detectron2") + if not logger.isEnabledFor(logging.INFO): # setup_logger is not called for d2 + setup_logger() + # Assume these objects must be constructed in this order. + model = self.build_model(cfg) + optimizer = self.build_optimizer(cfg, model) + data_loader = self.build_train_loader(cfg) + + # For training, wrap with DDP. But don't need this for inference. + if comm.get_world_size() > 1: + model = DistributedDataParallel( + model, device_ids=[comm.get_local_rank()], broadcast_buffers=False + ) + super().__init__(model, data_loader, optimizer) + + self.scheduler = self.build_lr_scheduler(cfg, optimizer) + # Assume no other objects need to be checkpointed. + # We can later make it checkpoint the stateful hooks + self.checkpointer = DetectionCheckpointer( + # Assume you want to save checkpoints together with logs/statistics + model, + cfg.OUTPUT_DIR, + optimizer=optimizer, + scheduler=self.scheduler, + ) + self.start_iter = 0 + self.max_iter = cfg.SOLVER.MAX_ITER + self.cfg = cfg + + self.register_hooks(self.build_hooks()) + + def resume_or_load(self, resume=True): + """ + If `resume==True`, and last checkpoint exists, resume from it and load all + checkpointables (eg. optimizer and scheduler). + + Otherwise, load the model specified by the config (skip all checkpointables). + + Args: + resume (bool): whether to do resume or not + """ + checkpoint = self.checkpointer.resume_or_load(self.cfg.MODEL.WEIGHTS, resume=resume) + self.start_iter = checkpoint.get("iteration", -1) if resume else -1 + # The checkpoint stores the training iteration that just finished, thus we start + # at the next iteration (or iter zero if there's no checkpoint). + self.start_iter += 1 + + def build_hooks(self): + """ + Build a list of default hooks, including timing, evaluation, + checkpointing, lr scheduling, precise BN, writing events. + + Returns: + list[HookBase]: + """ + cfg = self.cfg.clone() + cfg.defrost() + cfg.DATALOADER.NUM_WORKERS = 0 # save some memory and time for PreciseBN + + ret = [ + hooks.IterationTimer(), + hooks.LRScheduler(self.optimizer, self.scheduler), + hooks.PreciseBN( + # Run at the same freq as (but before) evaluation. + cfg.TEST.EVAL_PERIOD, + self.model, + # Build a new data loader to not affect training + self.build_train_loader(cfg), + cfg.TEST.PRECISE_BN.NUM_ITER, + ) + if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model) + else None, + ] + + # Do PreciseBN before checkpointer, because it updates the model and need to + # be saved by checkpointer. + # This is not always the best: if checkpointing has a different frequency, + # some checkpoints may have more precise statistics than others. 
+ if comm.is_main_process(): + ret.append(hooks.PeriodicCheckpointer(self.checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD)) + + def test_and_save_results(): + self._last_eval_results = self.test(self.cfg, self.model) + return self._last_eval_results + + # Do evaluation after checkpointer, because then if it fails, + # we can use the saved checkpoint to debug. + ret.append(hooks.EvalHook(cfg.TEST.EVAL_PERIOD, test_and_save_results)) + + if comm.is_main_process(): + # run writers in the end, so that evaluation metrics are written + ret.append(hooks.PeriodicWriter(self.build_writers(), period=20)) + return ret + + def build_writers(self): + """ + Build a list of writers to be used. By default it contains + writers that write metrics to the screen, + a json file, and a tensorboard event file respectively. + If you'd like a different list of writers, you can overwrite it in + your trainer. + + Returns: + list[EventWriter]: a list of :class:`EventWriter` objects. + + It is now implemented by: + + .. code-block:: python + + return [ + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), + TensorboardXWriter(self.cfg.OUTPUT_DIR), + ] + + """ + # Here the default print/log frequency of each writer is used. + return [ + # It may not always print what you want to see, since it prints "common" metrics only. + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), + TensorboardXWriter(self.cfg.OUTPUT_DIR), + ] + + def train(self): + """ + Run training. + + Returns: + OrderedDict of results, if evaluation is enabled. Otherwise None. + """ + super().train(self.start_iter, self.max_iter) + if len(self.cfg.TEST.EXPECTED_RESULTS) and comm.is_main_process(): + assert hasattr( + self, "_last_eval_results" + ), "No evaluation results obtained during training!" + verify_results(self.cfg, self._last_eval_results) + return self._last_eval_results + + @classmethod + def build_model(cls, cfg): + """ + Returns: + torch.nn.Module: + + It now calls :func:`detectron2.modeling.build_model`. + Overwrite it if you'd like a different model. + """ + model = build_model(cfg) + logger = logging.getLogger(__name__) + logger.info("Model:\n{}".format(model)) + return model + + @classmethod + def build_optimizer(cls, cfg, model): + """ + Returns: + torch.optim.Optimizer: + + It now calls :func:`detectron2.solver.build_optimizer`. + Overwrite it if you'd like a different optimizer. + """ + return build_optimizer(cfg, model) + + @classmethod + def build_lr_scheduler(cls, cfg, optimizer): + """ + It now calls :func:`detectron2.solver.build_lr_scheduler`. + Overwrite it if you'd like a different scheduler. + """ + return build_lr_scheduler(cfg, optimizer) + + @classmethod + def build_train_loader(cls, cfg): + """ + Returns: + iterable + + It now calls :func:`detectron2.data.build_detection_train_loader`. + Overwrite it if you'd like a different data loader. + """ + return build_detection_train_loader(cfg) + + @classmethod + def build_test_loader(cls, cfg, dataset_name): + """ + Returns: + iterable + + It now calls :func:`detectron2.data.build_detection_test_loader`. + Overwrite it if you'd like a different data loader. + """ + return build_detection_test_loader(cfg, dataset_name) + + @classmethod + def build_evaluator(cls, cfg, dataset_name): + """ + Returns: + DatasetEvaluator or None + + It is not implemented by default. 
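+
+        A sketch of one possible override (``COCOEvaluator`` is one of the
+        evaluators in :mod:`detectron2.evaluation`; whether it is appropriate
+        depends on your dataset):
+
+        .. code-block:: python
+
+            @classmethod
+            def build_evaluator(cls, cfg, dataset_name):
+                from detectron2.evaluation import COCOEvaluator
+                return COCOEvaluator(dataset_name, cfg, True, output_dir=cfg.OUTPUT_DIR)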
+ """ + raise NotImplementedError( + """ +If you want DefaultTrainer to automatically run evaluation, +please implement `build_evaluator()` in subclasses (see train_net.py for example). +Alternatively, you can call evaluation functions yourself (see Colab balloon tutorial for example). +""" + ) + + @classmethod + def test(cls, cfg, model, evaluators=None): + """ + Args: + cfg (CfgNode): + model (nn.Module): + evaluators (list[DatasetEvaluator] or None): if None, will call + :meth:`build_evaluator`. Otherwise, must have the same length as + `cfg.DATASETS.TEST`. + + Returns: + dict: a dict of result metrics + """ + logger = logging.getLogger(__name__) + if isinstance(evaluators, DatasetEvaluator): + evaluators = [evaluators] + if evaluators is not None: + assert len(cfg.DATASETS.TEST) == len(evaluators), "{} != {}".format( + len(cfg.DATASETS.TEST), len(evaluators) + ) + + results = OrderedDict() + for idx, dataset_name in enumerate(cfg.DATASETS.TEST): + data_loader = cls.build_test_loader(cfg, dataset_name) + # When evaluators are passed in as arguments, + # implicitly assume that evaluators can be created before data_loader. + if evaluators is not None: + evaluator = evaluators[idx] + else: + try: + evaluator = cls.build_evaluator(cfg, dataset_name) + except NotImplementedError: + logger.warn( + "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, " + "or implement its `build_evaluator` method." + ) + results[dataset_name] = {} + continue + results_i = inference_on_dataset(model, data_loader, evaluator) + results[dataset_name] = results_i + if comm.is_main_process(): + assert isinstance( + results_i, dict + ), "Evaluator must return a dict on the main process. Got {} instead.".format( + results_i + ) + logger.info("Evaluation results for {} in csv format:".format(dataset_name)) + print_csv_format(results_i) + + if len(results) == 1: + results = list(results.values())[0] + return results diff --git a/detectron2/engine/hooks.py b/detectron2/engine/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..e5085b4561302d2328ab505568dec4e9fc5ee0ad --- /dev/null +++ b/detectron2/engine/hooks.py @@ -0,0 +1,427 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import datetime +import itertools +import logging +import os +import tempfile +import time +from collections import Counter +import torch +from fvcore.common.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer +from fvcore.common.file_io import PathManager +from fvcore.common.timer import Timer +from fvcore.nn.precise_bn import get_bn_modules, update_bn_stats + +import detectron2.utils.comm as comm +from detectron2.evaluation.testing import flatten_results_dict +from detectron2.utils.events import EventStorage, EventWriter + +from .train_loop import HookBase + +__all__ = [ + "CallbackHook", + "IterationTimer", + "PeriodicWriter", + "PeriodicCheckpointer", + "LRScheduler", + "AutogradProfiler", + "EvalHook", + "PreciseBN", +] + + +""" +Implement some common hooks. +""" + + +class CallbackHook(HookBase): + """ + Create a hook using callback functions provided by the user. + """ + + def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None): + """ + Each argument is a function that takes one argument: the trainer. 
+ """ + self._before_train = before_train + self._before_step = before_step + self._after_step = after_step + self._after_train = after_train + + def before_train(self): + if self._before_train: + self._before_train(self.trainer) + + def after_train(self): + if self._after_train: + self._after_train(self.trainer) + # The functions may be closures that hold reference to the trainer + # Therefore, delete them to avoid circular reference. + del self._before_train, self._after_train + del self._before_step, self._after_step + + def before_step(self): + if self._before_step: + self._before_step(self.trainer) + + def after_step(self): + if self._after_step: + self._after_step(self.trainer) + + +class IterationTimer(HookBase): + """ + Track the time spent for each iteration (each run_step call in the trainer). + Print a summary in the end of training. + + This hook uses the time between the call to its :meth:`before_step` + and :meth:`after_step` methods. + Under the convention that :meth:`before_step` of all hooks should only + take negligible amount of time, the :class:`IterationTimer` hook should be + placed at the beginning of the list of hooks to obtain accurate timing. + """ + + def __init__(self, warmup_iter=3): + """ + Args: + warmup_iter (int): the number of iterations at the beginning to exclude + from timing. + """ + self._warmup_iter = warmup_iter + self._step_timer = Timer() + self._start_time = time.perf_counter() + self._total_timer = Timer() + + def before_train(self): + self._start_time = time.perf_counter() + self._total_timer.reset() + self._total_timer.pause() + + def after_train(self): + logger = logging.getLogger(__name__) + total_time = time.perf_counter() - self._start_time + total_time_minus_hooks = self._total_timer.seconds() + hook_time = total_time - total_time_minus_hooks + + num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter + + if num_iter > 0 and total_time_minus_hooks > 0: + # Speed is meaningful only after warmup + # NOTE this format is parsed by grep in some scripts + logger.info( + "Overall training speed: {} iterations in {} ({:.4f} s / it)".format( + num_iter, + str(datetime.timedelta(seconds=int(total_time_minus_hooks))), + total_time_minus_hooks / num_iter, + ) + ) + + logger.info( + "Total training time: {} ({} on hooks)".format( + str(datetime.timedelta(seconds=int(total_time))), + str(datetime.timedelta(seconds=int(hook_time))), + ) + ) + + def before_step(self): + self._step_timer.reset() + self._total_timer.resume() + + def after_step(self): + # +1 because we're in after_step + iter_done = self.trainer.iter - self.trainer.start_iter + 1 + if iter_done >= self._warmup_iter: + sec = self._step_timer.seconds() + self.trainer.storage.put_scalars(time=sec) + else: + self._start_time = time.perf_counter() + self._total_timer.reset() + + self._total_timer.pause() + + +class PeriodicWriter(HookBase): + """ + Write events to EventStorage periodically. + + It is executed every ``period`` iterations and after the last iteration. 
+ """ + + def __init__(self, writers, period=20): + """ + Args: + writers (list[EventWriter]): a list of EventWriter objects + period (int): + """ + self._writers = writers + for w in writers: + assert isinstance(w, EventWriter), w + self._period = period + + def after_step(self): + if (self.trainer.iter + 1) % self._period == 0 or ( + self.trainer.iter == self.trainer.max_iter - 1 + ): + for writer in self._writers: + writer.write() + + def after_train(self): + for writer in self._writers: + writer.close() + + +class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase): + """ + Same as :class:`detectron2.checkpoint.PeriodicCheckpointer`, but as a hook. + + Note that when used as a hook, + it is unable to save additional data other than what's defined + by the given `checkpointer`. + + It is executed every ``period`` iterations and after the last iteration. + """ + + def before_train(self): + self.max_iter = self.trainer.max_iter + + def after_step(self): + # No way to use **kwargs + self.step(self.trainer.iter) + + +class LRScheduler(HookBase): + """ + A hook which executes a torch builtin LR scheduler and summarizes the LR. + It is executed after every iteration. + """ + + def __init__(self, optimizer, scheduler): + """ + Args: + optimizer (torch.optim.Optimizer): + scheduler (torch.optim._LRScheduler) + """ + self._optimizer = optimizer + self._scheduler = scheduler + + # NOTE: some heuristics on what LR to summarize + # summarize the param group with most parameters + largest_group = max(len(g["params"]) for g in optimizer.param_groups) + + if largest_group == 1: + # If all groups have one parameter, + # then find the most common initial LR, and use it for summary + lr_count = Counter([g["lr"] for g in optimizer.param_groups]) + lr = lr_count.most_common()[0][0] + for i, g in enumerate(optimizer.param_groups): + if g["lr"] == lr: + self._best_param_group_id = i + break + else: + for i, g in enumerate(optimizer.param_groups): + if len(g["params"]) == largest_group: + self._best_param_group_id = i + break + + def after_step(self): + lr = self._optimizer.param_groups[self._best_param_group_id]["lr"] + self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False) + self._scheduler.step() + + +class AutogradProfiler(HookBase): + """ + A hook which runs `torch.autograd.profiler.profile`. + + Examples: + + .. code-block:: python + + hooks.AutogradProfiler( + lambda trainer: trainer.iter > 10 and trainer.iter < 20, self.cfg.OUTPUT_DIR + ) + + The above example will run the profiler for iteration 10~20 and dump + results to ``OUTPUT_DIR``. We did not profile the first few iterations + because they are typically slower than the rest. + The result files can be loaded in the ``chrome://tracing`` page in chrome browser. + + Note: + When used together with NCCL on older version of GPUs, + autograd profiler may cause deadlock because it unnecessarily allocates + memory on every device it sees. The memory management calls, if + interleaved with NCCL calls, lead to deadlock on GPUs that do not + support `cudaLaunchCooperativeKernelMultiDevice`. + """ + + def __init__(self, enable_predicate, output_dir, *, use_cuda=True): + """ + Args: + enable_predicate (callable[trainer -> bool]): a function which takes a trainer, + and returns whether to enable the profiler. + It will be called once every step, and can be used to select which steps to profile. + output_dir (str): the output directory to dump tracing files. + use_cuda (bool): same as in `torch.autograd.profiler.profile`. 
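        For example, to profile iterations 11 to 19 on the main process only
        (a sketch, assuming ``trainer`` and ``cfg`` exist):

        .. code-block:: python

            from detectron2.utils import comm

            profiler = AutogradProfiler(
                lambda t: comm.is_main_process() and 10 < t.iter < 20, cfg.OUTPUT_DIR
            )
            trainer.register_hooks([profiler])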
+ """ + self._enable_predicate = enable_predicate + self._use_cuda = use_cuda + self._output_dir = output_dir + + def before_step(self): + if self._enable_predicate(self.trainer): + self._profiler = torch.autograd.profiler.profile(use_cuda=self._use_cuda) + self._profiler.__enter__() + else: + self._profiler = None + + def after_step(self): + if self._profiler is None: + return + self._profiler.__exit__(None, None, None) + PathManager.mkdirs(self._output_dir) + out_file = os.path.join( + self._output_dir, "profiler-trace-iter{}.json".format(self.trainer.iter) + ) + if "://" not in out_file: + self._profiler.export_chrome_trace(out_file) + else: + # Support non-posix filesystems + with tempfile.TemporaryDirectory(prefix="detectron2_profiler") as d: + tmp_file = os.path.join(d, "tmp.json") + self._profiler.export_chrome_trace(tmp_file) + with open(tmp_file) as f: + content = f.read() + with PathManager.open(out_file, "w") as f: + f.write(content) + + +class EvalHook(HookBase): + """ + Run an evaluation function periodically, and at the end of training. + + It is executed every ``eval_period`` iterations and after the last iteration. + """ + + def __init__(self, eval_period, eval_function): + """ + Args: + eval_period (int): the period to run `eval_function`. + eval_function (callable): a function which takes no arguments, and + returns a nested dict of evaluation metrics. + + Note: + This hook must be enabled in all or none workers. + If you would like only certain workers to perform evaluation, + give other workers a no-op function (`eval_function=lambda: None`). + """ + self._period = eval_period + self._func = eval_function + + def _do_eval(self): + results = self._func() + + if results: + assert isinstance( + results, dict + ), "Eval function must return a dict. Got {} instead.".format(results) + + flattened_results = flatten_results_dict(results) + for k, v in flattened_results.items(): + try: + v = float(v) + except Exception: + raise ValueError( + "[EvalHook] eval_function should return a nested dict of float. " + "Got '{}: {}' instead.".format(k, v) + ) + self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False) + + # Evaluation may take different time among workers. + # A barrier make them start the next iteration together. + comm.synchronize() + + def after_step(self): + next_iter = self.trainer.iter + 1 + is_final = next_iter == self.trainer.max_iter + if is_final or (self._period > 0 and next_iter % self._period == 0): + self._do_eval() + + def after_train(self): + # func is likely a closure that holds reference to the trainer + # therefore we clean it to avoid circular reference in the end + del self._func + + +class PreciseBN(HookBase): + """ + The standard implementation of BatchNorm uses EMA in inference, which is + sometimes suboptimal. + This class computes the true average of statistics rather than the moving average, + and put true averages to every BN layer in the given model. + + It is executed every ``period`` iterations and after the last iteration. + """ + + def __init__(self, period, model, data_loader, num_iter): + """ + Args: + period (int): the period this hook is run, or 0 to not run during training. + The hook will always run in the end of training. + model (nn.Module): a module whose all BN layers in training mode will be + updated by precise BN. + Note that user is responsible for ensuring the BN layers to be + updated are in training mode when this hook is triggered. + data_loader (iterable): it will produce data to be run by `model(data)`. 
+ num_iter (int): number of iterations used to compute the precise + statistics. + """ + self._logger = logging.getLogger(__name__) + if len(get_bn_modules(model)) == 0: + self._logger.info( + "PreciseBN is disabled because model does not contain BN layers in training mode." + ) + self._disabled = True + return + + self._model = model + self._data_loader = data_loader + self._num_iter = num_iter + self._period = period + self._disabled = False + + self._data_iter = None + + def after_step(self): + next_iter = self.trainer.iter + 1 + is_final = next_iter == self.trainer.max_iter + if is_final or (self._period > 0 and next_iter % self._period == 0): + self.update_stats() + + def update_stats(self): + """ + Update the model with precise statistics. Users can manually call this method. + """ + if self._disabled: + return + + if self._data_iter is None: + self._data_iter = iter(self._data_loader) + + def data_loader(): + for num_iter in itertools.count(1): + if num_iter % 100 == 0: + self._logger.info( + "Running precise-BN ... {}/{} iterations.".format(num_iter, self._num_iter) + ) + # This way we can reuse the same iterator + yield next(self._data_iter) + + with EventStorage(): # capture events in a new storage to discard them + self._logger.info( + "Running precise-BN for {} iterations... ".format(self._num_iter) + + "Note that this could produce different statistics every time." + ) + update_bn_stats(self._model, data_loader(), self._num_iter) diff --git a/detectron2/engine/launch.py b/detectron2/engine/launch.py new file mode 100644 index 0000000000000000000000000000000000000000..9efbb0395d2c788d8cfe2cbbf66cde6ddc053585 --- /dev/null +++ b/detectron2/engine/launch.py @@ -0,0 +1,89 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +from detectron2.utils import comm + +__all__ = ["launch"] + + +def _find_free_port(): + import socket + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # Binding to port 0 will cause the OS to find an available port for us + sock.bind(("", 0)) + port = sock.getsockname()[1] + sock.close() + # NOTE: there is still a chance the port could be taken by other processes. + return port + + +def launch(main_func, num_gpus_per_machine, num_machines=1, machine_rank=0, dist_url=None, args=()): + """ + Args: + main_func: a function that will be called by `main_func(*args)` + num_machines (int): the total number of machines + machine_rank (int): the rank of this machine (one per machine) + dist_url (str): url to connect to for distributed jobs, including protocol + e.g. "tcp://127.0.0.1:8686". + Can be set to "auto" to automatically select a free port on localhost + args (tuple): arguments passed to main_func + """ + world_size = num_machines * num_gpus_per_machine + if world_size > 1: + # https://github.com/pytorch/pytorch/pull/14391 + # TODO prctl in spawned processes + + if dist_url == "auto": + assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs." + port = _find_free_port() + dist_url = f"tcp://127.0.0.1:{port}" + if num_machines > 1 and dist_url.startswith("file://"): + logger = logging.getLogger(__name__) + logger.warning( + "file:// is not a reliable init_method in multi-machine jobs. 
Prefer tcp://" + ) + + mp.spawn( + _distributed_worker, + nprocs=num_gpus_per_machine, + args=(main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args), + daemon=False, + ) + else: + main_func(*args) + + +def _distributed_worker( + local_rank, main_func, world_size, num_gpus_per_machine, machine_rank, dist_url, args +): + assert torch.cuda.is_available(), "cuda is not available. Please check your installation." + global_rank = machine_rank * num_gpus_per_machine + local_rank + try: + dist.init_process_group( + backend="NCCL", init_method=dist_url, world_size=world_size, rank=global_rank + ) + except Exception as e: + logger = logging.getLogger(__name__) + logger.error("Process group URL: {}".format(dist_url)) + raise e + # synchronize is needed here to prevent a possible timeout after calling init_process_group + # See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172 + comm.synchronize() + + assert num_gpus_per_machine <= torch.cuda.device_count() + torch.cuda.set_device(local_rank) + + # Setup the local process group (which contains ranks within the same machine) + assert comm._LOCAL_PROCESS_GROUP is None + num_machines = world_size // num_gpus_per_machine + for i in range(num_machines): + ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine)) + pg = dist.new_group(ranks_on_i) + if i == machine_rank: + comm._LOCAL_PROCESS_GROUP = pg + + main_func(*args) diff --git a/detectron2/engine/train_loop.py b/detectron2/engine/train_loop.py new file mode 100644 index 0000000000000000000000000000000000000000..453c9acfde2d65a182fbf18a6bce4b4583df5ca5 --- /dev/null +++ b/detectron2/engine/train_loop.py @@ -0,0 +1,273 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import logging +import numpy as np +import time +import weakref +import torch + +import detectron2.utils.comm as comm +from detectron2.utils.events import EventStorage + +__all__ = ["HookBase", "TrainerBase", "SimpleTrainer"] + + +class HookBase: + """ + Base class for hooks that can be registered with :class:`TrainerBase`. + + Each hook can implement 4 methods. The way they are called is demonstrated + in the following snippet: + + .. code-block:: python + + hook.before_train() + for iter in range(start_iter, max_iter): + hook.before_step() + trainer.run_step() + hook.after_step() + hook.after_train() + + Notes: + 1. In the hook method, users can access `self.trainer` to access more + properties about the context (e.g., current iteration). + + 2. A hook that does something in :meth:`before_step` can often be + implemented equivalently in :meth:`after_step`. + If the hook takes non-trivial time, it is strongly recommended to + implement the hook in :meth:`after_step` instead of :meth:`before_step`. + The convention is that :meth:`before_step` should only take negligible time. + + Following this convention will allow hooks that do care about the difference + between :meth:`before_step` and :meth:`after_step` (e.g., timer) to + function properly. + + Attributes: + trainer: A weak reference to the trainer object. Set by the trainer when the hook is + registered. + """ + + def before_train(self): + """ + Called before the first iteration. + """ + pass + + def after_train(self): + """ + Called after the last iteration. + """ + pass + + def before_step(self): + """ + Called before each iteration. + """ + pass + + def after_step(self): + """ + Called after each iteration. 
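        Most custom hooks override this method. A minimal sketch:

        .. code-block:: python

            class IterationPrinter(HookBase):
                def after_step(self):
                    if (self.trainer.iter + 1) % 100 == 0:
                        print("finished iteration", self.trainer.iter)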
+ """ + pass + + +class TrainerBase: + """ + Base class for iterative trainer with hooks. + + The only assumption we made here is: the training runs in a loop. + A subclass can implement what the loop is. + We made no assumptions about the existence of dataloader, optimizer, model, etc. + + Attributes: + iter(int): the current iteration. + + start_iter(int): The iteration to start with. + By convention the minimum possible value is 0. + + max_iter(int): The iteration to end training. + + storage(EventStorage): An EventStorage that's opened during the course of training. + """ + + def __init__(self): + self._hooks = [] + + def register_hooks(self, hooks): + """ + Register hooks to the trainer. The hooks are executed in the order + they are registered. + + Args: + hooks (list[Optional[HookBase]]): list of hooks + """ + hooks = [h for h in hooks if h is not None] + for h in hooks: + assert isinstance(h, HookBase) + # To avoid circular reference, hooks and trainer cannot own each other. + # This normally does not matter, but will cause memory leak if the + # involved objects contain __del__: + # See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/ + h.trainer = weakref.proxy(self) + self._hooks.extend(hooks) + + def train(self, start_iter: int, max_iter: int): + """ + Args: + start_iter, max_iter (int): See docs above + """ + logger = logging.getLogger(__name__) + logger.info("Starting training from iteration {}".format(start_iter)) + + self.iter = self.start_iter = start_iter + self.max_iter = max_iter + + with EventStorage(start_iter) as self.storage: + try: + self.before_train() + for self.iter in range(start_iter, max_iter): + self.before_step() + self.run_step() + self.after_step() + except Exception: + logger.exception("Exception during training:") + raise + finally: + self.after_train() + + def before_train(self): + for h in self._hooks: + h.before_train() + + def after_train(self): + for h in self._hooks: + h.after_train() + + def before_step(self): + for h in self._hooks: + h.before_step() + + def after_step(self): + for h in self._hooks: + h.after_step() + # this guarantees, that in each hook's after_step, storage.iter == trainer.iter + self.storage.step() + + def run_step(self): + raise NotImplementedError + + +class SimpleTrainer(TrainerBase): + """ + A simple trainer for the most common type of task: + single-cost single-optimizer single-data-source iterative optimization. + It assumes that every step, you: + + 1. Compute the loss with a data from the data_loader. + 2. Compute the gradients with the above loss. + 3. Update the model with the optimizer. + + If you want to do anything fancier than this, + either subclass TrainerBase and implement your own `run_step`, + or write your own training loop. + """ + + def __init__(self, model, data_loader, optimizer): + """ + Args: + model: a torch Module. Takes a data from data_loader and returns a + dict of losses. + data_loader: an iterable. Contains data to be used to call model. + optimizer: a torch optimizer. + """ + super().__init__() + + """ + We set the model to training mode in the trainer. + However it's valid to train a model that's in eval mode. + If you want your model (or a submodule of it) to behave + like evaluation during training, you can overwrite its train() method. 
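        For example, a hypothetical model that keeps its backbone in eval mode
        during training could be sketched as:

            import torch.nn as nn

            class FrozenBackboneModel(nn.Module):
                def train(self, mode=True):
                    super().train(mode)
                    # keep the (assumed) backbone submodule, including its BN
                    # layers, in eval mode even while the rest of the model trains
                    self.backbone.eval()
                    return self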
+ """ + model.train() + + self.model = model + self.data_loader = data_loader + self._data_loader_iter = iter(data_loader) + self.optimizer = optimizer + + def run_step(self): + """ + Implement the standard training logic described above. + """ + assert self.model.training, "[SimpleTrainer] model was changed to eval mode!" + start = time.perf_counter() + """ + If you want to do something with the data, you can wrap the dataloader. + """ + data = next(self._data_loader_iter) + data_time = time.perf_counter() - start + + """ + If you want to do something with the losses, you can wrap the model. + """ + loss_dict = self.model(data) + losses = sum(loss_dict.values()) + self._detect_anomaly(losses, loss_dict) + + metrics_dict = loss_dict + metrics_dict["data_time"] = data_time + self._write_metrics(metrics_dict) + + """ + If you need to accumulate gradients or something similar, you can + wrap the optimizer with your custom `zero_grad()` method. + """ + self.optimizer.zero_grad() + losses.backward() + + """ + If you need gradient clipping/scaling or other processing, you can + wrap the optimizer with your custom `step()` method. + """ + self.optimizer.step() + + def _detect_anomaly(self, losses, loss_dict): + if not torch.isfinite(losses).all(): + raise FloatingPointError( + "Loss became infinite or NaN at iteration={}!\nloss_dict = {}".format( + self.iter, loss_dict + ) + ) + + def _write_metrics(self, metrics_dict: dict): + """ + Args: + metrics_dict (dict): dict of scalar metrics + """ + metrics_dict = { + k: v.detach().cpu().item() if isinstance(v, torch.Tensor) else float(v) + for k, v in metrics_dict.items() + } + # gather metrics among all workers for logging + # This assumes we do DDP-style training, which is currently the only + # supported method in detectron2. + all_metrics_dict = comm.gather(metrics_dict) + + if comm.is_main_process(): + if "data_time" in all_metrics_dict[0]: + # data_time among workers can have high variance. The actual latency + # caused by data_time is the maximum among workers. + data_time = np.max([x.pop("data_time") for x in all_metrics_dict]) + self.storage.put_scalar("data_time", data_time) + + # average the rest metrics + metrics_dict = { + k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys() + } + total_losses_reduced = sum(loss for loss in metrics_dict.values()) + + self.storage.put_scalar("total_loss", total_losses_reduced) + if len(metrics_dict) > 1: + self.storage.put_scalars(**metrics_dict) diff --git a/detectron2/evaluation/__init__.py b/detectron2/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f1d2f1001af2eb46060db362a94d9dae26e3fb4e --- /dev/null +++ b/detectron2/evaluation/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from .cityscapes_evaluation import CityscapesInstanceEvaluator, CityscapesSemSegEvaluator +from .coco_evaluation import COCOEvaluator +from .rotated_coco_evaluation import RotatedCOCOEvaluator +from .evaluator import DatasetEvaluator, DatasetEvaluators, inference_context, inference_on_dataset +from .lvis_evaluation import LVISEvaluator +from .panoptic_evaluation import COCOPanopticEvaluator +from .pascal_voc_evaluation import PascalVOCDetectionEvaluator +from .sem_seg_evaluation import SemSegEvaluator +from .testing import print_csv_format, verify_results + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/detectron2/evaluation/cityscapes_evaluation.py b/detectron2/evaluation/cityscapes_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..f6287a8980b10d9d13f0f0e6a0f0e1a16ff3566c --- /dev/null +++ b/detectron2/evaluation/cityscapes_evaluation.py @@ -0,0 +1,187 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import glob +import logging +import numpy as np +import os +import tempfile +from collections import OrderedDict +import torch +from fvcore.common.file_io import PathManager +from PIL import Image + +from detectron2.data import MetadataCatalog +from detectron2.utils import comm + +from .evaluator import DatasetEvaluator + + +class CityscapesEvaluator(DatasetEvaluator): + """ + Base class for evaluation using cityscapes API. + """ + + def __init__(self, dataset_name): + """ + Args: + dataset_name (str): the name of the dataset. + It must have the following metadata associated with it: + "thing_classes", "gt_dir". + """ + self._metadata = MetadataCatalog.get(dataset_name) + self._cpu_device = torch.device("cpu") + self._logger = logging.getLogger(__name__) + + def reset(self): + self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_") + self._temp_dir = self._working_dir.name + # All workers will write to the same results directory + # TODO this does not work in distributed training + self._temp_dir = comm.all_gather(self._temp_dir)[0] + if self._temp_dir != self._working_dir.name: + self._working_dir.cleanup() + self._logger.info( + "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir) + ) + + +class CityscapesInstanceEvaluator(CityscapesEvaluator): + """ + Evaluate instance segmentation results using cityscapes API. + + Note: + * It does not work in multi-machine distributed training. + * It contains a synchronization, therefore has to be used on all ranks. + * Only the main process runs evaluation. 
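    A typical use, sketched here under the assumption that the builtin
    ``cityscapes_fine_instance_seg_val`` dataset has been set up and that
    ``model`` and ``data_loader`` are built elsewhere:

    .. code-block:: python

        from detectron2.evaluation import CityscapesInstanceEvaluator, inference_on_dataset

        evaluator = CityscapesInstanceEvaluator("cityscapes_fine_instance_seg_val")
        results = inference_on_dataset(model, data_loader, evaluator)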
+ """ + + def process(self, inputs, outputs): + from cityscapesscripts.helpers.labels import name2label + + for input, output in zip(inputs, outputs): + file_name = input["file_name"] + basename = os.path.splitext(os.path.basename(file_name))[0] + pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt") + + output = output["instances"].to(self._cpu_device) + num_instances = len(output) + with open(pred_txt, "w") as fout: + for i in range(num_instances): + pred_class = output.pred_classes[i] + classes = self._metadata.thing_classes[pred_class] + class_id = name2label[classes].id + score = output.scores[i] + mask = output.pred_masks[i].numpy().astype("uint8") + png_filename = os.path.join( + self._temp_dir, basename + "_{}_{}.png".format(i, classes) + ) + + Image.fromarray(mask * 255).save(png_filename) + fout.write("{} {} {}\n".format(os.path.basename(png_filename), class_id, score)) + + def evaluate(self): + """ + Returns: + dict: has a key "segm", whose value is a dict of "AP" and "AP50". + """ + comm.synchronize() + if comm.get_rank() > 0: + return + import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval + + self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) + + # set some global states in cityscapes evaluation API, before evaluating + cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) + cityscapes_eval.args.predictionWalk = None + cityscapes_eval.args.JSONOutput = False + cityscapes_eval.args.colorized = False + cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json") + + # These lines are adopted from + # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa + gt_dir = PathManager.get_local_path(self._metadata.gt_dir) + groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png")) + assert len( + groundTruthImgList + ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( + cityscapes_eval.args.groundTruthSearch + ) + predictionImgList = [] + for gt in groundTruthImgList: + predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args)) + results = cityscapes_eval.evaluateImgLists( + predictionImgList, groundTruthImgList, cityscapes_eval.args + )["averages"] + + ret = OrderedDict() + ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100} + self._working_dir.cleanup() + return ret + + +class CityscapesSemSegEvaluator(CityscapesEvaluator): + """ + Evaluate semantic segmentation results using cityscapes API. + + Note: + * It does not work in multi-machine distributed training. + * It contains a synchronization, therefore has to be used on all ranks. + * Only the main process runs evaluation. 
+ """ + + def process(self, inputs, outputs): + from cityscapesscripts.helpers.labels import trainId2label + + for input, output in zip(inputs, outputs): + file_name = input["file_name"] + basename = os.path.splitext(os.path.basename(file_name))[0] + pred_filename = os.path.join(self._temp_dir, basename + "_pred.png") + + output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy() + pred = 255 * np.ones(output.shape, dtype=np.uint8) + for train_id, label in trainId2label.items(): + if label.ignoreInEval: + continue + pred[output == train_id] = label.id + Image.fromarray(pred).save(pred_filename) + + def evaluate(self): + comm.synchronize() + if comm.get_rank() > 0: + return + # Load the Cityscapes eval script *after* setting the required env var, + # since the script reads CITYSCAPES_DATASET into global variables at load time. + import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval + + self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) + + # set some global states in cityscapes evaluation API, before evaluating + cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) + cityscapes_eval.args.predictionWalk = None + cityscapes_eval.args.JSONOutput = False + cityscapes_eval.args.colorized = False + + # These lines are adopted from + # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa + gt_dir = PathManager.get_local_path(self._metadata.gt_dir) + groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png")) + assert len( + groundTruthImgList + ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( + cityscapes_eval.args.groundTruthSearch + ) + predictionImgList = [] + for gt in groundTruthImgList: + predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt)) + results = cityscapes_eval.evaluateImgLists( + predictionImgList, groundTruthImgList, cityscapes_eval.args + ) + ret = OrderedDict() + ret["sem_seg"] = { + "IoU": 100.0 * results["averageScoreClasses"], + "iIoU": 100.0 * results["averageScoreInstClasses"], + "IoU_sup": 100.0 * results["averageScoreCategories"], + "iIoU_sup": 100.0 * results["averageScoreInstCategories"], + } + self._working_dir.cleanup() + return ret diff --git a/detectron2/evaluation/coco_evaluation.py b/detectron2/evaluation/coco_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..08f9a556473d4145cf1655c4865e9ac66cadf0a1 --- /dev/null +++ b/detectron2/evaluation/coco_evaluation.py @@ -0,0 +1,512 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import contextlib +import copy +import io +import itertools +import json +import logging +import numpy as np +import os +import pickle +from collections import OrderedDict +import pycocotools.mask as mask_util +import torch +from fvcore.common.file_io import PathManager +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval +from tabulate import tabulate + +import detectron2.utils.comm as comm +from detectron2.data import MetadataCatalog +from detectron2.data.datasets.coco import convert_to_coco_json +from detectron2.structures import Boxes, BoxMode, pairwise_iou +from detectron2.utils.logger import create_small_table + +from .evaluator import DatasetEvaluator + + +class COCOEvaluator(DatasetEvaluator): + """ + Evaluate object proposal, instance detection/segmentation, keypoint detection + outputs using COCO's metrics and APIs. + """ + + def __init__(self, dataset_name, cfg, distributed, output_dir=None): + """ + Args: + dataset_name (str): name of the dataset to be evaluated. + It must have either the following corresponding metadata: + + "json_file": the path to the COCO format annotation + + Or it must be in detectron2's standard dataset format + so it can be converted to COCO format automatically. + cfg (CfgNode): config instance + distributed (True): if True, will collect results from all ranks and run evaluation + in the main process. + Otherwise, will evaluate the results in the current process. + output_dir (str): optional, an output directory to dump all + results predicted on the dataset. The dump contains two files: + + 1. "instance_predictions.pth" a file in torch serialization + format that contains all the raw original predictions. + 2. "coco_instances_results.json" a json file in COCO's result + format. + """ + self._tasks = self._tasks_from_config(cfg) + self._distributed = distributed + self._output_dir = output_dir + + self._cpu_device = torch.device("cpu") + self._logger = logging.getLogger(__name__) + + self._metadata = MetadataCatalog.get(dataset_name) + if not hasattr(self._metadata, "json_file"): + self._logger.warning( + f"json_file was not found in MetaDataCatalog for '{dataset_name}'." + " Trying to convert it to COCO format ..." + ) + + cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json") + self._metadata.json_file = cache_path + convert_to_coco_json(dataset_name, cache_path) + + json_file = PathManager.get_local_path(self._metadata.json_file) + with contextlib.redirect_stdout(io.StringIO()): + self._coco_api = COCO(json_file) + + self._kpt_oks_sigmas = cfg.TEST.KEYPOINT_OKS_SIGMAS + # Test set json files do not contain annotations (evaluation must be + # performed using the COCO evaluation server). + self._do_evaluation = "annotations" in self._coco_api.dataset + + def reset(self): + self._predictions = [] + + def _tasks_from_config(self, cfg): + """ + Returns: + tuple[str]: tasks that can be evaluated under the given configuration. + """ + tasks = ("bbox",) + if cfg.MODEL.MASK_ON: + tasks = tasks + ("segm",) + if cfg.MODEL.KEYPOINT_ON: + tasks = tasks + ("keypoints",) + return tasks + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). + It is a list of dict. Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a COCO model. It is a list of dicts with key + "instances" that contains :class:`Instances`. 
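        This method is normally driven by :func:`inference_on_dataset`, one batch
        at a time. A sketch, assuming ``cfg`` and ``model`` exist:

        .. code-block:: python

            from detectron2.data import build_detection_test_loader
            from detectron2.evaluation import COCOEvaluator, inference_on_dataset

            evaluator = COCOEvaluator("coco_2017_val", cfg, True, output_dir="./output")
            val_loader = build_detection_test_loader(cfg, "coco_2017_val")
            print(inference_on_dataset(model, val_loader, evaluator))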
+ """ + for input, output in zip(inputs, outputs): + prediction = {"image_id": input["image_id"]} + + # TODO this is ugly + if "instances" in output: + instances = output["instances"].to(self._cpu_device) + prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) + if "proposals" in output: + prediction["proposals"] = output["proposals"].to(self._cpu_device) + self._predictions.append(prediction) + + def evaluate(self): + if self._distributed: + comm.synchronize() + predictions = comm.gather(self._predictions, dst=0) + predictions = list(itertools.chain(*predictions)) + + if not comm.is_main_process(): + return {} + else: + predictions = self._predictions + + if len(predictions) == 0: + self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") + return {} + + if self._output_dir: + PathManager.mkdirs(self._output_dir) + file_path = os.path.join(self._output_dir, "instances_predictions.pth") + with PathManager.open(file_path, "wb") as f: + torch.save(predictions, f) + + self._results = OrderedDict() + if "proposals" in predictions[0]: + self._eval_box_proposals(predictions) + if "instances" in predictions[0]: + self._eval_predictions(set(self._tasks), predictions) + # Copy so the caller can do whatever with results + return copy.deepcopy(self._results) + + def _eval_predictions(self, tasks, predictions): + """ + Evaluate predictions on the given tasks. + Fill self._results with the metrics of the tasks. + """ + self._logger.info("Preparing results for COCO format ...") + coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) + + # unmap the category ids for COCO + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + reverse_id_mapping = { + v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() + } + for result in coco_results: + category_id = result["category_id"] + assert ( + category_id in reverse_id_mapping + ), "A prediction has category_id={}, which is not available in the dataset.".format( + category_id + ) + result["category_id"] = reverse_id_mapping[category_id] + + if self._output_dir: + file_path = os.path.join(self._output_dir, "coco_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(coco_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info("Evaluating predictions ...") + for task in sorted(tasks): + coco_eval = ( + _evaluate_predictions_on_coco( + self._coco_api, coco_results, task, kpt_oks_sigmas=self._kpt_oks_sigmas + ) + if len(coco_results) > 0 + else None # cocoapi does not handle empty results very well + ) + + res = self._derive_coco_results( + coco_eval, task, class_names=self._metadata.get("thing_classes") + ) + self._results[task] = res + + def _eval_box_proposals(self, predictions): + """ + Evaluate the box proposals in predictions. + Fill self._results with the metrics for "box_proposals" task. + """ + if self._output_dir: + # Saving generated box proposals to file. + # Predicted box_proposals are in XYXY_ABS mode. 
+ bbox_mode = BoxMode.XYXY_ABS.value + ids, boxes, objectness_logits = [], [], [] + for prediction in predictions: + ids.append(prediction["image_id"]) + boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy()) + objectness_logits.append(prediction["proposals"].objectness_logits.numpy()) + + proposal_data = { + "boxes": boxes, + "objectness_logits": objectness_logits, + "ids": ids, + "bbox_mode": bbox_mode, + } + with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f: + pickle.dump(proposal_data, f) + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info("Evaluating bbox proposals ...") + res = {} + areas = {"all": "", "small": "s", "medium": "m", "large": "l"} + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit) + key = "AR{}@{:d}".format(suffix, limit) + res[key] = float(stats["ar"].item() * 100) + self._logger.info("Proposal metrics: \n" + create_small_table(res)) + self._results["box_proposals"] = res + + def _derive_coco_results(self, coco_eval, iou_type, class_names=None): + """ + Derive the desired score numbers from summarized COCOeval. + + Args: + coco_eval (None or COCOEval): None represents no predictions from model. + iou_type (str): + class_names (None or list[str]): if provided, will use it to predict + per-category AP. + + Returns: + a dict of {metric name: score} + """ + + metrics = { + "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], + }[iou_type] + + if coco_eval is None: + self._logger.warn("No predictions from the model!") + return {metric: float("nan") for metric in metrics} + + # the standard metrics + results = { + metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") + for idx, metric in enumerate(metrics) + } + self._logger.info( + "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) + ) + if not np.isfinite(sum(results.values())): + self._logger.info("Note that some metrics cannot be computed.") + + if class_names is None or len(class_names) <= 1: + return results + # Compute per-category AP + # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa + precisions = coco_eval.eval["precision"] + # precision has dims (iou, recall, cls, area range, max dets) + assert len(class_names) == precisions.shape[2] + + results_per_category = [] + for idx, name in enumerate(class_names): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + results_per_category.append(("{}".format(name), float(ap * 100))) + + # tabulate it + N_COLS = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) + table = tabulate( + results_2d, + tablefmt="pipe", + floatfmt=".3f", + headers=["category", "AP"] * (N_COLS // 2), + numalign="left", + ) + self._logger.info("Per-category {} AP: \n".format(iou_type) + table) + + results.update({"AP-" + name: ap for name, ap in results_per_category}) + return results + 
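# A sketch of how the derived metrics are typically read back (names such as
# `model`, `val_loader` and `evaluator` are assumed to be built elsewhere):
#
#   results = inference_on_dataset(model, val_loader, evaluator)
#   # `results` is keyed by task, then by metric name, e.g.
#   #   results["bbox"]["AP"], results["bbox"]["AP50"]
#   # plus per-category entries such as results["bbox"]["AP-person"]
#   # when class names are available.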
+ +def instances_to_coco_json(instances, img_id): + """ + Dump an "Instances" object to a COCO-format json that's used for evaluation. + + Args: + instances (Instances): + img_id (int): the image id + + Returns: + list[dict]: list of json annotations in COCO format. + """ + num_instance = len(instances) + if num_instance == 0: + return [] + + boxes = instances.pred_boxes.tensor.numpy() + boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + boxes = boxes.tolist() + scores = instances.scores.tolist() + classes = instances.pred_classes.tolist() + + has_mask = instances.has("pred_masks") + if has_mask: + # use RLE to encode the masks, because they are too large and takes memory + # since this evaluator stores outputs of the entire dataset + rles = [ + mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] + for mask in instances.pred_masks + ] + for rle in rles: + # "counts" is an array encoded by mask_util as a byte-stream. Python3's + # json writer which always produces strings cannot serialize a bytestream + # unless you decode it. Thankfully, utf-8 works out (which is also what + # the pycocotools/_mask.pyx does). + rle["counts"] = rle["counts"].decode("utf-8") + + has_keypoints = instances.has("pred_keypoints") + if has_keypoints: + keypoints = instances.pred_keypoints + + results = [] + for k in range(num_instance): + result = { + "image_id": img_id, + "category_id": classes[k], + "bbox": boxes[k], + "score": scores[k], + } + if has_mask: + result["segmentation"] = rles[k] + if has_keypoints: + # In COCO annotations, + # keypoints coordinates are pixel indices. + # However our predictions are floating point coordinates. + # Therefore we subtract 0.5 to be consistent with the annotation format. + # This is the inverse of data loading logic in `datasets/coco.py`. + keypoints[k][:, :2] -= 0.5 + result["keypoints"] = keypoints[k].flatten().tolist() + results.append(result) + return results + + +# inspired from Detectron: +# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa +def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None): + """ + Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official COCO API recall evaluation code. However, + it produces slightly different results. 
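    A sketch of how this helper is invoked by ``_eval_box_proposals`` above,
    assuming ``predictions`` and ``coco_api`` are already available:

    .. code-block:: python

        stats = _evaluate_box_proposals(predictions, coco_api, area="all", limit=1000)
        ar_at_1000 = float(stats["ar"].item() * 100)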
+ """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + "all": 0, + "small": 1, + "medium": 2, + "large": 3, + "96-128": 4, + "128-256": 5, + "256-512": 6, + "512-inf": 7, + } + area_ranges = [ + [0 ** 2, 1e5 ** 2], # all + [0 ** 2, 32 ** 2], # small + [32 ** 2, 96 ** 2], # medium + [96 ** 2, 1e5 ** 2], # large + [96 ** 2, 128 ** 2], # 96-128 + [128 ** 2, 256 ** 2], # 128-256 + [256 ** 2, 512 ** 2], # 256-512 + [512 ** 2, 1e5 ** 2], + ] # 512-inf + assert area in areas, "Unknown area range: {}".format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = [] + num_pos = 0 + + for prediction_dict in dataset_predictions: + predictions = prediction_dict["proposals"] + + # sort predictions in descending order + # TODO maybe remove this and make it explicit in the documentation + inds = predictions.objectness_logits.sort(descending=True)[1] + predictions = predictions[inds] + + ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"]) + anno = coco_api.loadAnns(ann_ids) + gt_boxes = [ + BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) + for obj in anno + if obj["iscrowd"] == 0 + ] + gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes + gt_boxes = Boxes(gt_boxes) + gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) + + if len(gt_boxes) == 0 or len(predictions) == 0: + continue + + valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) + gt_boxes = gt_boxes[valid_gt_inds] + + num_pos += len(gt_boxes) + + if len(gt_boxes) == 0: + continue + + if limit is not None and len(predictions) > limit: + predictions = predictions[:limit] + + overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) + + _gt_overlaps = torch.zeros(len(gt_boxes)) + for j in range(min(len(predictions), len(gt_boxes))): + # find which proposal box maximally covers each gt box + # and get the iou amount of coverage for each gt box + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # find which gt box is 'best' covered (i.e. 'best' = most iou) + gt_ovr, gt_ind = max_overlaps.max(dim=0) + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + + # append recorded iou coverage level + gt_overlaps.append(_gt_overlaps) + gt_overlaps = ( + torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32) + ) + gt_overlaps, _ = torch.sort(gt_overlaps) + + if thresholds is None: + step = 0.05 + thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) + recalls = torch.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return { + "ar": ar, + "recalls": recalls, + "thresholds": thresholds, + "gt_overlaps": gt_overlaps, + "num_pos": num_pos, + } + + +def _evaluate_predictions_on_coco(coco_gt, coco_results, iou_type, kpt_oks_sigmas=None): + """ + Evaluate the coco results using COCOEval API. 
+ """ + assert len(coco_results) > 0 + + if iou_type == "segm": + coco_results = copy.deepcopy(coco_results) + # When evaluating mask AP, if the results contain bbox, cocoapi will + # use the box area as the area of the instance, instead of the mask area. + # This leads to a different definition of small/medium/large. + # We remove the bbox field to let mask AP use mask area. + for c in coco_results: + c.pop("bbox", None) + + coco_dt = coco_gt.loadRes(coco_results) + coco_eval = COCOeval(coco_gt, coco_dt, iou_type) + # Use the COCO default keypoint OKS sigmas unless overrides are specified + if kpt_oks_sigmas: + coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas) + + if iou_type == "keypoints": + num_keypoints = len(coco_results[0]["keypoints"]) // 3 + assert len(coco_eval.params.kpt_oks_sigmas) == num_keypoints, ( + "[COCOEvaluator] The length of cfg.TEST.KEYPOINT_OKS_SIGMAS (default: 17) " + "must be equal to the number of keypoints. However the prediction has {} " + "keypoints! For more information please refer to " + "http://cocodataset.org/#keypoints-eval.".format(num_keypoints) + ) + + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + return coco_eval diff --git a/detectron2/evaluation/evaluator.py b/detectron2/evaluation/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..23164f2b03a771bbd97c2a32196a72b50b4699b0 --- /dev/null +++ b/detectron2/evaluation/evaluator.py @@ -0,0 +1,196 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import datetime +import logging +import time +from collections import OrderedDict +from contextlib import contextmanager +import torch + +from detectron2.utils.comm import get_world_size, is_main_process +from detectron2.utils.logger import log_every_n_seconds + + +class DatasetEvaluator: + """ + Base class for a dataset evaluator. + + The function :func:`inference_on_dataset` runs the model over + all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs. + + This class will accumulate information of the inputs/outputs (by :meth:`process`), + and produce evaluation results in the end (by :meth:`evaluate`). + """ + + def reset(self): + """ + Preparation for a new round of evaluation. + Should be called before starting a round of evaluation. + """ + pass + + def process(self, inputs, outputs): + """ + Process the pair of inputs and outputs. + If they contain batches, the pairs can be consumed one-by-one using `zip`: + + .. code-block:: python + + for input_, output in zip(inputs, outputs): + # do evaluation on single input/output pair + ... + + Args: + inputs (list): the inputs that's used to call the model. + outputs (list): the return value of `model(inputs)` + """ + pass + + def evaluate(self): + """ + Evaluate/summarize the performance, after processing all input/output pairs. + + Returns: + dict: + A new evaluator class can return a dict of arbitrary format + as long as the user can process the results. + In our train_net.py, we expect the following format: + + * key: the name of the task (e.g., bbox) + * value: a dict of {metric name: score}, e.g.: {"AP50": 80} + """ + pass + + +class DatasetEvaluators(DatasetEvaluator): + """ + Wrapper class to combine multiple :class:`DatasetEvaluator` instances. + + This class dispatches every evaluation call to + all of its :class:`DatasetEvaluator`. + """ + + def __init__(self, evaluators): + """ + Args: + evaluators (list): the evaluators to combine. 
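        For example, several metrics can be computed in a single inference pass.
        A sketch, where ``MyTimingEvaluator`` stands in for any custom
        :class:`DatasetEvaluator`, and ``cfg``, ``model``, ``val_loader`` are assumed:

        .. code-block:: python

            evaluator = DatasetEvaluators([
                COCOEvaluator("coco_2017_val", cfg, True, output_dir="./output"),
                MyTimingEvaluator(),  # hypothetical custom evaluator
            ])
            results = inference_on_dataset(model, val_loader, evaluator)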
+ """ + super().__init__() + self._evaluators = evaluators + + def reset(self): + for evaluator in self._evaluators: + evaluator.reset() + + def process(self, inputs, outputs): + for evaluator in self._evaluators: + evaluator.process(inputs, outputs) + + def evaluate(self): + results = OrderedDict() + for evaluator in self._evaluators: + result = evaluator.evaluate() + if is_main_process() and result is not None: + for k, v in result.items(): + assert ( + k not in results + ), "Different evaluators produce results with the same key {}".format(k) + results[k] = v + return results + + +def inference_on_dataset(model, data_loader, evaluator): + """ + Run model on the data_loader and evaluate the metrics with evaluator. + Also benchmark the inference speed of `model.forward` accurately. + The model will be used in eval mode. + + Args: + model (nn.Module): a module which accepts an object from + `data_loader` and returns some outputs. It will be temporarily set to `eval` mode. + + If you wish to evaluate a model in `training` mode instead, you can + wrap the given model and override its behavior of `.eval()` and `.train()`. + data_loader: an iterable object with a length. + The elements it generates will be the inputs to the model. + evaluator (DatasetEvaluator): the evaluator to run. Use `None` if you only want + to benchmark, but don't want to do any evaluation. + + Returns: + The return value of `evaluator.evaluate()` + """ + num_devices = get_world_size() + logger = logging.getLogger(__name__) + logger.info("Start inference on {} images".format(len(data_loader))) + + total = len(data_loader) # inference data loader must have a fixed length + if evaluator is None: + # create a no-op evaluator + evaluator = DatasetEvaluators([]) + evaluator.reset() + + num_warmup = min(5, total - 1) + start_time = time.perf_counter() + total_compute_time = 0 + with inference_context(model), torch.no_grad(): + for idx, inputs in enumerate(data_loader): + if idx == num_warmup: + start_time = time.perf_counter() + total_compute_time = 0 + + start_compute_time = time.perf_counter() + outputs = model(inputs) + if torch.cuda.is_available(): + torch.cuda.synchronize() + total_compute_time += time.perf_counter() - start_compute_time + evaluator.process(inputs, outputs) + + iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) + seconds_per_img = total_compute_time / iters_after_start + if idx >= num_warmup * 2 or seconds_per_img > 5: + total_seconds_per_img = (time.perf_counter() - start_time) / iters_after_start + eta = datetime.timedelta(seconds=int(total_seconds_per_img * (total - idx - 1))) + log_every_n_seconds( + logging.INFO, + "Inference done {}/{}. {:.4f} s / img. 
ETA={}".format( + idx + 1, total, seconds_per_img, str(eta) + ), + n=5, + ) + + # Measure the time only for this worker (before the synchronization barrier) + total_time = time.perf_counter() - start_time + total_time_str = str(datetime.timedelta(seconds=total_time)) + # NOTE this format is parsed by grep + logger.info( + "Total inference time: {} ({:.6f} s / img per device, on {} devices)".format( + total_time_str, total_time / (total - num_warmup), num_devices + ) + ) + total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) + logger.info( + "Total inference pure compute time: {} ({:.6f} s / img per device, on {} devices)".format( + total_compute_time_str, total_compute_time / (total - num_warmup), num_devices + ) + ) + + results = evaluator.evaluate() + # An evaluator may return None when not in main process. + # Replace it by an empty dict instead to make it easier for downstream code to handle + if results is None: + results = {} + return results + + +@contextmanager +def inference_context(model): + """ + A context where the model is temporarily changed to eval mode, + and restored to previous mode afterwards. + + Args: + model: a torch Module + """ + training_mode = model.training + model.eval() + yield + model.train(training_mode) diff --git a/detectron2/evaluation/lvis_evaluation.py b/detectron2/evaluation/lvis_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..e55f50fb9d1fa7ccb685f812b603c10f9a1ffea0 --- /dev/null +++ b/detectron2/evaluation/lvis_evaluation.py @@ -0,0 +1,350 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import itertools +import json +import logging +import os +import pickle +from collections import OrderedDict +import torch +from fvcore.common.file_io import PathManager + +import detectron2.utils.comm as comm +from detectron2.data import MetadataCatalog +from detectron2.structures import Boxes, BoxMode, pairwise_iou +from detectron2.utils.logger import create_small_table + +from .coco_evaluation import instances_to_coco_json +from .evaluator import DatasetEvaluator + + +class LVISEvaluator(DatasetEvaluator): + """ + Evaluate object proposal and instance detection/segmentation outputs using + LVIS's metrics and evaluation API. + """ + + def __init__(self, dataset_name, cfg, distributed, output_dir=None): + """ + Args: + dataset_name (str): name of the dataset to be evaluated. + It must have the following corresponding metadata: + "json_file": the path to the LVIS format annotation + cfg (CfgNode): config instance + distributed (True): if True, will collect results from all ranks for evaluation. + Otherwise, will evaluate the results in the current process. + output_dir (str): optional, an output directory to dump results. + """ + from lvis import LVIS + + self._tasks = self._tasks_from_config(cfg) + self._distributed = distributed + self._output_dir = output_dir + + self._cpu_device = torch.device("cpu") + self._logger = logging.getLogger(__name__) + + self._metadata = MetadataCatalog.get(dataset_name) + json_file = PathManager.get_local_path(self._metadata.json_file) + self._lvis_api = LVIS(json_file) + # Test set json files do not contain annotations (evaluation must be + # performed using the LVIS evaluation server). + self._do_evaluation = len(self._lvis_api.get_ann_ids()) > 0 + + def reset(self): + self._predictions = [] + + def _tasks_from_config(self, cfg): + """ + Returns: + tuple[str]: tasks that can be evaluated under the given configuration. 
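        The evaluator itself is used like :class:`COCOEvaluator`. A sketch,
        assuming the builtin ``lvis_v0.5_val`` dataset is registered and ``cfg``
        and ``model`` exist:

        .. code-block:: python

            from detectron2.data import build_detection_test_loader
            from detectron2.evaluation import LVISEvaluator, inference_on_dataset

            evaluator = LVISEvaluator("lvis_v0.5_val", cfg, True, output_dir="./output")
            loader = build_detection_test_loader(cfg, "lvis_v0.5_val")
            results = inference_on_dataset(model, loader, evaluator)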
+ """ + tasks = ("bbox",) + if cfg.MODEL.MASK_ON: + tasks = tasks + ("segm",) + return tasks + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a LVIS model (e.g., GeneralizedRCNN). + It is a list of dict. Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a LVIS model. It is a list of dicts with key + "instances" that contains :class:`Instances`. + """ + for input, output in zip(inputs, outputs): + prediction = {"image_id": input["image_id"]} + + if "instances" in output: + instances = output["instances"].to(self._cpu_device) + prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) + if "proposals" in output: + prediction["proposals"] = output["proposals"].to(self._cpu_device) + self._predictions.append(prediction) + + def evaluate(self): + if self._distributed: + comm.synchronize() + predictions = comm.gather(self._predictions, dst=0) + predictions = list(itertools.chain(*predictions)) + + if not comm.is_main_process(): + return + else: + predictions = self._predictions + + if len(predictions) == 0: + self._logger.warning("[LVISEvaluator] Did not receive valid predictions.") + return {} + + if self._output_dir: + PathManager.mkdirs(self._output_dir) + file_path = os.path.join(self._output_dir, "instances_predictions.pth") + with PathManager.open(file_path, "wb") as f: + torch.save(predictions, f) + + self._results = OrderedDict() + if "proposals" in predictions[0]: + self._eval_box_proposals(predictions) + if "instances" in predictions[0]: + self._eval_predictions(set(self._tasks), predictions) + # Copy so the caller can do whatever with results + return copy.deepcopy(self._results) + + def _eval_predictions(self, tasks, predictions): + """ + Evaluate predictions on the given tasks. + Fill self._results with the metrics of the tasks. + + Args: + predictions (list[dict]): list of outputs from the model + """ + self._logger.info("Preparing results in the LVIS format ...") + lvis_results = list(itertools.chain(*[x["instances"] for x in predictions])) + + # LVIS evaluator can be used to evaluate results for COCO dataset categories. + # In this case `_metadata` variable will have a field with COCO-specific category mapping. + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + reverse_id_mapping = { + v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() + } + for result in lvis_results: + result["category_id"] = reverse_id_mapping[result["category_id"]] + else: + # unmap the category ids for LVIS (from 0-indexed to 1-indexed) + for result in lvis_results: + result["category_id"] += 1 + + if self._output_dir: + file_path = os.path.join(self._output_dir, "lvis_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(lvis_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info("Evaluating predictions ...") + for task in sorted(tasks): + res = _evaluate_predictions_on_lvis( + self._lvis_api, lvis_results, task, class_names=self._metadata.get("thing_classes") + ) + self._results[task] = res + + def _eval_box_proposals(self, predictions): + """ + Evaluate the box proposals in predictions. + Fill self._results with the metrics for "box_proposals" task. + """ + if self._output_dir: + # Saving generated box proposals to file. 
+ # Predicted box_proposals are in XYXY_ABS mode. + bbox_mode = BoxMode.XYXY_ABS.value + ids, boxes, objectness_logits = [], [], [] + for prediction in predictions: + ids.append(prediction["image_id"]) + boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy()) + objectness_logits.append(prediction["proposals"].objectness_logits.numpy()) + + proposal_data = { + "boxes": boxes, + "objectness_logits": objectness_logits, + "ids": ids, + "bbox_mode": bbox_mode, + } + with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f: + pickle.dump(proposal_data, f) + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info("Evaluating bbox proposals ...") + res = {} + areas = {"all": "", "small": "s", "medium": "m", "large": "l"} + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = _evaluate_box_proposals(predictions, self._lvis_api, area=area, limit=limit) + key = "AR{}@{:d}".format(suffix, limit) + res[key] = float(stats["ar"].item() * 100) + self._logger.info("Proposal metrics: \n" + create_small_table(res)) + self._results["box_proposals"] = res + + +# inspired from Detectron: +# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa +def _evaluate_box_proposals(dataset_predictions, lvis_api, thresholds=None, area="all", limit=None): + """ + Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official LVIS API recall evaluation code. However, + it produces slightly different results. + """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + "all": 0, + "small": 1, + "medium": 2, + "large": 3, + "96-128": 4, + "128-256": 5, + "256-512": 6, + "512-inf": 7, + } + area_ranges = [ + [0 ** 2, 1e5 ** 2], # all + [0 ** 2, 32 ** 2], # small + [32 ** 2, 96 ** 2], # medium + [96 ** 2, 1e5 ** 2], # large + [96 ** 2, 128 ** 2], # 96-128 + [128 ** 2, 256 ** 2], # 128-256 + [256 ** 2, 512 ** 2], # 256-512 + [512 ** 2, 1e5 ** 2], + ] # 512-inf + assert area in areas, "Unknown area range: {}".format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = [] + num_pos = 0 + + for prediction_dict in dataset_predictions: + predictions = prediction_dict["proposals"] + + # sort predictions in descending order + # TODO maybe remove this and make it explicit in the documentation + inds = predictions.objectness_logits.sort(descending=True)[1] + predictions = predictions[inds] + + ann_ids = lvis_api.get_ann_ids(img_ids=[prediction_dict["image_id"]]) + anno = lvis_api.load_anns(ann_ids) + gt_boxes = [ + BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) for obj in anno + ] + gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes + gt_boxes = Boxes(gt_boxes) + gt_areas = torch.as_tensor([obj["area"] for obj in anno]) + + if len(gt_boxes) == 0 or len(predictions) == 0: + continue + + valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) + gt_boxes = gt_boxes[valid_gt_inds] + + num_pos += len(gt_boxes) + + if len(gt_boxes) == 0: + continue + + if limit is not None and len(predictions) > limit: + predictions = predictions[:limit] + + overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) + + _gt_overlaps = torch.zeros(len(gt_boxes)) + for j in range(min(len(predictions), len(gt_boxes))): + # find which proposal box maximally covers each gt box + # and 
get the iou amount of coverage for each gt box + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # find which gt box is 'best' covered (i.e. 'best' = most iou) + gt_ovr, gt_ind = max_overlaps.max(dim=0) + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + + # append recorded iou coverage level + gt_overlaps.append(_gt_overlaps) + gt_overlaps = ( + torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32) + ) + gt_overlaps, _ = torch.sort(gt_overlaps) + + if thresholds is None: + step = 0.05 + thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) + recalls = torch.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return { + "ar": ar, + "recalls": recalls, + "thresholds": thresholds, + "gt_overlaps": gt_overlaps, + "num_pos": num_pos, + } + + +def _evaluate_predictions_on_lvis(lvis_gt, lvis_results, iou_type, class_names=None): + """ + Args: + iou_type (str): + kpt_oks_sigmas (list[float]): + class_names (None or list[str]): if provided, will use it to predict + per-category AP. + + Returns: + a dict of {metric name: score} + """ + metrics = { + "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"], + "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"], + }[iou_type] + + logger = logging.getLogger(__name__) + + if len(lvis_results) == 0: # TODO: check if needed + logger.warn("No predictions from the model!") + return {metric: float("nan") for metric in metrics} + + if iou_type == "segm": + lvis_results = copy.deepcopy(lvis_results) + # When evaluating mask AP, if the results contain bbox, LVIS API will + # use the box area as the area of the instance, instead of the mask area. + # This leads to a different definition of small/medium/large. + # We remove the bbox field to let mask AP use mask area. + for c in lvis_results: + c.pop("bbox", None) + + from lvis import LVISEval, LVISResults + + lvis_results = LVISResults(lvis_gt, lvis_results) + lvis_eval = LVISEval(lvis_gt, lvis_results, iou_type) + lvis_eval.run() + lvis_eval.print_results() + + # Pull the standard metrics from the LVIS results + results = lvis_eval.get_results() + results = {metric: float(results[metric] * 100) for metric in metrics} + logger.info("Evaluation results for {}: \n".format(iou_type) + create_small_table(results)) + return results diff --git a/detectron2/evaluation/panoptic_evaluation.py b/detectron2/evaluation/panoptic_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..fb5e7ab87b1dd5bb3e0c5d1e405e321c48d9e6a0 --- /dev/null +++ b/detectron2/evaluation/panoptic_evaluation.py @@ -0,0 +1,167 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import contextlib +import io +import itertools +import json +import logging +import os +import tempfile +from collections import OrderedDict +from fvcore.common.file_io import PathManager +from PIL import Image +from tabulate import tabulate + +from detectron2.data import MetadataCatalog +from detectron2.utils import comm + +from .evaluator import DatasetEvaluator + +logger = logging.getLogger(__name__) + + +class COCOPanopticEvaluator(DatasetEvaluator): + """ + Evaluate Panoptic Quality metrics on COCO using PanopticAPI. + It saves panoptic segmentation prediction in `output_dir` + + It contains a synchronize call and has to be called from all workers. + """ + + def __init__(self, dataset_name, output_dir): + """ + Args: + dataset_name (str): name of the dataset + output_dir (str): output directory to save results for evaluation + """ + self._metadata = MetadataCatalog.get(dataset_name) + self._thing_contiguous_id_to_dataset_id = { + v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() + } + self._stuff_contiguous_id_to_dataset_id = { + v: k for k, v in self._metadata.stuff_dataset_id_to_contiguous_id.items() + } + + self._predictions_json = os.path.join(output_dir, "predictions.json") + + def reset(self): + self._predictions = [] + + def _convert_category_id(self, segment_info): + isthing = segment_info.pop("isthing", None) + if isthing is None: + # the model produces panoptic category id directly. No more conversion needed + return segment_info + if isthing is True: + segment_info["category_id"] = self._thing_contiguous_id_to_dataset_id[ + segment_info["category_id"] + ] + else: + segment_info["category_id"] = self._stuff_contiguous_id_to_dataset_id[ + segment_info["category_id"] + ] + return segment_info + + def process(self, inputs, outputs): + from panopticapi.utils import id2rgb + + for input, output in zip(inputs, outputs): + panoptic_img, segments_info = output["panoptic_seg"] + panoptic_img = panoptic_img.cpu().numpy() + + file_name = os.path.basename(input["file_name"]) + file_name_png = os.path.splitext(file_name)[0] + ".png" + with io.BytesIO() as out: + Image.fromarray(id2rgb(panoptic_img)).save(out, format="PNG") + segments_info = [self._convert_category_id(x) for x in segments_info] + self._predictions.append( + { + "image_id": input["image_id"], + "file_name": file_name_png, + "png_string": out.getvalue(), + "segments_info": segments_info, + } + ) + + def evaluate(self): + comm.synchronize() + + self._predictions = comm.gather(self._predictions) + self._predictions = list(itertools.chain(*self._predictions)) + if not comm.is_main_process(): + return + + # PanopticApi requires local files + gt_json = PathManager.get_local_path(self._metadata.panoptic_json) + gt_folder = PathManager.get_local_path(self._metadata.panoptic_root) + + with tempfile.TemporaryDirectory(prefix="panoptic_eval") as pred_dir: + logger.info("Writing all panoptic predictions to {} ...".format(pred_dir)) + for p in self._predictions: + with open(os.path.join(pred_dir, p["file_name"]), "wb") as f: + f.write(p.pop("png_string")) + + with open(gt_json, "r") as f: + json_data = json.load(f) + json_data["annotations"] = self._predictions + with PathManager.open(self._predictions_json, "w") as f: + f.write(json.dumps(json_data)) + + from panopticapi.evaluation import pq_compute + + with contextlib.redirect_stdout(io.StringIO()): + pq_res = pq_compute( + gt_json, + PathManager.get_local_path(self._predictions_json), + gt_folder=gt_folder, + pred_folder=pred_dir, + ) + + 
res = {} + res["PQ"] = 100 * pq_res["All"]["pq"] + res["SQ"] = 100 * pq_res["All"]["sq"] + res["RQ"] = 100 * pq_res["All"]["rq"] + res["PQ_th"] = 100 * pq_res["Things"]["pq"] + res["SQ_th"] = 100 * pq_res["Things"]["sq"] + res["RQ_th"] = 100 * pq_res["Things"]["rq"] + res["PQ_st"] = 100 * pq_res["Stuff"]["pq"] + res["SQ_st"] = 100 * pq_res["Stuff"]["sq"] + res["RQ_st"] = 100 * pq_res["Stuff"]["rq"] + + results = OrderedDict({"panoptic_seg": res}) + _print_panoptic_results(pq_res) + + return results + + +def _print_panoptic_results(pq_res): + headers = ["", "PQ", "SQ", "RQ", "#categories"] + data = [] + for name in ["All", "Things", "Stuff"]: + row = [name] + [pq_res[name][k] * 100 for k in ["pq", "sq", "rq"]] + [pq_res[name]["n"]] + data.append(row) + table = tabulate( + data, headers=headers, tablefmt="pipe", floatfmt=".3f", stralign="center", numalign="center" + ) + logger.info("Panoptic Evaluation Results:\n" + table) + + +if __name__ == "__main__": + from detectron2.utils.logger import setup_logger + + logger = setup_logger() + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--gt-json") + parser.add_argument("--gt-dir") + parser.add_argument("--pred-json") + parser.add_argument("--pred-dir") + args = parser.parse_args() + + from panopticapi.evaluation import pq_compute + + with contextlib.redirect_stdout(io.StringIO()): + pq_res = pq_compute( + args.gt_json, args.pred_json, gt_folder=args.gt_dir, pred_folder=args.pred_dir + ) + _print_panoptic_results(pq_res) diff --git a/detectron2/evaluation/pascal_voc_evaluation.py b/detectron2/evaluation/pascal_voc_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..e920d394a7cebc58eeb901f3e9683d0e761b9909 --- /dev/null +++ b/detectron2/evaluation/pascal_voc_evaluation.py @@ -0,0 +1,294 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import logging +import numpy as np +import os +import tempfile +import xml.etree.ElementTree as ET +from collections import OrderedDict, defaultdict +from functools import lru_cache +import torch +from fvcore.common.file_io import PathManager + +from detectron2.data import MetadataCatalog +from detectron2.utils import comm + +from .evaluator import DatasetEvaluator + + +class PascalVOCDetectionEvaluator(DatasetEvaluator): + """ + Evaluate Pascal VOC AP. + It contains a synchronization, therefore has to be called from all ranks. + + Note that this is a rewrite of the official Matlab API. + The results should be similar, but not identical to the one produced by + the official API. 
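+
+ A minimal usage sketch (the dataset name is only an example and must already
+ be registered together with its metadata):
+
+     evaluator = PascalVOCDetectionEvaluator("voc_2007_test")
+     evaluator.reset()
+     # call evaluator.process(inputs, outputs) for every batch, then:
+     results = evaluator.evaluate()  # {"bbox": {"AP": ..., "AP50": ..., "AP75": ...}}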
+ """ + + def __init__(self, dataset_name): + """ + Args: + dataset_name (str): name of the dataset, e.g., "voc_2007_test" + """ + self._dataset_name = dataset_name + meta = MetadataCatalog.get(dataset_name) + self._anno_file_template = os.path.join(meta.dirname, "Annotations", "{}.xml") + self._image_set_path = os.path.join(meta.dirname, "ImageSets", "Main", meta.split + ".txt") + self._class_names = meta.thing_classes + assert meta.year in [2007, 2012], meta.year + self._is_2007 = meta.year == 2007 + self._cpu_device = torch.device("cpu") + self._logger = logging.getLogger(__name__) + + def reset(self): + self._predictions = defaultdict(list) # class name -> list of prediction strings + + def process(self, inputs, outputs): + for input, output in zip(inputs, outputs): + image_id = input["image_id"] + instances = output["instances"].to(self._cpu_device) + boxes = instances.pred_boxes.tensor.numpy() + scores = instances.scores.tolist() + classes = instances.pred_classes.tolist() + for box, score, cls in zip(boxes, scores, classes): + xmin, ymin, xmax, ymax = box + # The inverse of data loading logic in `datasets/pascal_voc.py` + xmin += 1 + ymin += 1 + self._predictions[cls].append( + f"{image_id} {score:.3f} {xmin:.1f} {ymin:.1f} {xmax:.1f} {ymax:.1f}" + ) + + def evaluate(self): + """ + Returns: + dict: has a key "segm", whose value is a dict of "AP", "AP50", and "AP75". + """ + all_predictions = comm.gather(self._predictions, dst=0) + if not comm.is_main_process(): + return + predictions = defaultdict(list) + for predictions_per_rank in all_predictions: + for clsid, lines in predictions_per_rank.items(): + predictions[clsid].extend(lines) + del all_predictions + + self._logger.info( + "Evaluating {} using {} metric. " + "Note that results do not use the official Matlab API.".format( + self._dataset_name, 2007 if self._is_2007 else 2012 + ) + ) + + with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname: + res_file_template = os.path.join(dirname, "{}.txt") + + aps = defaultdict(list) # iou -> ap per class + for cls_id, cls_name in enumerate(self._class_names): + lines = predictions.get(cls_id, [""]) + + with open(res_file_template.format(cls_name), "w") as f: + f.write("\n".join(lines)) + + for thresh in range(50, 100, 5): + rec, prec, ap = voc_eval( + res_file_template, + self._anno_file_template, + self._image_set_path, + cls_name, + ovthresh=thresh / 100.0, + use_07_metric=self._is_2007, + ) + aps[thresh].append(ap * 100) + + ret = OrderedDict() + mAP = {iou: np.mean(x) for iou, x in aps.items()} + ret["bbox"] = {"AP": np.mean(list(mAP.values())), "AP50": mAP[50], "AP75": mAP[75]} + return ret + + +############################################################################## +# +# Below code is modified from +# https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/datasets/voc_eval.py +# -------------------------------------------------------- +# Fast/er R-CNN +# Licensed under The MIT License [see LICENSE for details] +# Written by Bharath Hariharan +# -------------------------------------------------------- + +"""Python implementation of the PASCAL VOC devkit's AP evaluation code.""" + + +@lru_cache(maxsize=None) +def parse_rec(filename): + """Parse a PASCAL VOC xml file.""" + with PathManager.open(filename) as f: + tree = ET.parse(f) + objects = [] + for obj in tree.findall("object"): + obj_struct = {} + obj_struct["name"] = obj.find("name").text + obj_struct["pose"] = obj.find("pose").text + obj_struct["truncated"] = int(obj.find("truncated").text) + 
obj_struct["difficult"] = int(obj.find("difficult").text) + bbox = obj.find("bndbox") + obj_struct["bbox"] = [ + int(bbox.find("xmin").text), + int(bbox.find("ymin").text), + int(bbox.find("xmax").text), + int(bbox.find("ymax").text), + ] + objects.append(obj_struct) + + return objects + + +def voc_ap(rec, prec, use_07_metric=False): + """Compute VOC AP given precision and recall. If use_07_metric is true, uses + the VOC 07 11-point method (default:False). + """ + if use_07_metric: + # 11 point metric + ap = 0.0 + for t in np.arange(0.0, 1.1, 0.1): + if np.sum(rec >= t) == 0: + p = 0 + else: + p = np.max(prec[rec >= t]) + ap = ap + p / 11.0 + else: + # correct AP calculation + # first append sentinel values at the end + mrec = np.concatenate(([0.0], rec, [1.0])) + mpre = np.concatenate(([0.0], prec, [0.0])) + + # compute the precision envelope + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # to calculate area under PR curve, look for points + # where X axis (recall) changes value + i = np.where(mrec[1:] != mrec[:-1])[0] + + # and sum (\Delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap + + +def voc_eval(detpath, annopath, imagesetfile, classname, ovthresh=0.5, use_07_metric=False): + """rec, prec, ap = voc_eval(detpath, + annopath, + imagesetfile, + classname, + [ovthresh], + [use_07_metric]) + + Top level function that does the PASCAL VOC evaluation. + + detpath: Path to detections + detpath.format(classname) should produce the detection results file. + annopath: Path to annotations + annopath.format(imagename) should be the xml annotations file. + imagesetfile: Text file containing the list of images, one image per line. + classname: Category name (duh) + [ovthresh]: Overlap threshold (default = 0.5) + [use_07_metric]: Whether to use VOC07's 11 point AP computation + (default False) + """ + # assumes detections are in detpath.format(classname) + # assumes annotations are in annopath.format(imagename) + # assumes imagesetfile is a text file with each line an image name + + # first load gt + # read list of images + with PathManager.open(imagesetfile, "r") as f: + lines = f.readlines() + imagenames = [x.strip() for x in lines] + + # load annots + recs = {} + for imagename in imagenames: + recs[imagename] = parse_rec(annopath.format(imagename)) + + # extract gt objects for this class + class_recs = {} + npos = 0 + for imagename in imagenames: + R = [obj for obj in recs[imagename] if obj["name"] == classname] + bbox = np.array([x["bbox"] for x in R]) + difficult = np.array([x["difficult"] for x in R]).astype(np.bool) + # difficult = np.array([False for x in R]).astype(np.bool) # treat all "difficult" as GT + det = [False] * len(R) + npos = npos + sum(~difficult) + class_recs[imagename] = {"bbox": bbox, "difficult": difficult, "det": det} + + # read dets + detfile = detpath.format(classname) + with open(detfile, "r") as f: + lines = f.readlines() + + splitlines = [x.strip().split(" ") for x in lines] + image_ids = [x[0] for x in splitlines] + confidence = np.array([float(x[1]) for x in splitlines]) + BB = np.array([[float(z) for z in x[2:]] for x in splitlines]).reshape(-1, 4) + + # sort by confidence + sorted_ind = np.argsort(-confidence) + BB = BB[sorted_ind, :] + image_ids = [image_ids[x] for x in sorted_ind] + + # go down dets and mark TPs and FPs + nd = len(image_ids) + tp = np.zeros(nd) + fp = np.zeros(nd) + for d in range(nd): + R = class_recs[image_ids[d]] + bb = BB[d, :].astype(float) + ovmax = -np.inf + 
BBGT = R["bbox"].astype(float) + + if BBGT.size > 0: + # compute overlaps + # intersection + ixmin = np.maximum(BBGT[:, 0], bb[0]) + iymin = np.maximum(BBGT[:, 1], bb[1]) + ixmax = np.minimum(BBGT[:, 2], bb[2]) + iymax = np.minimum(BBGT[:, 3], bb[3]) + iw = np.maximum(ixmax - ixmin + 1.0, 0.0) + ih = np.maximum(iymax - iymin + 1.0, 0.0) + inters = iw * ih + + # union + uni = ( + (bb[2] - bb[0] + 1.0) * (bb[3] - bb[1] + 1.0) + + (BBGT[:, 2] - BBGT[:, 0] + 1.0) * (BBGT[:, 3] - BBGT[:, 1] + 1.0) + - inters + ) + + overlaps = inters / uni + ovmax = np.max(overlaps) + jmax = np.argmax(overlaps) + + if ovmax > ovthresh: + if not R["difficult"][jmax]: + if not R["det"][jmax]: + tp[d] = 1.0 + R["det"][jmax] = 1 + else: + fp[d] = 1.0 + else: + fp[d] = 1.0 + + # compute precision recall + fp = np.cumsum(fp) + tp = np.cumsum(tp) + rec = tp / float(npos) + # avoid divide by zero in case the first detection matches a difficult + # ground truth + prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) + ap = voc_ap(rec, prec, use_07_metric) + + return rec, prec, ap diff --git a/detectron2/evaluation/rotated_coco_evaluation.py b/detectron2/evaluation/rotated_coco_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..30746e1aaac9a1feb0c7994d9229423e9f04bb51 --- /dev/null +++ b/detectron2/evaluation/rotated_coco_evaluation.py @@ -0,0 +1,204 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import itertools +import json +import numpy as np +import os +import torch +from fvcore.common.file_io import PathManager +from pycocotools.cocoeval import COCOeval, maskUtils + +from detectron2.structures import BoxMode, RotatedBoxes, pairwise_iou_rotated + +from .coco_evaluation import COCOEvaluator + + +class RotatedCOCOeval(COCOeval): + @staticmethod + def is_rotated(box_list): + if type(box_list) == np.ndarray: + return box_list.shape[1] == 5 + elif type(box_list) == list: + if box_list == []: # cannot decide the box_dim + return False + return np.all( + np.array( + [ + (len(obj) == 5) and ((type(obj) == list) or (type(obj) == np.ndarray)) + for obj in box_list + ] + ) + ) + return False + + @staticmethod + def boxlist_to_tensor(boxlist, output_box_dim): + if type(boxlist) == np.ndarray: + box_tensor = torch.from_numpy(boxlist) + elif type(boxlist) == list: + if boxlist == []: + return torch.zeros((0, output_box_dim), dtype=torch.float32) + else: + box_tensor = torch.FloatTensor(boxlist) + else: + raise Exception("Unrecognized boxlist type") + + input_box_dim = box_tensor.shape[1] + if input_box_dim != output_box_dim: + if input_box_dim == 4 and output_box_dim == 5: + box_tensor = BoxMode.convert(box_tensor, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS) + else: + raise Exception( + "Unable to convert from {}-dim box to {}-dim box".format( + input_box_dim, output_box_dim + ) + ) + return box_tensor + + def compute_iou_dt_gt(self, dt, gt, is_crowd): + if self.is_rotated(dt) or self.is_rotated(gt): + # TODO: take is_crowd into consideration + assert all(c == 0 for c in is_crowd) + dt = RotatedBoxes(self.boxlist_to_tensor(dt, output_box_dim=5)) + gt = RotatedBoxes(self.boxlist_to_tensor(gt, output_box_dim=5)) + return pairwise_iou_rotated(dt, gt) + else: + # This is the same as the classical COCO evaluation + return maskUtils.iou(dt, gt, is_crowd) + + def computeIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in 
p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0 : p.maxDets[-1]] + + assert p.iouType == "bbox", "unsupported iouType for iou computation" + + g = [g["bbox"] for g in gt] + d = [d["bbox"] for d in dt] + + # compute iou between each dt and gt region + iscrowd = [int(o["iscrowd"]) for o in gt] + + # Note: this function is copied from cocoeval.py in cocoapi + # and the major difference is here. + ious = self.compute_iou_dt_gt(d, g, iscrowd) + return ious + + +class RotatedCOCOEvaluator(COCOEvaluator): + """ + Evaluate object proposal/instance detection outputs using COCO-like metrics and APIs, + with rotated boxes support. + Note: this uses IOU only and does not consider angle differences. + """ + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). + It is a list of dict. Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a COCO model. It is a list of dicts with key + "instances" that contains :class:`Instances`. + """ + for input, output in zip(inputs, outputs): + prediction = {"image_id": input["image_id"]} + + if "instances" in output: + instances = output["instances"].to(self._cpu_device) + + prediction["instances"] = self.instances_to_json(instances, input["image_id"]) + if "proposals" in output: + prediction["proposals"] = output["proposals"].to(self._cpu_device) + self._predictions.append(prediction) + + def instances_to_json(self, instances, img_id): + num_instance = len(instances) + if num_instance == 0: + return [] + + boxes = instances.pred_boxes.tensor.numpy() + if boxes.shape[1] == 4: + boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + boxes = boxes.tolist() + scores = instances.scores.tolist() + classes = instances.pred_classes.tolist() + + results = [] + for k in range(num_instance): + result = { + "image_id": img_id, + "category_id": classes[k], + "bbox": boxes[k], + "score": scores[k], + } + + results.append(result) + return results + + def _eval_predictions(self, tasks, predictions): + """ + Evaluate predictions on the given tasks. + Fill self._results with the metrics of the tasks. 
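+
+ Args:
+     tasks (tuple[str]): only the "bbox" task is supported for rotated boxes
+         (enforced by the assert further down).
+     predictions (list[dict]): list of outputs from the model.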
+ """ + self._logger.info("Preparing results for COCO format ...") + coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) + + # unmap the category ids for COCO + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + reverse_id_mapping = { + v: k for k, v in self._metadata.thing_dataset_id_to_contiguous_id.items() + } + for result in coco_results: + result["category_id"] = reverse_id_mapping[result["category_id"]] + + if self._output_dir: + file_path = os.path.join(self._output_dir, "coco_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(coco_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info("Evaluating predictions ...") + for task in sorted(tasks): + assert task == "bbox", "Task {} is not supported".format(task) + coco_eval = ( + self._evaluate_predictions_on_coco(self._coco_api, coco_results) + if len(coco_results) > 0 + else None # cocoapi does not handle empty results very well + ) + + res = self._derive_coco_results( + coco_eval, task, class_names=self._metadata.get("thing_classes") + ) + self._results[task] = res + + def _evaluate_predictions_on_coco(self, coco_gt, coco_results): + """ + Evaluate the coco results using COCOEval API. + """ + assert len(coco_results) > 0 + + coco_dt = coco_gt.loadRes(coco_results) + + # Only bbox is supported for now + coco_eval = RotatedCOCOeval(coco_gt, coco_dt, iouType="bbox") + + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + return coco_eval diff --git a/detectron2/evaluation/sem_seg_evaluation.py b/detectron2/evaluation/sem_seg_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..fb3b28d79284a5eeb335fc8ee8d859b4e46510ef --- /dev/null +++ b/detectron2/evaluation/sem_seg_evaluation.py @@ -0,0 +1,168 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import itertools +import json +import logging +import numpy as np +import os +from collections import OrderedDict +import PIL.Image as Image +import pycocotools.mask as mask_util +import torch +from fvcore.common.file_io import PathManager + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.utils.comm import all_gather, is_main_process, synchronize + +from .evaluator import DatasetEvaluator + + +class SemSegEvaluator(DatasetEvaluator): + """ + Evaluate semantic segmentation + """ + + def __init__(self, dataset_name, distributed, num_classes, ignore_label=255, output_dir=None): + """ + Args: + dataset_name (str): name of the dataset to be evaluated. + distributed (True): if True, will collect results from all ranks for evaluation. + Otherwise, will evaluate the results in the current process. + num_classes (int): number of classes + ignore_label (int): value in semantic segmentation ground truth. Predictions for the + corresponding pixels should be ignored. + output_dir (str): an output directory to dump results. 
+ """ + self._dataset_name = dataset_name + self._distributed = distributed + self._output_dir = output_dir + self._num_classes = num_classes + self._ignore_label = ignore_label + self._N = num_classes + 1 + + self._cpu_device = torch.device("cpu") + self._logger = logging.getLogger(__name__) + + self.input_file_to_gt_file = { + dataset_record["file_name"]: dataset_record["sem_seg_file_name"] + for dataset_record in DatasetCatalog.get(dataset_name) + } + + meta = MetadataCatalog.get(dataset_name) + # Dict that maps contiguous training ids to COCO category ids + try: + c2d = meta.stuff_dataset_id_to_contiguous_id + self._contiguous_id_to_dataset_id = {v: k for k, v in c2d.items()} + except AttributeError: + self._contiguous_id_to_dataset_id = None + self._class_names = meta.stuff_classes + + def reset(self): + self._conf_matrix = np.zeros((self._N, self._N), dtype=np.int64) + self._predictions = [] + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a model. + It is a list of dicts. Each dict corresponds to an image and + contains keys like "height", "width", "file_name". + outputs: the outputs of a model. It is either list of semantic segmentation predictions + (Tensor [H, W]) or list of dicts with key "sem_seg" that contains semantic + segmentation prediction in the same format. + """ + for input, output in zip(inputs, outputs): + output = output["sem_seg"].argmax(dim=0).to(self._cpu_device) + pred = np.array(output, dtype=np.int) + with PathManager.open(self.input_file_to_gt_file[input["file_name"]], "rb") as f: + gt = np.array(Image.open(f), dtype=np.int) + + gt[gt == self._ignore_label] = self._num_classes + + self._conf_matrix += np.bincount( + self._N * pred.reshape(-1) + gt.reshape(-1), minlength=self._N ** 2 + ).reshape(self._N, self._N) + + self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"])) + + def evaluate(self): + """ + Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval): + + * Mean intersection-over-union averaged across classes (mIoU) + * Frequency Weighted IoU (fwIoU) + * Mean pixel accuracy averaged across classes (mACC) + * Pixel Accuracy (pACC) + """ + if self._distributed: + synchronize() + conf_matrix_list = all_gather(self._conf_matrix) + self._predictions = all_gather(self._predictions) + self._predictions = list(itertools.chain(*self._predictions)) + if not is_main_process(): + return + + self._conf_matrix = np.zeros_like(self._conf_matrix) + for conf_matrix in conf_matrix_list: + self._conf_matrix += conf_matrix + + if self._output_dir: + PathManager.mkdirs(self._output_dir) + file_path = os.path.join(self._output_dir, "sem_seg_predictions.json") + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(self._predictions)) + + acc = np.full(self._num_classes, np.nan, dtype=np.float) + iou = np.full(self._num_classes, np.nan, dtype=np.float) + tp = self._conf_matrix.diagonal()[:-1].astype(np.float) + pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float) + class_weights = pos_gt / np.sum(pos_gt) + pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float) + acc_valid = pos_gt > 0 + acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid] + iou_valid = (pos_gt + pos_pred) > 0 + union = pos_gt + pos_pred - tp + iou[acc_valid] = tp[acc_valid] / union[acc_valid] + macc = np.sum(acc[acc_valid]) / np.sum(acc_valid) + miou = np.sum(iou[acc_valid]) / np.sum(iou_valid) + fiou = np.sum(iou[acc_valid] * class_weights[acc_valid]) + pacc = np.sum(tp) / 
np.sum(pos_gt) + + res = {} + res["mIoU"] = 100 * miou + res["fwIoU"] = 100 * fiou + for i, name in enumerate(self._class_names): + res["IoU-{}".format(name)] = 100 * iou[i] + res["mACC"] = 100 * macc + res["pACC"] = 100 * pacc + for i, name in enumerate(self._class_names): + res["ACC-{}".format(name)] = 100 * acc[i] + + if self._output_dir: + file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth") + with PathManager.open(file_path, "wb") as f: + torch.save(res, f) + results = OrderedDict({"sem_seg": res}) + self._logger.info(results) + return results + + def encode_json_sem_seg(self, sem_seg, input_file_name): + """ + Convert semantic segmentation to COCO stuff format with segments encoded as RLEs. + See http://cocodataset.org/#format-results + """ + json_list = [] + for label in np.unique(sem_seg): + if self._contiguous_id_to_dataset_id is not None: + assert ( + label in self._contiguous_id_to_dataset_id + ), "Label {} is not in the metadata info for {}".format(label, self._dataset_name) + dataset_id = self._contiguous_id_to_dataset_id[label] + else: + dataset_id = int(label) + mask = (sem_seg == label).astype(np.uint8) + mask_rle = mask_util.encode(np.array(mask[:, :, None], order="F"))[0] + mask_rle["counts"] = mask_rle["counts"].decode("utf-8") + json_list.append( + {"file_name": input_file_name, "category_id": dataset_id, "segmentation": mask_rle} + ) + return json_list diff --git a/detectron2/evaluation/testing.py b/detectron2/evaluation/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..95addebc185111c572cb19aa98f7e055b21fc74e --- /dev/null +++ b/detectron2/evaluation/testing.py @@ -0,0 +1,78 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import numpy as np +import pprint +import sys +from collections import OrderedDict +from collections.abc import Mapping + + +def print_csv_format(results): + """ + Print main metrics in a format similar to Detectron, + so that they are easy to copypaste into a spreadsheet. + + Args: + results (OrderedDict[dict]): task_name -> {metric -> score} + """ + assert isinstance(results, OrderedDict), results # unordered results cannot be properly printed + logger = logging.getLogger(__name__) + for task, res in results.items(): + # Don't print "AP-category" metrics since they are usually not tracked. + important_res = [(k, v) for k, v in res.items() if "-" not in k] + logger.info("copypaste: Task: {}".format(task)) + logger.info("copypaste: " + ",".join([k[0] for k in important_res])) + logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res])) + + +def verify_results(cfg, results): + """ + Args: + results (OrderedDict[dict]): task_name -> {metric -> score} + + Returns: + bool: whether the verification succeeds or not + """ + expected_results = cfg.TEST.EXPECTED_RESULTS + if not len(expected_results): + return True + + ok = True + for task, metric, expected, tolerance in expected_results: + actual = results[task][metric] + if not np.isfinite(actual): + ok = False + diff = abs(actual - expected) + if diff > tolerance: + ok = False + + logger = logging.getLogger(__name__) + if not ok: + logger.error("Result verification failed!") + logger.error("Expected Results: " + str(expected_results)) + logger.error("Actual Results: " + pprint.pformat(results)) + + sys.exit(1) + else: + logger.info("Results verification passed.") + return ok + + +def flatten_results_dict(results): + """ + Expand a hierarchical dict of scalars into a flat dict of scalars. 
+ If results[k1][k2][k3] = v, the returned dict will have the entry + {"k1/k2/k3": v}. + + Args: + results (dict): + """ + r = {} + for k, v in results.items(): + if isinstance(v, Mapping): + v = flatten_results_dict(v) + for kk, vv in v.items(): + r[k + "/" + kk] = vv + else: + r[k] = v + return r diff --git a/detectron2/export/README.md b/detectron2/export/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9bd8b57c1a5f15e391eb63b690f1051b1ad79d21 --- /dev/null +++ b/detectron2/export/README.md @@ -0,0 +1,10 @@ + +This directory contains code to prepare a detectron2 model for deployment. +Currently it supports exporting a detectron2 model to Caffe2 format through ONNX. + +Please see [documentation](https://detectron2.readthedocs.io/tutorials/deployment.html) for its usage. + + +### Acknowledgements + +Thanks to Mobile Vision team at Facebook for developing the conversion tools. diff --git a/detectron2/export/__init__.py b/detectron2/export/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1e2bf4d0670ed0ccd73dbdb7ce27a8e617bbf6aa --- /dev/null +++ b/detectron2/export/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +from .api import * + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/detectron2/export/api.py b/detectron2/export/api.py new file mode 100644 index 0000000000000000000000000000000000000000..a7600714e1edb019def04f9d0d1a063668943101 --- /dev/null +++ b/detectron2/export/api.py @@ -0,0 +1,277 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import copy +import logging +import os +import torch +from caffe2.proto import caffe2_pb2 +from torch import nn + +from detectron2.config import CfgNode as CN + +from .caffe2_export import export_caffe2_detection_model +from .caffe2_export import export_onnx_model as export_onnx_model_impl +from .caffe2_export import run_and_save_graph +from .caffe2_inference import ProtobufDetectionModel +from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format +from .shared import get_pb_arg_vali, get_pb_arg_vals, save_graph + +__all__ = [ + "add_export_config", + "export_caffe2_model", + "Caffe2Model", + "export_onnx_model", + "Caffe2Tracer", +] + + +def add_export_config(cfg): + """ + Args: + cfg (CfgNode): a detectron2 config + + Returns: + CfgNode: an updated config with new options that will be used + by :class:`Caffe2Tracer`. + """ + is_frozen = cfg.is_frozen() + cfg.defrost() + cfg.EXPORT_CAFFE2 = CN() + cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT = False + if is_frozen: + cfg.freeze() + return cfg + + +class Caffe2Tracer: + """ + Make a detectron2 model traceable with caffe2 style. + + An original detectron2 model may not be traceable, or + cannot be deployed directly after being traced, due to some reasons: + 1. control flow in some ops + 2. custom ops + 3. complicated pre/post processing + + This class provides a traceable version of a detectron2 model by: + 1. Rewrite parts of the model using ops in caffe2. Note that some ops do + not have GPU implementation. + 2. Define the inputs "after pre-processing" as inputs to the model + 3. Remove post-processing and produce raw layer outputs + + More specifically about inputs: all builtin models take two input tensors. 
+ (1) NCHW float "data" which is an image (usually in [0, 255]) + (2) Nx3 float "im_info", each row of which is (height, width, 1.0) + + After making a traceable model, the class provide methods to export such a + model to different deployment formats. + + The class currently only supports models using builtin meta architectures. + """ + + def __init__(self, cfg, model, inputs): + """ + Args: + cfg (CfgNode): a detectron2 config, with extra export-related options + added by :func:`add_export_config`. + model (nn.Module): a model built by + :func:`detectron2.modeling.build_model`. + inputs: sample inputs that the given model takes for inference. + Will be used to trace the model. + """ + assert isinstance(cfg, CN), cfg + assert isinstance(model, torch.nn.Module), type(model) + if "EXPORT_CAFFE2" not in cfg: + cfg = add_export_config(cfg) # will just the defaults + + self.cfg = cfg + self.model = model + self.inputs = inputs + + def _get_traceable(self): + # TODO how to make it extensible to support custom models + C2MetaArch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[self.cfg.MODEL.META_ARCHITECTURE] + traceable_model = C2MetaArch(self.cfg, copy.deepcopy(self.model)) + traceable_inputs = traceable_model.get_caffe2_inputs(self.inputs) + return traceable_model, traceable_inputs + + def export_caffe2(self): + """ + Export the model to Caffe2's protobuf format. + The returned object can be saved with `.save_protobuf()` method. + The result can be loaded and executed using Caffe2 runtime. + + Returns: + Caffe2Model + """ + model, inputs = self._get_traceable() + predict_net, init_net = export_caffe2_detection_model(model, inputs) + return Caffe2Model(predict_net, init_net) + + def export_onnx(self): + """ + Export the model to ONNX format. + Note that the exported model contains custom ops only available in caffe2, therefore it + cannot be directly executed by other runtime. Post-processing or transformation passes + may be applied on the model to accommodate different runtimes. + + Returns: + onnx.ModelProto: an onnx model. + """ + model, inputs = self._get_traceable() + return export_onnx_model_impl(model, (inputs,)) + + def export_torchscript(self): + """ + Export the model to a `torch.jit.TracedModule` by tracing. + The returned object can be saved to a file by ".save()". + + Returns: + torch.jit.TracedModule: a torch TracedModule + """ + model, inputs = self._get_traceable() + logger = logging.getLogger(__name__) + logger.info("Tracing the model with torch.jit.trace ...") + with torch.no_grad(): + return torch.jit.trace(model, (inputs,), optimize=True) + + +def export_caffe2_model(cfg, model, inputs): + """ + Export a detectron2 model to caffe2 format. + + Args: + cfg (CfgNode): a detectron2 config, with extra export-related options + added by :func:`add_export_config`. + model (nn.Module): a model built by + :func:`detectron2.modeling.build_model`. + It will be modified by this function. + inputs: sample inputs that the given model takes for inference. + Will be used to trace the model. + + Returns: + Caffe2Model + """ + return Caffe2Tracer(cfg, model, inputs).export_caffe2() + + +def export_onnx_model(cfg, model, inputs): + """ + Export a detectron2 model to ONNX format. + Note that the exported model contains custom ops only available in caffe2, therefore it + cannot be directly executed by other runtime. Post-processing or transformation passes + may be applied on the model to accommodate different runtimes. 
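+ (This function is a thin convenience wrapper: its body simply returns
+ ``Caffe2Tracer(cfg, model, inputs).export_onnx()``.)
+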
+ Args: + cfg (CfgNode): a detectron2 config, with extra export-related options + added by :func:`add_export_config`. + model (nn.Module): a model built by + :func:`detectron2.modeling.build_model`. + It will be modified by this function. + inputs: sample inputs that the given model takes for inference. + Will be used to trace the model. + Returns: + onnx.ModelProto: an onnx model. + """ + return Caffe2Tracer(cfg, model, inputs).export_onnx() + + +class Caffe2Model(nn.Module): + """ + A wrapper around the traced model in caffe2's pb format. + """ + + def __init__(self, predict_net, init_net): + super().__init__() + self.eval() # always in eval mode + self._predict_net = predict_net + self._init_net = init_net + self._predictor = None + + @property + def predict_net(self): + """ + Returns: + core.Net: the underlying caffe2 predict net + """ + return self._predict_net + + @property + def init_net(self): + """ + Returns: + core.Net: the underlying caffe2 init net + """ + return self._init_net + + __init__.__HIDE_SPHINX_DOC__ = True + + def save_protobuf(self, output_dir): + """ + Save the model as caffe2's protobuf format. + + Args: + output_dir (str): the output directory to save protobuf files. + """ + logger = logging.getLogger(__name__) + logger.info("Saving model to {} ...".format(output_dir)) + os.makedirs(output_dir, exist_ok=True) + + with open(os.path.join(output_dir, "model.pb"), "wb") as f: + f.write(self._predict_net.SerializeToString()) + with open(os.path.join(output_dir, "model.pbtxt"), "w") as f: + f.write(str(self._predict_net)) + with open(os.path.join(output_dir, "model_init.pb"), "wb") as f: + f.write(self._init_net.SerializeToString()) + + def save_graph(self, output_file, inputs=None): + """ + Save the graph as SVG format. + + Args: + output_file (str): a SVG file + inputs: optional inputs given to the model. + If given, the inputs will be used to run the graph to record + shape of every tensor. The shape information will be + saved together with the graph. + """ + if inputs is None: + save_graph(self._predict_net, output_file, op_only=False) + else: + size_divisibility = get_pb_arg_vali(self._predict_net, "size_divisibility", 0) + device = get_pb_arg_vals(self._predict_net, "device", b"cpu").decode("ascii") + inputs = convert_batched_inputs_to_c2_format(inputs, size_divisibility, device) + inputs = [x.cpu().numpy() for x in inputs] + run_and_save_graph(self._predict_net, self._init_net, inputs, output_file) + + @staticmethod + def load_protobuf(dir): + """ + Args: + dir (str): a directory used to save Caffe2Model with + :meth:`save_protobuf`. + The files "model.pb" and "model_init.pb" are needed. + + Returns: + Caffe2Model: the caffe2 model loaded from this directory. + """ + predict_net = caffe2_pb2.NetDef() + with open(os.path.join(dir, "model.pb"), "rb") as f: + predict_net.ParseFromString(f.read()) + + init_net = caffe2_pb2.NetDef() + with open(os.path.join(dir, "model_init.pb"), "rb") as f: + init_net.ParseFromString(f.read()) + + return Caffe2Model(predict_net, init_net) + + def __call__(self, inputs): + """ + An interface that wraps around a caffe2 model and mimics detectron2's models' + input & output format. This is used to compare the outputs of caffe2 model + with its original torch model. + + Due to the extra conversion between torch/caffe2, + this method is not meant for benchmark. 
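+
+ A minimal sketch of the intended round trip (the path and input variable
+ names are illustrative):
+
+     c2_model = Caffe2Model.load_protobuf("./caffe2_model")
+     outputs = c2_model(batched_inputs)  # same input format as the torch model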
+ """ + if self._predictor is None: + self._predictor = ProtobufDetectionModel(self._predict_net, self._init_net) + return self._predictor(inputs) diff --git a/detectron2/export/c10.py b/detectron2/export/c10.py new file mode 100644 index 0000000000000000000000000000000000000000..6e3cbe3ce94d0c56596c645b8c85592ed5d31fe1 --- /dev/null +++ b/detectron2/export/c10.py @@ -0,0 +1,503 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import math +import torch +import torch.nn.functional as F + +from detectron2.layers import cat +from detectron2.layers.roi_align_rotated import ROIAlignRotated +from detectron2.modeling import poolers +from detectron2.modeling.proposal_generator import rpn +from detectron2.modeling.roi_heads.mask_head import mask_rcnn_inference +from detectron2.structures import Boxes, ImageList, Instances, Keypoints + +from .shared import alias, to_device + + +""" +This file contains caffe2-compatible implementation of several detectrno2 components. +""" + + +class Caffe2Boxes(Boxes): + """ + Representing a list of detectron2.structures.Boxes from minibatch, each box + is represented by a 5d vector (batch index + 4 coordinates), or a 6d vector + (batch index + 5 coordinates) for RotatedBoxes. + """ + + def __init__(self, tensor): + assert isinstance(tensor, torch.Tensor) + assert tensor.dim() == 2 and tensor.size(-1) in [4, 5, 6], tensor.size() + # TODO: make tensor immutable when dim is Nx5 for Boxes, + # and Nx6 for RotatedBoxes? + self.tensor = tensor + + +# TODO clean up this class, maybe just extend Instances +class InstancesList(object): + """ + Tensor representation of a list of Instances object for a batch of images. + + When dealing with a batch of images with Caffe2 ops, a list of bboxes + (instances) are usually represented by single Tensor with size + (sigma(Ni), 5) or (sigma(Ni), 4) plus a batch split Tensor. This class is + for providing common functions to convert between these two representations. + """ + + def __init__(self, im_info, indices, extra_fields=None): + # [N, 3] -> (H, W, Scale) + self.im_info = im_info + # [N,] -> indice of batch to which the instance belongs + self.indices = indices + # [N, ...] 
+ self.batch_extra_fields = extra_fields or {} + + self.image_size = self.im_info + + def get_fields(self): + """ like `get_fields` in the Instances object, + but return each field in tensor representations """ + ret = {} + for k, v in self.batch_extra_fields.items(): + # if isinstance(v, torch.Tensor): + # tensor_rep = v + # elif isinstance(v, (Boxes, Keypoints)): + # tensor_rep = v.tensor + # else: + # raise ValueError("Can't find tensor representation for: {}".format()) + ret[k] = v + return ret + + def has(self, name): + return name in self.batch_extra_fields + + def set(self, name, value): + data_len = len(value) + if len(self.batch_extra_fields): + assert ( + len(self) == data_len + ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self)) + self.batch_extra_fields[name] = value + + def __setattr__(self, name, val): + if name in ["im_info", "indices", "batch_extra_fields", "image_size"]: + super().__setattr__(name, val) + else: + self.set(name, val) + + def __getattr__(self, name): + if name not in self.batch_extra_fields: + raise AttributeError("Cannot find field '{}' in the given Instances!".format(name)) + return self.batch_extra_fields[name] + + def __len__(self): + return len(self.indices) + + def flatten(self): + ret = [] + for _, v in self.batch_extra_fields.items(): + if isinstance(v, (Boxes, Keypoints)): + ret.append(v.tensor) + else: + ret.append(v) + return ret + + @staticmethod + def to_d2_instances_list(instances_list): + """ + Convert InstancesList to List[Instances]. The input `instances_list` can + also be a List[Instances], in this case this method is a non-op. + """ + if not isinstance(instances_list, InstancesList): + assert all(isinstance(x, Instances) for x in instances_list) + return instances_list + + ret = [] + for i, info in enumerate(instances_list.im_info): + instances = Instances(torch.Size([int(info[0].item()), int(info[1].item())])) + + ids = instances_list.indices == i + for k, v in instances_list.batch_extra_fields.items(): + if isinstance(v, torch.Tensor): + instances.set(k, v[ids]) + continue + elif isinstance(v, Boxes): + instances.set(k, v[ids, -4:]) + continue + + target_type, tensor_source = v + assert isinstance(tensor_source, torch.Tensor) + assert tensor_source.shape[0] == instances_list.indices.shape[0] + tensor_source = tensor_source[ids] + + if issubclass(target_type, Boxes): + instances.set(k, Boxes(tensor_source[:, -4:])) + elif issubclass(target_type, Keypoints): + instances.set(k, Keypoints(tensor_source)) + elif issubclass(target_type, torch.Tensor): + instances.set(k, tensor_source) + else: + raise ValueError("Can't handle targe type: {}".format(target_type)) + + ret.append(instances) + return ret + + +class Caffe2Compatible(object): + def _get_tensor_mode(self): + return self._tensor_mode + + def _set_tensor_mode(self, v): + self._tensor_mode = v + + tensor_mode = property(_get_tensor_mode, _set_tensor_mode) + """ + If true, the model expects C2-style tensor only inputs/outputs format. 
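+ If false, outputs are converted back to regular :class:`Instances` objects
+ (see ``InstancesList.to_d2_instances_list`` above).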
+ """ + + +class Caffe2RPN(Caffe2Compatible, rpn.RPN): + def forward(self, images, features, gt_instances=None): + assert not self.training + + features = [features[f] for f in self.in_features] + objectness_logits_pred, anchor_deltas_pred = self.rpn_head(features) + + assert isinstance(images, ImageList) + if self.tensor_mode: + im_info = images.image_sizes + else: + im_info = torch.Tensor( + [[im_sz[0], im_sz[1], torch.Tensor([1.0])] for im_sz in images.image_sizes] + ).to(images.tensor.device) + assert isinstance(im_info, torch.Tensor) + + rpn_rois_list = [] + rpn_roi_probs_list = [] + for scores, bbox_deltas, cell_anchors_tensor, feat_stride in zip( + objectness_logits_pred, + anchor_deltas_pred, + iter(self.anchor_generator.cell_anchors), + self.anchor_generator.strides, + ): + scores = scores.detach() + bbox_deltas = bbox_deltas.detach() + + rpn_rois, rpn_roi_probs = torch.ops._caffe2.GenerateProposals( + scores, + bbox_deltas, + im_info, + cell_anchors_tensor, + spatial_scale=1.0 / feat_stride, + pre_nms_topN=self.pre_nms_topk[self.training], + post_nms_topN=self.post_nms_topk[self.training], + nms_thresh=self.nms_thresh, + min_size=self.min_box_side_len, + # correct_transform_coords=True, # deprecated argument + angle_bound_on=True, # Default + angle_bound_lo=-180, + angle_bound_hi=180, + clip_angle_thresh=1.0, # Default + legacy_plus_one=False, + ) + rpn_rois_list.append(rpn_rois) + rpn_roi_probs_list.append(rpn_roi_probs) + + # For FPN in D2, in RPN all proposals from different levels are concated + # together, ranked and picked by top post_nms_topk. Then in ROIPooler + # it calculates level_assignments and calls the RoIAlign from + # the corresponding level. + + if len(objectness_logits_pred) == 1: + rpn_rois = rpn_rois_list[0] + rpn_roi_probs = rpn_roi_probs_list[0] + else: + assert len(rpn_rois_list) == len(rpn_roi_probs_list) + rpn_post_nms_topN = self.post_nms_topk[self.training] + + device = rpn_rois_list[0].device + input_list = [to_device(x, "cpu") for x in (rpn_rois_list + rpn_roi_probs_list)] + + # TODO remove this after confirming rpn_max_level/rpn_min_level + # is not needed in CollectRpnProposals. + feature_strides = list(self.anchor_generator.strides) + rpn_min_level = int(math.log2(feature_strides[0])) + rpn_max_level = int(math.log2(feature_strides[-1])) + assert (rpn_max_level - rpn_min_level + 1) == len( + rpn_rois_list + ), "CollectRpnProposals requires continuous levels" + + rpn_rois = torch.ops._caffe2.CollectRpnProposals( + input_list, + # NOTE: in current implementation, rpn_max_level and rpn_min_level + # are not needed, only the subtraction of two matters and it + # can be infer from the number of inputs. Keep them now for + # consistency. 
+ rpn_max_level=2 + len(rpn_rois_list) - 1, + rpn_min_level=2, + rpn_post_nms_topN=rpn_post_nms_topN, + ) + rpn_rois = to_device(rpn_rois, device) + rpn_roi_probs = [] + + proposals = self.c2_postprocess(im_info, rpn_rois, rpn_roi_probs, self.tensor_mode) + return proposals, {} + + @staticmethod + def c2_postprocess(im_info, rpn_rois, rpn_roi_probs, tensor_mode): + proposals = InstancesList( + im_info=im_info, + indices=rpn_rois[:, 0], + extra_fields={ + "proposal_boxes": Caffe2Boxes(rpn_rois), + "objectness_logits": (torch.Tensor, rpn_roi_probs), + }, + ) + if not tensor_mode: + proposals = InstancesList.to_d2_instances_list(proposals) + else: + proposals = [proposals] + return proposals + + +class Caffe2ROIPooler(Caffe2Compatible, poolers.ROIPooler): + @staticmethod + def c2_preprocess(box_lists): + assert all(isinstance(x, Boxes) for x in box_lists) + if all(isinstance(x, Caffe2Boxes) for x in box_lists): + # input is pure-tensor based + assert len(box_lists) == 1 + pooler_fmt_boxes = box_lists[0].tensor + else: + pooler_fmt_boxes = poolers.convert_boxes_to_pooler_format(box_lists) + return pooler_fmt_boxes + + def forward(self, x, box_lists): + assert not self.training + + pooler_fmt_boxes = self.c2_preprocess(box_lists) + num_level_assignments = len(self.level_poolers) + + if num_level_assignments == 1: + if isinstance(self.level_poolers[0], ROIAlignRotated): + c2_roi_align = torch.ops._caffe2.RoIAlignRotated + aligned = True + else: + c2_roi_align = torch.ops._caffe2.RoIAlign + aligned = self.level_poolers[0].aligned + + out = c2_roi_align( + x[0], + pooler_fmt_boxes, + order="NCHW", + spatial_scale=float(self.level_poolers[0].spatial_scale), + pooled_h=int(self.output_size[0]), + pooled_w=int(self.output_size[1]), + sampling_ratio=int(self.level_poolers[0].sampling_ratio), + aligned=aligned, + ) + return out + + device = pooler_fmt_boxes.device + assert ( + self.max_level - self.min_level + 1 == 4 + ), "Currently DistributeFpnProposals only support 4 levels" + fpn_outputs = torch.ops._caffe2.DistributeFpnProposals( + to_device(pooler_fmt_boxes, "cpu"), + roi_canonical_scale=self.canonical_box_size, + roi_canonical_level=self.canonical_level, + roi_max_level=self.max_level, + roi_min_level=self.min_level, + legacy_plus_one=False, + ) + fpn_outputs = [to_device(x, device) for x in fpn_outputs] + + rois_fpn_list = fpn_outputs[:-1] + rois_idx_restore_int32 = fpn_outputs[-1] + + roi_feat_fpn_list = [] + for roi_fpn, x_level, pooler in zip(rois_fpn_list, x, self.level_poolers): + if isinstance(pooler, ROIAlignRotated): + c2_roi_align = torch.ops._caffe2.RoIAlignRotated + aligned = True + else: + c2_roi_align = torch.ops._caffe2.RoIAlign + aligned = bool(pooler.aligned) + + roi_feat_fpn = c2_roi_align( + x_level, + roi_fpn, + order="NCHW", + spatial_scale=float(pooler.spatial_scale), + pooled_h=int(self.output_size[0]), + pooled_w=int(self.output_size[1]), + sampling_ratio=int(pooler.sampling_ratio), + aligned=aligned, + ) + roi_feat_fpn_list.append(roi_feat_fpn) + + roi_feat_shuffled = cat(roi_feat_fpn_list, dim=0) + roi_feat = torch.ops._caffe2.BatchPermutation(roi_feat_shuffled, rois_idx_restore_int32) + return roi_feat + + +class Caffe2FastRCNNOutputsInference: + def __init__(self, tensor_mode): + self.tensor_mode = tensor_mode # whether the output is caffe2 tensor mode + + def __call__(self, box_predictor, predictions, proposals): + """ equivalent to FastRCNNOutputLayers.inference """ + score_thresh = box_predictor.test_score_thresh + nms_thresh = box_predictor.test_nms_thresh + 
topk_per_image = box_predictor.test_topk_per_image + is_rotated = len(box_predictor.box2box_transform.weights) == 5 + + if is_rotated: + box_dim = 5 + assert box_predictor.box2box_transform.weights[4] == 1, ( + "The weights for Rotated BBoxTransform in C2 have only 4 dimensions," + + " thus enforcing the angle weight to be 1 for now" + ) + box2box_transform_weights = box_predictor.box2box_transform.weights[:4] + else: + box_dim = 4 + box2box_transform_weights = box_predictor.box2box_transform.weights + + class_logits, box_regression = predictions + class_prob = F.softmax(class_logits, -1) + + assert box_regression.shape[1] % box_dim == 0 + cls_agnostic_bbox_reg = box_regression.shape[1] // box_dim == 1 + + input_tensor_mode = proposals[0].proposal_boxes.tensor.shape[1] == box_dim + 1 + + rois = type(proposals[0].proposal_boxes).cat([p.proposal_boxes for p in proposals]) + device, dtype = rois.tensor.device, rois.tensor.dtype + if input_tensor_mode: + im_info = proposals[0].image_size + rois = rois.tensor + else: + im_info = torch.Tensor( + [[sz[0], sz[1], 1.0] for sz in [x.image_size for x in proposals]] + ) + batch_ids = cat( + [ + torch.full((b, 1), i, dtype=dtype, device=device) + for i, b in enumerate(len(p) for p in proposals) + ], + dim=0, + ) + rois = torch.cat([batch_ids, rois.tensor], dim=1) + + roi_pred_bbox, roi_batch_splits = torch.ops._caffe2.BBoxTransform( + to_device(rois, "cpu"), + to_device(box_regression, "cpu"), + to_device(im_info, "cpu"), + weights=box2box_transform_weights, + apply_scale=True, + rotated=is_rotated, + angle_bound_on=True, + angle_bound_lo=-180, + angle_bound_hi=180, + clip_angle_thresh=1.0, + legacy_plus_one=False, + ) + roi_pred_bbox = to_device(roi_pred_bbox, device) + roi_batch_splits = to_device(roi_batch_splits, device) + + nms_outputs = torch.ops._caffe2.BoxWithNMSLimit( + to_device(class_prob, "cpu"), + to_device(roi_pred_bbox, "cpu"), + to_device(roi_batch_splits, "cpu"), + score_thresh=float(score_thresh), + nms=float(nms_thresh), + detections_per_im=int(topk_per_image), + soft_nms_enabled=False, + soft_nms_method="linear", + soft_nms_sigma=0.5, + soft_nms_min_score_thres=0.001, + rotated=is_rotated, + cls_agnostic_bbox_reg=cls_agnostic_bbox_reg, + input_boxes_include_bg_cls=False, + output_classes_include_bg_cls=False, + legacy_plus_one=False, + ) + roi_score_nms = to_device(nms_outputs[0], device) + roi_bbox_nms = to_device(nms_outputs[1], device) + roi_class_nms = to_device(nms_outputs[2], device) + roi_batch_splits_nms = to_device(nms_outputs[3], device) + roi_keeps_nms = to_device(nms_outputs[4], device) + roi_keeps_size_nms = to_device(nms_outputs[5], device) + if not self.tensor_mode: + roi_class_nms = roi_class_nms.to(torch.int64) + + roi_batch_ids = cat( + [ + torch.full((b, 1), i, dtype=dtype, device=device) + for i, b in enumerate(int(x.item()) for x in roi_batch_splits_nms) + ], + dim=0, + ) + + roi_class_nms = alias(roi_class_nms, "class_nms") + roi_score_nms = alias(roi_score_nms, "score_nms") + roi_bbox_nms = alias(roi_bbox_nms, "bbox_nms") + roi_batch_splits_nms = alias(roi_batch_splits_nms, "batch_splits_nms") + roi_keeps_nms = alias(roi_keeps_nms, "keeps_nms") + roi_keeps_size_nms = alias(roi_keeps_size_nms, "keeps_size_nms") + + results = InstancesList( + im_info=im_info, + indices=roi_batch_ids[:, 0], + extra_fields={ + "pred_boxes": Caffe2Boxes(roi_bbox_nms), + "scores": roi_score_nms, + "pred_classes": roi_class_nms, + }, + ) + + if not self.tensor_mode: + results = InstancesList.to_d2_instances_list(results) + 
batch_splits = roi_batch_splits_nms.int().tolist() + kept_indices = list(roi_keeps_nms.to(torch.int64).split(batch_splits)) + else: + results = [results] + kept_indices = [roi_keeps_nms] + + return results, kept_indices + + +class Caffe2MaskRCNNInference: + def __call__(self, pred_mask_logits, pred_instances): + """ equivalent to mask_head.mask_rcnn_inference """ + if all(isinstance(x, InstancesList) for x in pred_instances): + assert len(pred_instances) == 1 + mask_probs_pred = pred_mask_logits.sigmoid() + mask_probs_pred = alias(mask_probs_pred, "mask_fcn_probs") + pred_instances[0].pred_masks = mask_probs_pred + else: + mask_rcnn_inference(pred_mask_logits, pred_instances) + + +class Caffe2KeypointRCNNInference: + def __init__(self, use_heatmap_max_keypoint): + self.use_heatmap_max_keypoint = use_heatmap_max_keypoint + + def __call__(self, pred_keypoint_logits, pred_instances): + # just return the keypoint heatmap for now, + # there will be option to call HeatmapMaxKeypointOp + output = alias(pred_keypoint_logits, "kps_score") + if all(isinstance(x, InstancesList) for x in pred_instances): + assert len(pred_instances) == 1 + if self.use_heatmap_max_keypoint: + device = output.device + output = torch.ops._caffe2.HeatmapMaxKeypoint( + to_device(output, "cpu"), + pred_instances[0].pred_boxes.tensor, + should_output_softmax=True, # worth make it configerable? + ) + output = to_device(output, device) + output = alias(output, "keypoints_out") + pred_instances[0].pred_keypoints = output + return pred_keypoint_logits diff --git a/detectron2/export/caffe2_export.py b/detectron2/export/caffe2_export.py new file mode 100644 index 0000000000000000000000000000000000000000..ccac809d7bf49ab144b5f0a34f57e00c3534ad60 --- /dev/null +++ b/detectron2/export/caffe2_export.py @@ -0,0 +1,204 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import copy +import io +import logging +import numpy as np +from typing import List +import onnx +import torch +from caffe2.proto import caffe2_pb2 +from caffe2.python import core +from caffe2.python.onnx.backend import Caffe2Backend +from tabulate import tabulate +from termcolor import colored +from torch.onnx import OperatorExportTypes + +from .shared import ( + ScopedWS, + construct_init_net_from_params, + fuse_alias_placeholder, + fuse_copy_between_cpu_and_gpu, + get_params_from_init_net, + group_norm_replace_aten_with_caffe2, + infer_device_type, + remove_dead_end_ops, + remove_reshape_for_fc, + save_graph, +) + +logger = logging.getLogger(__name__) + + +def export_onnx_model(model, inputs): + """ + Trace and export a model to onnx format. 
+ + Args: + model (nn.Module): + inputs (tuple[args]): the model will be called by `model(*inputs)` + + Returns: + an onnx model + """ + assert isinstance(model, torch.nn.Module) + + # make sure all modules are in eval mode, onnx may change the training state + # of the module if the states are not consistent + def _check_eval(module): + assert not module.training + + model.apply(_check_eval) + + # Export the model to ONNX + with torch.no_grad(): + with io.BytesIO() as f: + torch.onnx.export( + model, + inputs, + f, + operator_export_type=OperatorExportTypes.ONNX_ATEN_FALLBACK, + # verbose=True, # NOTE: uncomment this for debugging + # export_params=True, + ) + onnx_model = onnx.load_from_string(f.getvalue()) + + # Apply ONNX's Optimization + all_passes = onnx.optimizer.get_available_passes() + passes = ["fuse_bn_into_conv"] + assert all(p in all_passes for p in passes) + onnx_model = onnx.optimizer.optimize(onnx_model, passes) + return onnx_model + + +def _op_stats(net_def): + type_count = {} + for t in [op.type for op in net_def.op]: + type_count[t] = type_count.get(t, 0) + 1 + type_count_list = sorted(type_count.items(), key=lambda kv: kv[0]) # alphabet + type_count_list = sorted(type_count_list, key=lambda kv: -kv[1]) # count + return "\n".join("{:>4}x {}".format(count, name) for name, count in type_count_list) + + +def _assign_device_option( + predict_net: caffe2_pb2.NetDef, init_net: caffe2_pb2.NetDef, tensor_inputs: List[torch.Tensor] +): + """ + ONNX exported network doesn't have concept of device, assign necessary + device option for each op in order to make it runable on GPU runtime. + """ + + def _get_device_type(torch_tensor): + assert torch_tensor.device.type in ["cpu", "cuda"] + assert torch_tensor.device.index == 0 + return torch_tensor.device.type + + def _assign_op_device_option(net_proto, net_ssa, blob_device_types): + for op, ssa_i in zip(net_proto.op, net_ssa): + if op.type in ["CopyCPUToGPU", "CopyGPUToCPU"]: + op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0)) + else: + devices = [blob_device_types[b] for b in ssa_i[0] + ssa_i[1]] + assert all(d == devices[0] for d in devices) + if devices[0] == "cuda": + op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0)) + + # update ops in predict_net + predict_net_input_device_types = { + (name, 0): _get_device_type(tensor) + for name, tensor in zip(predict_net.external_input, tensor_inputs) + } + predict_net_device_types = infer_device_type( + predict_net, known_status=predict_net_input_device_types, device_name_style="pytorch" + ) + predict_net_ssa, _ = core.get_ssa(predict_net) + _assign_op_device_option(predict_net, predict_net_ssa, predict_net_device_types) + + # update ops in init_net + init_net_ssa, versions = core.get_ssa(init_net) + init_net_output_device_types = { + (name, versions[name]): predict_net_device_types[(name, 0)] + for name in init_net.external_output + } + init_net_device_types = infer_device_type( + init_net, known_status=init_net_output_device_types, device_name_style="pytorch" + ) + _assign_op_device_option(init_net, init_net_ssa, init_net_device_types) + + +def export_caffe2_detection_model(model: torch.nn.Module, tensor_inputs: List[torch.Tensor]): + """ + Export a caffe2-compatible Detectron2 model to caffe2 format via ONNX. + + Arg: + model: a caffe2-compatible version of detectron2 model, defined in caffe2_modeling.py + tensor_inputs: a list of tensors that caffe2 model takes as input. 
+ """ + model = copy.deepcopy(model) + assert isinstance(model, torch.nn.Module) + assert hasattr(model, "encode_additional_info") + + # Export via ONNX + logger.info("Exporting a {} model via ONNX ...".format(type(model).__name__)) + onnx_model = export_onnx_model(model, (tensor_inputs,)) + # Convert ONNX model to Caffe2 protobuf + init_net, predict_net = Caffe2Backend.onnx_graph_to_caffe2_net(onnx_model) + ops_table = [[op.type, op.input, op.output] for op in predict_net.op] + table = tabulate(ops_table, headers=["type", "input", "output"], tablefmt="pipe") + logger.info( + "ONNX export Done. Exported predict_net (before optimizations):\n" + colored(table, "cyan") + ) + + # Apply protobuf optimization + fuse_alias_placeholder(predict_net, init_net) + if any(t.device.type != "cpu" for t in tensor_inputs): + fuse_copy_between_cpu_and_gpu(predict_net) + remove_dead_end_ops(init_net) + _assign_device_option(predict_net, init_net, tensor_inputs) + params, device_options = get_params_from_init_net(init_net) + predict_net, params = remove_reshape_for_fc(predict_net, params) + init_net = construct_init_net_from_params(params, device_options) + group_norm_replace_aten_with_caffe2(predict_net) + + # Record necessary information for running the pb model in Detectron2 system. + model.encode_additional_info(predict_net, init_net) + + logger.info("Operators used in predict_net: \n{}".format(_op_stats(predict_net))) + logger.info("Operators used in init_net: \n{}".format(_op_stats(init_net))) + + return predict_net, init_net + + +def run_and_save_graph(predict_net, init_net, tensor_inputs, graph_save_path): + """ + Run the caffe2 model on given inputs, recording the shape and draw the graph. + + predict_net/init_net: caffe2 model. + tensor_inputs: a list of tensors that caffe2 model takes as input. + graph_save_path: path for saving graph of exported model. + """ + + logger.info("Saving graph of ONNX exported model to {} ...".format(graph_save_path)) + save_graph(predict_net, graph_save_path, op_only=False) + + # Run the exported Caffe2 net + logger.info("Running ONNX exported model ...") + with ScopedWS("__ws_tmp__", True) as ws: + ws.RunNetOnce(init_net) + initialized_blobs = set(ws.Blobs()) + uninitialized = [inp for inp in predict_net.external_input if inp not in initialized_blobs] + for name, blob in zip(uninitialized, tensor_inputs): + ws.FeedBlob(name, blob) + + try: + ws.RunNetOnce(predict_net) + except RuntimeError as e: + logger.warning("Encountered RuntimeError: \n{}".format(str(e))) + + ws_blobs = {b: ws.FetchBlob(b) for b in ws.Blobs()} + blob_sizes = {b: ws_blobs[b].shape for b in ws_blobs if isinstance(ws_blobs[b], np.ndarray)} + + logger.info("Saving graph with blob shapes to {} ...".format(graph_save_path)) + save_graph(predict_net, graph_save_path, op_only=False, blob_sizes=blob_sizes) + + return ws_blobs diff --git a/detectron2/export/caffe2_inference.py b/detectron2/export/caffe2_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..92718d04031b4513c2324ad596eae9cdbfa7c75e --- /dev/null +++ b/detectron2/export/caffe2_inference.py @@ -0,0 +1,136 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
+
+import collections
+import logging
+import numpy as np
+import torch
+from caffe2.proto import caffe2_pb2
+from caffe2.python import core
+
+from .caffe2_modeling import META_ARCH_CAFFE2_EXPORT_TYPE_MAP, convert_batched_inputs_to_c2_format
+from .shared import ScopedWS, get_pb_arg_vali, get_pb_arg_vals, infer_device_type
+
+logger = logging.getLogger(__name__)
+
+
+class ProtobufModel(torch.nn.Module):
+ """
+ A class that works just like an nn.Module for inference, but runs a
+ caffe2 model under the hood. Input/Output are Dict[str, tensor] whose keys
+ are in external_input/output.
+ """
+
+ def __init__(self, predict_net, init_net):
+ logger.info("Initializing ProtobufModel ...")
+ super().__init__()
+ assert isinstance(predict_net, caffe2_pb2.NetDef)
+ assert isinstance(init_net, caffe2_pb2.NetDef)
+ self.ws_name = "__ws_tmp__"
+ self.net = core.Net(predict_net)
+
+ with ScopedWS(self.ws_name, is_reset=True, is_cleanup=False) as ws:
+ ws.RunNetOnce(init_net)
+ for blob in self.net.Proto().external_input:
+ if blob not in ws.Blobs():
+ ws.CreateBlob(blob)
+ ws.CreateNet(self.net)
+
+ self._error_msgs = set()
+
+ def forward(self, inputs_dict):
+ assert all(inp in self.net.Proto().external_input for inp in inputs_dict)
+ with ScopedWS(self.ws_name, is_reset=False, is_cleanup=False) as ws:
+ for b, tensor in inputs_dict.items():
+ ws.FeedBlob(b, tensor)
+ try:
+ ws.RunNet(self.net.Proto().name)
+ except RuntimeError as e:
+ if str(e) not in self._error_msgs:
+ self._error_msgs.add(str(e))
+ logger.warning("Encountered new RuntimeError: \n{}".format(str(e)))
+ logger.warning("Catching the error and using partial results.")
+
+ outputs_dict = collections.OrderedDict(
+ [(b, ws.FetchBlob(b)) for b in self.net.Proto().external_output]
+ )
+ # Remove outputs of the current run. This is necessary to prevent
+ # fetching results from a previous run if the model fails in the middle.
+ for b in self.net.Proto().external_output:
+ # An uninitialized blob needs to be created to keep the net runnable.
+ # This is "equivalent" to: ws.RemoveBlob(b) then ws.CreateBlob(b),
+ # but there's no such API.
+ ws.FeedBlob(b, "{}, a C++ native class of type nullptr (uninitialized).".format(b))
+
+ return outputs_dict
+
+
+class ProtobufDetectionModel(torch.nn.Module):
+ """
+ A class that works just like a pytorch meta arch for inference, but runs a
+ caffe2 model under the hood.
+ """
+
+ def __init__(self, predict_net, init_net, *, convert_outputs=None):
+ """
+ Args:
+ predict_net, init_net (core.Net): caffe2 nets
+ convert_outputs (callable): a function that converts caffe2
+ outputs to the same format as the original pytorch model.
+ By default, use the one defined in the caffe2 meta_arch.
+ """ + super().__init__() + self.protobuf_model = ProtobufModel(predict_net, init_net) + self.size_divisibility = get_pb_arg_vali(predict_net, "size_divisibility", 0) + self.device = get_pb_arg_vals(predict_net, "device", b"cpu").decode("ascii") + + if convert_outputs is None: + meta_arch = get_pb_arg_vals(predict_net, "meta_architecture", b"GeneralizedRCNN") + meta_arch = META_ARCH_CAFFE2_EXPORT_TYPE_MAP[meta_arch.decode("ascii")] + self._convert_outputs = meta_arch.get_outputs_converter(predict_net, init_net) + else: + self._convert_outputs = convert_outputs + + def _infer_output_devices(self, inputs_dict): + def _get_device_type(torch_tensor): + assert torch_tensor.device.type in ["cpu", "cuda"] + assert torch_tensor.device.index == 0 + return torch_tensor.device.type + + predict_net = self.protobuf_model.net.Proto() + input_device_types = { + (name, 0): _get_device_type(tensor) for name, tensor in inputs_dict.items() + } + device_type_map = infer_device_type( + predict_net, known_status=input_device_types, device_name_style="pytorch" + ) + ssa, versions = core.get_ssa(predict_net) + versioned_outputs = [(name, versions[name]) for name in predict_net.external_output] + output_devices = [device_type_map[outp] for outp in versioned_outputs] + return output_devices + + def _convert_inputs(self, batched_inputs): + # currently all models convert inputs in the same way + data, im_info = convert_batched_inputs_to_c2_format( + batched_inputs, self.size_divisibility, self.device + ) + return {"data": data, "im_info": im_info} + + def forward(self, batched_inputs): + c2_inputs = self._convert_inputs(batched_inputs) + c2_results = self.protobuf_model(c2_inputs) + + if any(t.device.type != "cpu" for _, t in c2_inputs.items()): + output_devices = self._infer_output_devices(c2_inputs) + else: + output_devices = ["cpu" for _ in self.protobuf_model.net.Proto().external_output] + + def _cast_caffe2_blob_to_torch_tensor(blob, device): + return torch.Tensor(blob).to(device) if isinstance(blob, np.ndarray) else None + + c2_results = { + name: _cast_caffe2_blob_to_torch_tensor(c2_results[name], device) + for name, device in zip(self.protobuf_model.net.Proto().external_output, output_devices) + } + + return self._convert_outputs(batched_inputs, c2_inputs, c2_results) diff --git a/detectron2/export/caffe2_modeling.py b/detectron2/export/caffe2_modeling.py new file mode 100644 index 0000000000000000000000000000000000000000..1732b322c75abc3ac178d61d31cdec4cdcd61dfd --- /dev/null +++ b/detectron2/export/caffe2_modeling.py @@ -0,0 +1,493 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
+
+import functools
+import io
+import struct
+import types
+import torch
+
+from detectron2.modeling import meta_arch
+from detectron2.modeling.box_regression import Box2BoxTransform
+from detectron2.modeling.meta_arch.panoptic_fpn import combine_semantic_and_instance_outputs
+from detectron2.modeling.postprocessing import detector_postprocess, sem_seg_postprocess
+from detectron2.modeling.roi_heads import keypoint_head
+from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes
+
+from .c10 import Caffe2Compatible
+from .patcher import ROIHeadsPatcher, patch_generalized_rcnn
+from .shared import (
+ alias,
+ check_set_pb_arg,
+ get_pb_arg_floats,
+ get_pb_arg_valf,
+ get_pb_arg_vali,
+ get_pb_arg_vals,
+ mock_torch_nn_functional_interpolate,
+)
+
+
+def assemble_rcnn_outputs_by_name(image_sizes, tensor_outputs, force_mask_on=False):
+ """
+ A function to assemble caffe2 model's outputs (i.e. Dict[str, Tensor])
+ into detectron2's format (i.e. a list of Instances).
+ This only works when the model follows Caffe2 Detectron's naming convention.
+
+ Args:
+ image_sizes (List[List[int, int]]): [H, W] of every image.
+ tensor_outputs (Dict[str, Tensor]): maps each external_output name to its tensor.
+
+ force_mask_on (Bool): if true, it makes sure there will be pred_masks even
+ if no mask is found in tensor_outputs (usually due to a model crash)
+ """
+
+ results = [Instances(image_size) for image_size in image_sizes]
+
+ batch_splits = tensor_outputs.get("batch_splits", None)
+ if batch_splits:
+ raise NotImplementedError()
+ assert len(image_sizes) == 1
+ result = results[0]
+
+ bbox_nms = tensor_outputs["bbox_nms"]
+ score_nms = tensor_outputs["score_nms"]
+ class_nms = tensor_outputs["class_nms"]
+ # Detection will always succeed because Conv supports 0-batch
+ assert bbox_nms is not None
+ assert score_nms is not None
+ assert class_nms is not None
+ if bbox_nms.shape[1] == 5:
+ result.pred_boxes = RotatedBoxes(bbox_nms)
+ else:
+ result.pred_boxes = Boxes(bbox_nms)
+ result.scores = score_nms
+ result.pred_classes = class_nms.to(torch.int64)
+
+ mask_fcn_probs = tensor_outputs.get("mask_fcn_probs", None)
+ if mask_fcn_probs is not None:
+ # finish the mask pred
+ mask_probs_pred = mask_fcn_probs
+ num_masks = mask_probs_pred.shape[0]
+ class_pred = result.pred_classes
+ indices = torch.arange(num_masks, device=class_pred.device)
+ mask_probs_pred = mask_probs_pred[indices, class_pred][:, None]
+ result.pred_masks = mask_probs_pred
+ elif force_mask_on:
+ # NOTE: there's no way to know the height/width of the mask here, it won't be
+ # used anyway when the batch size is 0, so just set them to 0.
+ result.pred_masks = torch.zeros([0, 1, 0, 0], dtype=torch.uint8)
+
+ keypoints_out = tensor_outputs.get("keypoints_out", None)
+ kps_score = tensor_outputs.get("kps_score", None)
+ if keypoints_out is not None:
+ # keypoints_out: [N, 4, #keypoints], where 4 is in order of (x, y, score, prob)
+ keypoints_tensor = keypoints_out
+ # NOTE: it's possible that prob is not calculated if "should_output_softmax"
+ # is set to False in HeatmapMaxKeypoint, so just using the raw score; it seems
+ # it doesn't affect mAP. TODO: check more carefully.
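+ # Transpose [N, 4, #keypoints] to [N, #keypoints, 4] and keep (x, y, score) for pred_keypoints.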
+ keypoint_xyp = keypoints_tensor.transpose(1, 2)[:, :, [0, 1, 2]] + result.pred_keypoints = keypoint_xyp + elif kps_score is not None: + # keypoint heatmap to sparse data structure + pred_keypoint_logits = kps_score + keypoint_head.keypoint_rcnn_inference(pred_keypoint_logits, [result]) + + return results + + +def _cast_to_f32(f64): + return struct.unpack("f", struct.pack("f", f64))[0] + + +def set_caffe2_compatible_tensor_mode(model, enable=True): + def _fn(m): + if isinstance(m, Caffe2Compatible): + m.tensor_mode = enable + + model.apply(_fn) + + +def convert_batched_inputs_to_c2_format(batched_inputs, size_divisibility, device): + """ + See get_caffe2_inputs() below. + """ + assert all(isinstance(x, dict) for x in batched_inputs) + assert all(x["image"].dim() == 3 for x in batched_inputs) + + images = [x["image"] for x in batched_inputs] + images = ImageList.from_tensors(images, size_divisibility) + + im_info = [] + for input_per_image, image_size in zip(batched_inputs, images.image_sizes): + target_height = input_per_image.get("height", image_size[0]) + target_width = input_per_image.get("width", image_size[1]) # noqa + # NOTE: The scale inside im_info is kept as convention and for providing + # post-processing information if further processing is needed. For + # current Caffe2 model definitions that don't include post-processing inside + # the model, this number is not used. + # NOTE: There can be a slight difference between width and height + # scales, using a single number can results in numerical difference + # compared with D2's post-processing. + scale = target_height / image_size[0] + im_info.append([image_size[0], image_size[1], scale]) + im_info = torch.Tensor(im_info) + + return images.tensor.to(device), im_info.to(device) + + +class Caffe2MetaArch(Caffe2Compatible, torch.nn.Module): + """ + Base class for caffe2-compatible implementation of a meta architecture. + The forward is traceable and its traced graph can be converted to caffe2 + graph through ONNX. + """ + + def __init__(self, cfg, torch_model): + """ + Args: + cfg (CfgNode): + torch_model (nn.Module): the detectron2 model (meta_arch) to be + converted. + """ + super().__init__() + self._wrapped_model = torch_model + self.eval() + set_caffe2_compatible_tensor_mode(self, True) + + def get_caffe2_inputs(self, batched_inputs): + """ + Convert pytorch-style structured inputs to caffe2-style inputs that + are tuples of tensors. + + Args: + batched_inputs (list[dict]): inputs to a detectron2 model + in its standard format. Each dict has "image" (CHW tensor), and optionally + "height" and "width". + + Returns: + tuple[Tensor]: + tuple of tensors that will be the inputs to the + :meth:`forward` method. For existing models, the first + is an NCHW tensor (padded and batched); the second is + a im_info Nx3 tensor, where the rows are + (height, width, unused legacy parameter) + """ + return convert_batched_inputs_to_c2_format( + batched_inputs, + self._wrapped_model.backbone.size_divisibility, + self._wrapped_model.device, + ) + + def encode_additional_info(self, predict_net, init_net): + """ + Save extra metadata that will be used by inference in the output protobuf. + """ + pass + + def forward(self, inputs): + """ + Run the forward in caffe2-style. It has to use caffe2-compatible ops + and the method will be used for tracing. + + Args: + inputs (tuple[Tensor]): inputs defined by :meth:`get_caffe2_input`. + They will be the inputs of the converted caffe2 graph. + + Returns: + tuple[Tensor]: output tensors. 
They will be the outputs of the + converted caffe2 graph. + """ + raise NotImplementedError + + def _caffe2_preprocess_image(self, inputs): + """ + Caffe2 implementation of preprocess_image, which is called inside each MetaArch's forward. + It normalizes the input images, and the final caffe2 graph assumes the + inputs have been batched already. + """ + data, im_info = inputs + data = alias(data, "data") + im_info = alias(im_info, "im_info") + mean, std = self._wrapped_model.pixel_mean, self._wrapped_model.pixel_std + normalized_data = (data - mean) / std + normalized_data = alias(normalized_data, "normalized_data") + + # Pack (data, im_info) into ImageList which is recognized by self.inference. + images = ImageList(tensor=normalized_data, image_sizes=im_info) + return images + + @staticmethod + def get_outputs_converter(predict_net, init_net): + """ + Creates a function that converts outputs of the caffe2 model to + detectron2's standard format. + The function uses information in `predict_net` and `init_net` that are + available at inferene time. Therefore the function logic can be used in inference. + + The returned function has the following signature: + + def convert(batched_inputs, c2_inputs, c2_results) -> detectron2_outputs + + Where + + * batched_inputs (list[dict]): the original input format of the meta arch + * c2_inputs (dict[str, Tensor]): the caffe2 inputs. + * c2_results (dict[str, Tensor]): the caffe2 output format, + corresponding to the outputs of the :meth:`forward` function. + * detectron2_outputs: the original output format of the meta arch. + + This function can be used to compare the outputs of the original meta arch and + the converted caffe2 graph. + + Returns: + callable: a callable of the above signature. + """ + raise NotImplementedError + + +class Caffe2GeneralizedRCNN(Caffe2MetaArch): + def __init__(self, cfg, torch_model): + assert isinstance(torch_model, meta_arch.GeneralizedRCNN) + torch_model = patch_generalized_rcnn(torch_model) + super().__init__(cfg, torch_model) + + self.roi_heads_patcher = ROIHeadsPatcher(cfg, self._wrapped_model.roi_heads) + + def encode_additional_info(self, predict_net, init_net): + size_divisibility = self._wrapped_model.backbone.size_divisibility + check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility) + check_set_pb_arg( + predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii") + ) + check_set_pb_arg(predict_net, "meta_architecture", "s", b"GeneralizedRCNN") + + @mock_torch_nn_functional_interpolate() + def forward(self, inputs): + if not self.tensor_mode: + return self._wrapped_model.inference(inputs) + images = self._caffe2_preprocess_image(inputs) + features = self._wrapped_model.backbone(images.tensor) + proposals, _ = self._wrapped_model.proposal_generator(images, features) + with self.roi_heads_patcher.mock_roi_heads(): + detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals) + return tuple(detector_results[0].flatten()) + + @staticmethod + def get_outputs_converter(predict_net, init_net): + def f(batched_inputs, c2_inputs, c2_results): + image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]] + results = assemble_rcnn_outputs_by_name(image_sizes, c2_results) + return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes) + + return f + + +class Caffe2PanopticFPN(Caffe2MetaArch): + def __init__(self, cfg, torch_model): + assert isinstance(torch_model, meta_arch.PanopticFPN) + torch_model = 
patch_generalized_rcnn(torch_model) + super().__init__(cfg, torch_model) + + self.roi_heads_patcher = ROIHeadsPatcher(cfg, self._wrapped_model.roi_heads) + + @mock_torch_nn_functional_interpolate() + def forward(self, inputs): + assert self.tensor_mode + images = self._caffe2_preprocess_image(inputs) + features = self._wrapped_model.backbone(images.tensor) + + sem_seg_results, _ = self._wrapped_model.sem_seg_head(features) + sem_seg_results = alias(sem_seg_results, "sem_seg") + + proposals, _ = self._wrapped_model.proposal_generator(images, features) + + with self.roi_heads_patcher.mock_roi_heads(self.tensor_mode): + detector_results, _ = self._wrapped_model.roi_heads(images, features, proposals) + + return tuple(detector_results[0].flatten()) + (sem_seg_results,) + + def encode_additional_info(self, predict_net, init_net): + size_divisibility = self._wrapped_model.backbone.size_divisibility + check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility) + check_set_pb_arg( + predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii") + ) + check_set_pb_arg(predict_net, "meta_architecture", "s", b"PanopticFPN") + + # Inference parameters: + check_set_pb_arg(predict_net, "combine_on", "i", self._wrapped_model.combine_on) + check_set_pb_arg( + predict_net, + "combine_overlap_threshold", + "f", + _cast_to_f32(self._wrapped_model.combine_overlap_threshold), + ) + check_set_pb_arg( + predict_net, + "combine_stuff_area_limit", + "i", + self._wrapped_model.combine_stuff_area_limit, + ) + check_set_pb_arg( + predict_net, + "combine_instances_confidence_threshold", + "f", + _cast_to_f32(self._wrapped_model.combine_instances_confidence_threshold), + ) + + @staticmethod + def get_outputs_converter(predict_net, init_net): + combine_on = get_pb_arg_vali(predict_net, "combine_on", None) + combine_overlap_threshold = get_pb_arg_valf(predict_net, "combine_overlap_threshold", None) + combine_stuff_area_limit = get_pb_arg_vali(predict_net, "combine_stuff_area_limit", None) + combine_instances_confidence_threshold = get_pb_arg_valf( + predict_net, "combine_instances_confidence_threshold", None + ) + + def f(batched_inputs, c2_inputs, c2_results): + image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]] + detector_results = assemble_rcnn_outputs_by_name( + image_sizes, c2_results, force_mask_on=True + ) + sem_seg_results = c2_results["sem_seg"] + + # copied from meta_arch/panoptic_fpn.py ... 
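+ # For each image: resize the sem_seg / instance predictions back to the original
+ # image size, then optionally fuse them into a panoptic prediction when combine_on is set.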
+ processed_results = [] + for sem_seg_result, detector_result, input_per_image, image_size in zip( + sem_seg_results, detector_results, batched_inputs, image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width) + detector_r = detector_postprocess(detector_result, height, width) + + processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r}) + + if combine_on: + panoptic_r = combine_semantic_and_instance_outputs( + detector_r, + sem_seg_r.argmax(dim=0), + combine_overlap_threshold, + combine_stuff_area_limit, + combine_instances_confidence_threshold, + ) + processed_results[-1]["panoptic_seg"] = panoptic_r + return processed_results + + return f + + +class Caffe2RetinaNet(Caffe2MetaArch): + def __init__(self, cfg, torch_model): + assert isinstance(torch_model, meta_arch.RetinaNet) + super().__init__(cfg, torch_model) + + @mock_torch_nn_functional_interpolate() + def forward(self, inputs): + assert self.tensor_mode + images = self._caffe2_preprocess_image(inputs) + + # explicitly return the images sizes to avoid removing "im_info" by ONNX + # since it's not used in the forward path + return_tensors = [images.image_sizes] + + features = self._wrapped_model.backbone(images.tensor) + features = [features[f] for f in self._wrapped_model.in_features] + for i, feature_i in enumerate(features): + features[i] = alias(feature_i, "feature_{}".format(i), is_backward=True) + return_tensors.append(features[i]) + + box_cls, box_delta = self._wrapped_model.head(features) + for i, (box_cls_i, box_delta_i) in enumerate(zip(box_cls, box_delta)): + return_tensors.append(alias(box_cls_i, "box_cls_{}".format(i))) + return_tensors.append(alias(box_delta_i, "box_delta_{}".format(i))) + + return tuple(return_tensors) + + def encode_additional_info(self, predict_net, init_net): + size_divisibility = self._wrapped_model.backbone.size_divisibility + check_set_pb_arg(predict_net, "size_divisibility", "i", size_divisibility) + check_set_pb_arg( + predict_net, "device", "s", str.encode(str(self._wrapped_model.device), "ascii") + ) + check_set_pb_arg(predict_net, "meta_architecture", "s", b"RetinaNet") + + # Inference parameters: + check_set_pb_arg( + predict_net, "score_threshold", "f", _cast_to_f32(self._wrapped_model.score_threshold) + ) + check_set_pb_arg(predict_net, "topk_candidates", "i", self._wrapped_model.topk_candidates) + check_set_pb_arg( + predict_net, "nms_threshold", "f", _cast_to_f32(self._wrapped_model.nms_threshold) + ) + check_set_pb_arg( + predict_net, + "max_detections_per_image", + "i", + self._wrapped_model.max_detections_per_image, + ) + + check_set_pb_arg( + predict_net, + "bbox_reg_weights", + "floats", + [_cast_to_f32(w) for w in self._wrapped_model.box2box_transform.weights], + ) + self._encode_anchor_generator_cfg(predict_net) + + def _encode_anchor_generator_cfg(self, predict_net): + # serialize anchor_generator for future use + serialized_anchor_generator = io.BytesIO() + torch.save(self._wrapped_model.anchor_generator, serialized_anchor_generator) + # Ideally we can put anchor generating inside the model, then we don't + # need to store this information. 
+ bytes = serialized_anchor_generator.getvalue() + check_set_pb_arg(predict_net, "serialized_anchor_generator", "s", bytes) + + @staticmethod + def get_outputs_converter(predict_net, init_net): + self = types.SimpleNamespace() + serialized_anchor_generator = io.BytesIO( + get_pb_arg_vals(predict_net, "serialized_anchor_generator", None) + ) + self.anchor_generator = torch.load(serialized_anchor_generator) + bbox_reg_weights = get_pb_arg_floats(predict_net, "bbox_reg_weights", None) + self.box2box_transform = Box2BoxTransform(weights=tuple(bbox_reg_weights)) + self.score_threshold = get_pb_arg_valf(predict_net, "score_threshold", None) + self.topk_candidates = get_pb_arg_vali(predict_net, "topk_candidates", None) + self.nms_threshold = get_pb_arg_valf(predict_net, "nms_threshold", None) + self.max_detections_per_image = get_pb_arg_vali( + predict_net, "max_detections_per_image", None + ) + + # hack to reuse inference code from RetinaNet + self.inference = functools.partial(meta_arch.RetinaNet.inference, self) + self.inference_single_image = functools.partial( + meta_arch.RetinaNet.inference_single_image, self + ) + + def f(batched_inputs, c2_inputs, c2_results): + image_sizes = [[int(im[0]), int(im[1])] for im in c2_inputs["im_info"]] + + num_features = len([x for x in c2_results.keys() if x.startswith("box_cls_")]) + box_cls = [c2_results["box_cls_{}".format(i)] for i in range(num_features)] + box_delta = [c2_results["box_delta_{}".format(i)] for i in range(num_features)] + + # For each feature level, feature should have the same batch size and + # spatial dimension as the box_cls and box_delta. + dummy_features = [box_delta[i].clone()[:, 0:0, :, :] for i in range(num_features)] + anchors = self.anchor_generator(dummy_features) + + # self.num_classess can be inferred + self.num_classes = box_cls[0].shape[1] // (box_delta[0].shape[1] // 4) + + results = self.inference(box_cls, box_delta, anchors, image_sizes) + return meta_arch.GeneralizedRCNN._postprocess(results, batched_inputs, image_sizes) + + return f + + +META_ARCH_CAFFE2_EXPORT_TYPE_MAP = { + "GeneralizedRCNN": Caffe2GeneralizedRCNN, + "PanopticFPN": Caffe2PanopticFPN, + "RetinaNet": Caffe2RetinaNet, +} diff --git a/detectron2/export/patcher.py b/detectron2/export/patcher.py new file mode 100644 index 0000000000000000000000000000000000000000..3f0b0fd8122d12c10d06cfc1b0720e3c3374c737 --- /dev/null +++ b/detectron2/export/patcher.py @@ -0,0 +1,153 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import contextlib +import mock +import torch + +from detectron2.modeling import poolers +from detectron2.modeling.proposal_generator import rpn +from detectron2.modeling.roi_heads import keypoint_head, mask_head +from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers + +from .c10 import ( + Caffe2Compatible, + Caffe2FastRCNNOutputsInference, + Caffe2KeypointRCNNInference, + Caffe2MaskRCNNInference, + Caffe2ROIPooler, + Caffe2RPN, +) + + +class GenericMixin(object): + pass + + +class Caffe2CompatibleConverter(object): + """ + A GenericUpdater which implements the `create_from` interface, by modifying + module object and assign it with another class replaceCls. 
+ """ + + def __init__(self, replaceCls): + self.replaceCls = replaceCls + + def create_from(self, module): + # update module's class to the new class + assert isinstance(module, torch.nn.Module) + if issubclass(self.replaceCls, GenericMixin): + # replaceCls should act as mixin, create a new class on-the-fly + new_class = type( + "{}MixedWith{}".format(self.replaceCls.__name__, module.__class__.__name__), + (self.replaceCls, module.__class__), + {}, # {"new_method": lambda self: ...}, + ) + module.__class__ = new_class + else: + # replaceCls is complete class, this allow arbitrary class swap + module.__class__ = self.replaceCls + + # initialize Caffe2Compatible + if isinstance(module, Caffe2Compatible): + module.tensor_mode = False + + return module + + +def patch(model, target, updater, *args, **kwargs): + """ + recursively (post-order) update all modules with the target type and its + subclasses, make a initialization/composition/inheritance/... via the + updater.create_from. + """ + for name, module in model.named_children(): + model._modules[name] = patch(module, target, updater, *args, **kwargs) + if isinstance(model, target): + return updater.create_from(model, *args, **kwargs) + return model + + +def patch_generalized_rcnn(model): + ccc = Caffe2CompatibleConverter + model = patch(model, rpn.RPN, ccc(Caffe2RPN)) + model = patch(model, poolers.ROIPooler, ccc(Caffe2ROIPooler)) + + return model + + +@contextlib.contextmanager +def mock_fastrcnn_outputs_inference( + tensor_mode, check=True, box_predictor_type=FastRCNNOutputLayers +): + with mock.patch.object( + box_predictor_type, + "inference", + autospec=True, + side_effect=Caffe2FastRCNNOutputsInference(tensor_mode), + ) as mocked_func: + yield + if check: + assert mocked_func.call_count > 0 + + +@contextlib.contextmanager +def mock_mask_rcnn_inference(tensor_mode, patched_module, check=True): + with mock.patch( + "{}.mask_rcnn_inference".format(patched_module), side_effect=Caffe2MaskRCNNInference() + ) as mocked_func: + yield + if check: + assert mocked_func.call_count > 0 + + +@contextlib.contextmanager +def mock_keypoint_rcnn_inference(tensor_mode, patched_module, use_heatmap_max_keypoint, check=True): + with mock.patch( + "{}.keypoint_rcnn_inference".format(patched_module), + side_effect=Caffe2KeypointRCNNInference(use_heatmap_max_keypoint), + ) as mocked_func: + yield + if check: + assert mocked_func.call_count > 0 + + +class ROIHeadsPatcher: + def __init__(self, cfg, heads): + self.heads = heads + + self.use_heatmap_max_keypoint = cfg.EXPORT_CAFFE2.USE_HEATMAP_MAX_KEYPOINT + + @contextlib.contextmanager + def mock_roi_heads(self, tensor_mode=True): + """ + Patching several inference functions inside ROIHeads and its subclasses + + Args: + tensor_mode (bool): whether the inputs/outputs are caffe2's tensor + format or not. Default to True. + """ + # NOTE: this requries the `keypoint_rcnn_inference` and `mask_rcnn_inference` + # are called inside the same file as BaseXxxHead due to using mock.patch. 
+ kpt_heads_mod = keypoint_head.BaseKeypointRCNNHead.__module__ + mask_head_mod = mask_head.BaseMaskRCNNHead.__module__ + + mock_ctx_managers = [ + mock_fastrcnn_outputs_inference( + tensor_mode=tensor_mode, + check=True, + box_predictor_type=type(self.heads.box_predictor), + ) + ] + if getattr(self.heads, "keypoint_on", False): + mock_ctx_managers += [ + mock_keypoint_rcnn_inference( + tensor_mode, kpt_heads_mod, self.use_heatmap_max_keypoint + ) + ] + if getattr(self.heads, "mask_on", False): + mock_ctx_managers += [mock_mask_rcnn_inference(tensor_mode, mask_head_mod)] + + with contextlib.ExitStack() as stack: # python 3.3+ + for mgr in mock_ctx_managers: + stack.enter_context(mgr) + yield diff --git a/detectron2/export/shared.py b/detectron2/export/shared.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7ffeb098f21178660572830164126fab63e0e1 --- /dev/null +++ b/detectron2/export/shared.py @@ -0,0 +1,1034 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import collections +import contextlib +import copy +import functools +import logging +import mock +import numpy as np +import os +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +import caffe2.python.utils as putils +import torch +import torch.nn.functional as F +from caffe2.proto import caffe2_pb2 +from caffe2.python import core, net_drawer, workspace +from torch.nn.functional import interpolate as interp + +logger = logging.getLogger(__name__) + + +# ==== torch/utils_toffee/cast.py ======================================= + + +def to_device(t, device_str): + """ + This function is a replacement of .to(another_device) such that it allows the + casting to be traced properly by explicitly calling the underlying copy ops. + It also avoids introducing unncessary op when casting to the same device. + """ + src = t.device + dst = torch.device(device_str) + + if src == dst: + return t + elif src.type == "cuda" and dst.type == "cpu": + return torch.ops._caffe2.CopyGPUToCPU(t) + elif src.type == "cpu" and dst.type == "cuda": + return torch.ops._caffe2.CopyCPUToGPU(t) + else: + raise RuntimeError("Can't cast tensor from device {} to device {}".format(src, dst)) + + +# ==== torch/utils_toffee/interpolate.py ======================================= + + +# Note: borrowed from vision/detection/fair/detectron/detectron/modeling/detector.py +def BilinearInterpolation(tensor_in, up_scale): + assert up_scale % 2 == 0, "Scale should be even" + + def upsample_filt(size): + factor = (size + 1) // 2 + if size % 2 == 1: + center = factor - 1 + else: + center = factor - 0.5 + + og = np.ogrid[:size, :size] + return (1 - abs(og[0] - center) / factor) * (1 - abs(og[1] - center) / factor) + + kernel_size = int(up_scale) * 2 + bil_filt = upsample_filt(kernel_size) + + dim = int(tensor_in.shape[1]) + kernel = np.zeros((dim, dim, kernel_size, kernel_size), dtype=np.float32) + kernel[range(dim), range(dim), :, :] = bil_filt + + tensor_out = F.conv_transpose2d( + tensor_in, + weight=to_device(torch.Tensor(kernel), tensor_in.device), + bias=None, + stride=int(up_scale), + padding=int(up_scale / 2), + ) + + return tensor_out + + +# NOTE: ONNX is incompatible with traced torch.nn.functional.interpolate if +# using dynamic `scale_factor` rather than static `size`. (T43166860) +# NOTE: Caffe2 Int8 conversion might not be able to quantize `size` properly. 
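+# Drop-in replacement used during ONNX export (see mock_torch_nn_functional_interpolate below):
+# dynamic scale_factor upsampling is mapped to Caffe2's ResizeNearest (nearest mode) or a
+# conv_transpose2d-based bilinear upsample; all other cases fall back to F.interpolate.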
+def onnx_compatibale_interpolate( + input, size=None, scale_factor=None, mode="nearest", align_corners=None +): + # NOTE: The input dimensions are interpreted in the form: + # `mini-batch x channels x [optional depth] x [optional height] x width`. + if size is None and scale_factor is not None: + if input.dim() == 4: + if isinstance(scale_factor, (int, float)): + height_scale, width_scale = (scale_factor, scale_factor) + else: + assert isinstance(scale_factor, (tuple, list)) + assert len(scale_factor) == 2 + height_scale, width_scale = scale_factor + + assert not align_corners, "No matching C2 op for align_corners == True" + if mode == "nearest": + return torch.ops._caffe2.ResizeNearest( + input, order="NCHW", width_scale=width_scale, height_scale=height_scale + ) + elif mode == "bilinear": + logger.warning( + "Use F.conv_transpose2d for bilinear interpolate" + " because there's no such C2 op, this may cause significant" + " slowdown and the boundary pixels won't be as same as" + " using F.interpolate due to padding." + ) + assert height_scale == width_scale + return BilinearInterpolation(input, up_scale=height_scale) + logger.warning("Output size is not static, it might cause ONNX conversion issue") + + return interp(input, size, scale_factor, mode, align_corners) + + +@contextlib.contextmanager +def mock_torch_nn_functional_interpolate(): + if torch.onnx.is_in_onnx_export(): + with mock.patch( + "torch.nn.functional.interpolate", side_effect=onnx_compatibale_interpolate + ): + yield + else: + yield + + +# ==== torch/utils_caffe2/ws_utils.py ========================================== + + +class ScopedWS(object): + def __init__(self, ws_name, is_reset, is_cleanup=False): + self.ws_name = ws_name + self.is_reset = is_reset + self.is_cleanup = is_cleanup + self.org_ws = "" + + def __enter__(self): + self.org_ws = workspace.CurrentWorkspace() + if self.ws_name is not None: + workspace.SwitchWorkspace(self.ws_name, True) + if self.is_reset: + workspace.ResetWorkspace() + + return workspace + + def __exit__(self, *args): + if self.is_cleanup: + workspace.ResetWorkspace() + if self.ws_name is not None: + workspace.SwitchWorkspace(self.org_ws) + + +def fetch_any_blob(name): + bb = None + try: + bb = workspace.FetchBlob(name) + except TypeError: + bb = workspace.FetchInt8Blob(name) + except Exception as e: + logger.error("Get blob {} error: {}".format(name, e)) + + return bb + + +# ==== torch/utils_caffe2/protobuf.py ========================================== + + +def get_pb_arg(pb, arg_name): + for x in pb.arg: + if x.name == arg_name: + return x + return None + + +def get_pb_arg_valf(pb, arg_name, default_val): + arg = get_pb_arg(pb, arg_name) + return arg.f if arg is not None else default_val + + +def get_pb_arg_floats(pb, arg_name, default_val): + arg = get_pb_arg(pb, arg_name) + return list(map(float, arg.floats)) if arg is not None else default_val + + +def get_pb_arg_ints(pb, arg_name, default_val): + arg = get_pb_arg(pb, arg_name) + return list(map(int, arg.ints)) if arg is not None else default_val + + +def get_pb_arg_vali(pb, arg_name, default_val): + arg = get_pb_arg(pb, arg_name) + return arg.i if arg is not None else default_val + + +def get_pb_arg_vals(pb, arg_name, default_val): + arg = get_pb_arg(pb, arg_name) + return arg.s if arg is not None else default_val + + +def get_pb_arg_valstrings(pb, arg_name, default_val): + arg = get_pb_arg(pb, arg_name) + return list(arg.strings) if arg is not None else default_val + + +def check_set_pb_arg(pb, arg_name, arg_attr, arg_value, 
allow_override=False): + arg = get_pb_arg(pb, arg_name) + if arg is None: + arg = putils.MakeArgument(arg_name, arg_value) + assert hasattr(arg, arg_attr) + pb.arg.extend([arg]) + if allow_override and getattr(arg, arg_attr) != arg_value: + logger.warning( + "Override argument {}: {} -> {}".format(arg_name, getattr(arg, arg_attr), arg_value) + ) + setattr(arg, arg_attr, arg_value) + else: + assert arg is not None + assert getattr(arg, arg_attr) == arg_value, "Existing value {}, new value {}".format( + getattr(arg, arg_attr), arg_value + ) + + +def _create_const_fill_op_from_numpy(name, tensor, device_option=None): + assert type(tensor) == np.ndarray + kTypeNameMapper = { + np.dtype("float32"): "GivenTensorFill", + np.dtype("int32"): "GivenTensorIntFill", + np.dtype("int64"): "GivenTensorInt64Fill", + np.dtype("uint8"): "GivenTensorStringFill", + } + + args_dict = {} + if tensor.dtype == np.dtype("uint8"): + args_dict.update({"values": [str(tensor.data)], "shape": [1]}) + else: + args_dict.update({"values": tensor, "shape": tensor.shape}) + + if device_option is not None: + args_dict["device_option"] = device_option + + return core.CreateOperator(kTypeNameMapper[tensor.dtype], [], [name], **args_dict) + + +def _create_const_fill_op_from_c2_int8_tensor(name, int8_tensor): + assert type(int8_tensor) == workspace.Int8Tensor + kTypeNameMapper = { + np.dtype("int32"): "Int8GivenIntTensorFill", + np.dtype("uint8"): "Int8GivenTensorFill", + } + + tensor = int8_tensor.data + assert tensor.dtype in [np.dtype("uint8"), np.dtype("int32")] + values = tensor.tobytes() if tensor.dtype == np.dtype("uint8") else tensor + + return core.CreateOperator( + kTypeNameMapper[tensor.dtype], + [], + [name], + values=values, + shape=tensor.shape, + Y_scale=int8_tensor.scale, + Y_zero_point=int8_tensor.zero_point, + ) + + +def create_const_fill_op( + name: str, + blob: Union[np.ndarray, workspace.Int8Tensor], + device_option: Optional[caffe2_pb2.DeviceOption] = None, +) -> caffe2_pb2.OperatorDef: + """ + Given a blob object, return the Caffe2 operator that creates this blob + as constant. Currently support NumPy tensor and Caffe2 Int8Tensor. + """ + + tensor_type = type(blob) + assert tensor_type in [ + np.ndarray, + workspace.Int8Tensor, + ], 'Error when creating const fill op for "{}", unsupported blob type: {}'.format( + name, type(blob) + ) + + if tensor_type == np.ndarray: + return _create_const_fill_op_from_numpy(name, blob, device_option) + elif tensor_type == workspace.Int8Tensor: + assert device_option is None + return _create_const_fill_op_from_c2_int8_tensor(name, blob) + + +def construct_init_net_from_params( + params: Dict[str, Any], device_options: Optional[Dict[str, caffe2_pb2.DeviceOption]] = None +) -> caffe2_pb2.NetDef: + """ + Construct the init_net from params dictionary + """ + init_net = caffe2_pb2.NetDef() + device_options = device_options or {} + for name, blob in params.items(): + if isinstance(blob, str): + logger.warning( + ( + "Blob {} with type {} is not supported in generating init net," + " skipped.".format(name, type(blob)) + ) + ) + continue + init_net.op.extend( + [create_const_fill_op(name, blob, device_option=device_options.get(name, None))] + ) + init_net.external_output.append(name) + return init_net + + +def get_producer_map(ssa): + """ + Return dict from versioned blob to (i, j), + where i is index of producer op, j is the index of output of that op. 
+ """ + producer_map = {} + for i in range(len(ssa)): + outputs = ssa[i][1] + for j, outp in enumerate(outputs): + producer_map[outp] = (i, j) + return producer_map + + +def get_consumer_map(ssa): + """ + Return dict from versioned blob to list of (i, j), + where i is index of consumer op, j is the index of input of that op. + """ + consumer_map = collections.defaultdict(list) + for i in range(len(ssa)): + inputs = ssa[i][0] + for j, inp in enumerate(inputs): + consumer_map[inp].append((i, j)) + return consumer_map + + +def get_params_from_init_net( + init_net: caffe2_pb2.NetDef, +) -> [Dict[str, Any], Dict[str, caffe2_pb2.DeviceOption]]: + """ + Take the output blobs from init_net by running it. + Outputs: + params: dict from blob name to numpy array + device_options: dict from blob name to the device option of its creating op + """ + # NOTE: this assumes that the params is determined by producer op with the + # only exception be CopyGPUToCPU which is CUDA op but returns CPU tensor. + def _get_device_option(producer_op): + if producer_op.type == "CopyGPUToCPU": + return caffe2_pb2.DeviceOption() + else: + return producer_op.device_option + + with ScopedWS("__get_params_from_init_net__", is_reset=True, is_cleanup=True) as ws: + ws.RunNetOnce(init_net) + params = {b: fetch_any_blob(b) for b in init_net.external_output} + ssa, versions = core.get_ssa(init_net) + producer_map = get_producer_map(ssa) + device_options = { + b: _get_device_option(init_net.op[producer_map[(b, versions[b])][0]]) + for b in init_net.external_output + } + return params, device_options + + +def _updater_raise(op, input_types, output_types): + raise RuntimeError( + "Failed to apply updater for op {} given input_types {} and" + " output_types {}".format(op, input_types, output_types) + ) + + +def _generic_status_identifier( + predict_net: caffe2_pb2.NetDef, + status_updater: Callable, + known_status: Dict[Tuple[str, int], Any], +) -> Dict[Tuple[str, int], Any]: + """ + Statically infer the status of each blob, the status can be such as device type + (CPU/GPU), layout (NCHW/NHWC), data type (float32/int8), etc. "Blob" here + is versioned blob (Tuple[str, int]) in the format compatible with ssa. + Inputs: + predict_net: the caffe2 network + status_updater: a callable, given an op and the status of its input/output, + it returns the updated status of input/output. `None` is used for + representing unknown status. + known_status: a dict containing known status, used as initialization. 
+ Outputs: + A dict mapping from versioned blob to its status + """ + ssa, versions = core.get_ssa(predict_net) + versioned_ext_input = [(b, 0) for b in predict_net.external_input] + versioned_ext_output = [(b, versions[b]) for b in predict_net.external_output] + all_versioned_blobs = set().union(*[set(x[0] + x[1]) for x in ssa]) + + allowed_vbs = all_versioned_blobs.union(versioned_ext_input).union(versioned_ext_output) + assert all(k in allowed_vbs for k in known_status) + assert all(v is not None for v in known_status.values()) + _known_status = copy.deepcopy(known_status) + + def _check_and_update(key, value): + assert value is not None + if key in _known_status: + if not _known_status[key] == value: + raise RuntimeError( + "Confilict status for {}, existing status {}, new status {}".format( + key, _known_status[key], value + ) + ) + _known_status[key] = value + + def _update_i(op, ssa_i): + versioned_inputs = ssa_i[0] + versioned_outputs = ssa_i[1] + + inputs_status = [_known_status.get(b, None) for b in versioned_inputs] + outputs_status = [_known_status.get(b, None) for b in versioned_outputs] + + new_inputs_status, new_outputs_status = status_updater(op, inputs_status, outputs_status) + + for versioned_blob, status in zip( + versioned_inputs + versioned_outputs, new_inputs_status + new_outputs_status + ): + if status is not None: + _check_and_update(versioned_blob, status) + + for op, ssa_i in zip(predict_net.op, ssa): + _update_i(op, ssa_i) + for op, ssa_i in zip(reversed(predict_net.op), reversed(ssa)): + _update_i(op, ssa_i) + + # NOTE: This strictly checks all the blob from predict_net must be assgined + # a known status. However sometimes it's impossible (eg. having deadend op), + # we may relax this constraint if + for k in all_versioned_blobs: + if k not in _known_status: + raise NotImplementedError( + "Can not infer the status for {}. 
Currently only support the case where" + " a single forward and backward pass can identify status for all blobs.".format(k) + ) + + return _known_status + + +def infer_device_type( + predict_net: caffe2_pb2.NetDef, + known_status: Dict[Tuple[str, int], Any], + device_name_style: str = "caffe2", +) -> Dict[Tuple[str, int], str]: + """ Return the device type ("cpu" or "gpu"/"cuda") of each (versioned) blob """ + + assert device_name_style in ["caffe2", "pytorch"] + _CPU_STR = "cpu" + _GPU_STR = "gpu" if device_name_style == "caffe2" else "cuda" + + def _copy_cpu_to_gpu_updater(op, input_types, output_types): + if input_types[0] == _GPU_STR or output_types[0] == _CPU_STR: + _updater_raise(op, input_types, output_types) + return ([_CPU_STR], [_GPU_STR]) + + def _copy_gpu_to_cpu_updater(op, input_types, output_types): + if input_types[0] == _CPU_STR or output_types[0] == _GPU_STR: + _updater_raise(op, input_types, output_types) + return ([_GPU_STR], [_CPU_STR]) + + def _other_ops_updater(op, input_types, output_types): + non_none_types = [x for x in input_types + output_types if x is not None] + if len(non_none_types) > 0: + the_type = non_none_types[0] + if not all(x == the_type for x in non_none_types): + _updater_raise(op, input_types, output_types) + else: + the_type = None + return ([the_type for _ in op.input], [the_type for _ in op.output]) + + def _device_updater(op, *args, **kwargs): + return { + "CopyCPUToGPU": _copy_cpu_to_gpu_updater, + "CopyGPUToCPU": _copy_gpu_to_cpu_updater, + }.get(op.type, _other_ops_updater)(op, *args, **kwargs) + + return _generic_status_identifier(predict_net, _device_updater, known_status) + + +# ==== torch/utils_caffe2/vis.py =============================================== + + +def _modify_blob_names(ops, blob_rename_f): + ret = [] + + def _replace_list(blob_list, replaced_list): + del blob_list[:] + blob_list.extend(replaced_list) + + for x in ops: + cur = copy.deepcopy(x) + _replace_list(cur.input, list(map(blob_rename_f, cur.input))) + _replace_list(cur.output, list(map(blob_rename_f, cur.output))) + ret.append(cur) + + return ret + + +def _rename_blob(name, blob_sizes, blob_ranges): + def _list_to_str(bsize): + ret = ", ".join([str(x) for x in bsize]) + ret = "[" + ret + "]" + return ret + + ret = name + if blob_sizes is not None and name in blob_sizes: + ret += "\n" + _list_to_str(blob_sizes[name]) + if blob_ranges is not None and name in blob_ranges: + ret += "\n" + _list_to_str(blob_ranges[name]) + + return ret + + +# graph_name could not contain word 'graph' +def save_graph(net, file_name, graph_name="net", op_only=True, blob_sizes=None, blob_ranges=None): + blob_rename_f = functools.partial(_rename_blob, blob_sizes=blob_sizes, blob_ranges=blob_ranges) + return save_graph_base(net, file_name, graph_name, op_only, blob_rename_f) + + +def save_graph_base(net, file_name, graph_name="net", op_only=True, blob_rename_func=None): + graph = None + ops = net.op + if blob_rename_func is not None: + ops = _modify_blob_names(ops, blob_rename_func) + if not op_only: + graph = net_drawer.GetPydotGraph(ops, graph_name, rankdir="TB") + else: + graph = net_drawer.GetPydotGraphMinimal( + ops, graph_name, rankdir="TB", minimal_dependency=True + ) + + try: + par_dir = os.path.dirname(file_name) + if not os.path.exists(par_dir): + os.makedirs(par_dir) + + format = os.path.splitext(os.path.basename(file_name))[-1] + if format == ".png": + graph.write_png(file_name) + elif format == ".pdf": + graph.write_pdf(file_name) + elif format == ".svg": + 
graph.write_svg(file_name) + else: + print("Incorrect format {}".format(format)) + except Exception as e: + print("Error when writing graph to image {}".format(e)) + + return graph + + +# ==== torch/utils_toffee/aten_to_caffe2.py ==================================== + + +def group_norm_replace_aten_with_caffe2(predict_net: caffe2_pb2.NetDef): + """ + For ONNX exported model, GroupNorm will be represented as ATen op, + this can be a drop in replacement from ATen to GroupNorm + """ + count = 0 + for op in predict_net.op: + if op.type == "ATen": + op_name = get_pb_arg_vals(op, "operator", None) # return byte in py3 + if op_name and op_name.decode() == "group_norm": + op.arg.remove(get_pb_arg(op, "operator")) + + if get_pb_arg_vali(op, "cudnn_enabled", None): + op.arg.remove(get_pb_arg(op, "cudnn_enabled")) + + num_groups = get_pb_arg_vali(op, "num_groups", None) + if num_groups is not None: + op.arg.remove(get_pb_arg(op, "num_groups")) + check_set_pb_arg(op, "group", "i", num_groups) + + op.type = "GroupNorm" + count += 1 + if count > 1: + logger.info("Replaced {} ATen operator to GroupNormOp".format(count)) + + +# ==== torch/utils_toffee/alias.py ============================================= + + +def alias(x, name, is_backward=False): + if not torch.onnx.is_in_onnx_export(): + return x + assert isinstance(x, torch.Tensor) + return torch.ops._caffe2.AliasWithName(x, name, is_backward=is_backward) + + +def fuse_alias_placeholder(predict_net, init_net): + """ Remove AliasWithName placeholder and rename the input/output of it """ + # First we finish all the re-naming + for i, op in enumerate(predict_net.op): + if op.type == "AliasWithName": + assert len(op.input) == 1 + assert len(op.output) == 1 + name = get_pb_arg_vals(op, "name", None).decode() + is_backward = bool(get_pb_arg_vali(op, "is_backward", 0)) + rename_op_input(predict_net, init_net, i, 0, name, from_producer=is_backward) + rename_op_output(predict_net, i, 0, name) + + # Remove AliasWithName, should be very safe since it's a non-op + new_ops = [] + for op in predict_net.op: + if op.type != "AliasWithName": + new_ops.append(op) + else: + # safety check + assert op.input == op.output + assert op.input[0] == op.arg[0].s.decode() + del predict_net.op[:] + predict_net.op.extend(new_ops) + + +# ==== torch/utils_caffe2/graph_transform.py =================================== + + +class IllegalGraphTransformError(ValueError): + """ When a graph transform function call can't be executed. 
""" + + +def _rename_versioned_blob_in_proto( + proto: caffe2_pb2.NetDef, + old_name: str, + new_name: str, + version: int, + ssa: List[Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]], + start_versions: Dict[str, int], + end_versions: Dict[str, int], +): + """ In given proto, rename all blobs with matched version """ + # Operater list + for op, i_th_ssa in zip(proto.op, ssa): + versioned_inputs, versioned_outputs = i_th_ssa + for i in range(len(op.input)): + if versioned_inputs[i] == (old_name, version): + op.input[i] = new_name + for i in range(len(op.output)): + if versioned_outputs[i] == (old_name, version): + op.output[i] = new_name + # external_input + if start_versions.get(old_name, 0) == version: + for i in range(len(proto.external_input)): + if proto.external_input[i] == old_name: + proto.external_input[i] = new_name + # external_output + if end_versions.get(old_name, 0) == version: + for i in range(len(proto.external_output)): + if proto.external_output[i] == old_name: + proto.external_output[i] = new_name + + +def rename_op_input( + predict_net: caffe2_pb2.NetDef, + init_net: caffe2_pb2.NetDef, + op_id: int, + input_id: int, + new_name: str, + from_producer: bool = False, +): + """ + Rename the op_id-th operator in predict_net, change it's input_id-th input's + name to the new_name. It also does automatic re-route and change + external_input and init_net if necessary. + - It requires the input is only consumed by this op. + - This function modifies predict_net and init_net in-place. + - When from_producer is enable, this also updates other operators that consumes + the same input. Be cautious because may trigger unintended behavior. + """ + assert isinstance(predict_net, caffe2_pb2.NetDef) + assert isinstance(init_net, caffe2_pb2.NetDef) + + init_net_ssa, init_net_versions = core.get_ssa(init_net) + predict_net_ssa, predict_net_versions = core.get_ssa( + predict_net, copy.deepcopy(init_net_versions) + ) + + versioned_inputs, versioned_outputs = predict_net_ssa[op_id] + old_name, version = versioned_inputs[input_id] + + if from_producer: + producer_map = get_producer_map(predict_net_ssa) + if not (old_name, version) in producer_map: + raise NotImplementedError( + "Can't find producer, the input {} is probably from" + " init_net, this is not supported yet.".format(old_name) + ) + producer = producer_map[(old_name, version)] + rename_op_output(predict_net, producer[0], producer[1], new_name) + return + + def contain_targets(op_ssa): + return (old_name, version) in op_ssa[0] + + is_consumer = [contain_targets(op_ssa) for op_ssa in predict_net_ssa] + if sum(is_consumer) > 1: + raise IllegalGraphTransformError( + ( + "Input '{}' of operator(#{}) are consumed by other ops, please use" + + " rename_op_output on the producer instead. Offending op: \n{}" + ).format(old_name, op_id, predict_net.op[op_id]) + ) + + # update init_net + _rename_versioned_blob_in_proto( + init_net, old_name, new_name, version, init_net_ssa, {}, init_net_versions + ) + # update predict_net + _rename_versioned_blob_in_proto( + predict_net, + old_name, + new_name, + version, + predict_net_ssa, + init_net_versions, + predict_net_versions, + ) + + +def rename_op_output(predict_net: caffe2_pb2.NetDef, op_id: int, output_id: int, new_name: str): + """ + Rename the op_id-th operator in predict_net, change it's output_id-th input's + name to the new_name. It also does automatic re-route and change + external_output and if necessary. + - It allows multiple consumers of its output. 
+ - This function modifies predict_net in-place, doesn't need init_net. + """ + assert isinstance(predict_net, caffe2_pb2.NetDef) + + ssa, blob_versions = core.get_ssa(predict_net) + + versioned_inputs, versioned_outputs = ssa[op_id] + old_name, version = versioned_outputs[output_id] + + # update predict_net + _rename_versioned_blob_in_proto( + predict_net, old_name, new_name, version, ssa, {}, blob_versions + ) + + +def get_sub_graph_external_input_output( + predict_net: caffe2_pb2.NetDef, sub_graph_op_indices: List[int] +) -> Tuple[List[Tuple[str, int]], List[Tuple[str, int]]]: + """ + Return the list of external input/output of sub-graph, + each element is tuple of the name and corresponding version in predict_net. + + external input/output is defined the same way as caffe2 NetDef. + """ + ssa, versions = core.get_ssa(predict_net) + + all_inputs = [] + all_outputs = [] + for op_id in sub_graph_op_indices: + all_inputs += [inp for inp in ssa[op_id][0] if inp not in all_inputs] + all_outputs += list(ssa[op_id][1]) # ssa output won't repeat + + # for versioned blobs, external inputs are just those blob in all_inputs + # but not in all_outputs + ext_inputs = [inp for inp in all_inputs if inp not in all_outputs] + + # external outputs are essentially outputs of this subgraph that are used + # outside of this sub-graph (including predict_net.external_output) + all_other_inputs = sum( + (ssa[i][0] for i in range(len(ssa)) if i not in sub_graph_op_indices), + [(outp, versions[outp]) for outp in predict_net.external_output], + ) + ext_outputs = [outp for outp in all_outputs if outp in set(all_other_inputs)] + + return ext_inputs, ext_outputs + + +class DiGraph: + """ A DAG representation of caffe2 graph, each vertice is a versioned blob. """ + + def __init__(self): + self.vertices = set() + self.graph = collections.defaultdict(list) + + def add_edge(self, u, v): + self.graph[u].append(v) + self.vertices.add(u) + self.vertices.add(v) + + # grab from https://www.geeksforgeeks.org/find-paths-given-source-destination/ + def get_all_paths(self, s, d): + visited = {k: False for k in self.vertices} + path = [] + all_paths = [] + + def _get_all_paths_util(graph, u, d, visited, path): + visited[u] = True + path.append(u) + if u == d: + all_paths.append(copy.deepcopy(path)) + else: + for i in graph[u]: + if not visited[i]: + _get_all_paths_util(graph, i, d, visited, path) + path.pop() + visited[u] = False + + _get_all_paths_util(self.graph, s, d, visited, path) + return all_paths + + @staticmethod + def from_ssa(ssa): + graph = DiGraph() + for op_id in range(len(ssa)): + for inp in ssa[op_id][0]: + for outp in ssa[op_id][1]: + graph.add_edge(inp, outp) + return graph + + +def _get_dependency_chain(ssa, versioned_target, versioned_source): + """ + Return the index list of relevant operator to produce target blob from source blob, + if there's no dependency, return empty list. + """ + + # finding all paths between nodes can be O(N!), thus we can only search + # in the subgraph using the op starting from the first consumer of source blob + # to the producer of the target blob. 
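+    # For a small illustration: with ops [A: x->y, B: y->z, C: u->v], the chain
+    # from x to z is [A, B]; C contributes nothing since it lies on no x-to-z path.
+    # The -15/+15 slack used below presumably just widens that op window slightly;
+    # it is a heuristic rather than a guarantee.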
+    consumer_map = get_consumer_map(ssa)
+    producer_map = get_producer_map(ssa)
+    start_op = min(x[0] for x in consumer_map[versioned_source]) - 15
+    end_op = (
+        producer_map[versioned_target][0] + 15 if versioned_target in producer_map else start_op
+    )
+    sub_graph_ssa = ssa[start_op : end_op + 1]
+    if len(sub_graph_ssa) > 30:
+        logger.warning(
+            "Subgraph between {} and {} is large (from op#{} to op#{}), it"
+            " might take non-trivial time to find all paths between them.".format(
+                versioned_source, versioned_target, start_op, end_op
+            )
+        )
+
+    dag = DiGraph.from_ssa(sub_graph_ssa)
+    paths = dag.get_all_paths(versioned_source, versioned_target)  # include two ends
+    ops_in_paths = [[producer_map[blob][0] for blob in path[1:]] for path in paths]
+    return sorted(set().union(*[set(ops) for ops in ops_in_paths]))
+
+
+def identify_reshape_sub_graph(predict_net: caffe2_pb2.NetDef) -> List[List[int]]:
+    """
+    Identify the reshape sub-graph in a protobuf.
+    The reshape sub-graph is defined as matching the following pattern:
+
+    (input_blob) -> Op_1 -> ... -> Op_N -> (new_shape) -─┐
+        └-------------------------------------------> Reshape -> (output_blob)
+
+    Return:
+        List of sub-graphs, each sub-graph is represented as a list of indices
+        of the relevant ops, [Op_1, Op_2, ..., Op_N, Reshape]
+    """
+
+    ssa, _ = core.get_ssa(predict_net)
+
+    ret = []
+    for i, op in enumerate(predict_net.op):
+        if op.type == "Reshape":
+            assert len(op.input) == 2
+            input_ssa = ssa[i][0]
+            data_source = input_ssa[0]
+            shape_source = input_ssa[1]
+            op_indices = _get_dependency_chain(ssa, shape_source, data_source)
+            ret.append(op_indices + [i])
+    return ret
+
+
+def remove_reshape_for_fc(predict_net, params):
+    """
+    In PyTorch, nn.Linear has to take a 2D tensor, which often leads to reshaping
+    a 4D tensor to 2D by calling .view(). However, this (dynamic) reshaping
+    doesn't work well with ONNX and Int8 tools, and causes extra ops
+    (e.g. ExpandDims) that might not be available on mobile.
+    Luckily Caffe2 supports 4D tensors for FC, so we can remove those reshapes
+    after exporting the ONNX model.
+    """
+    from caffe2.python import core
+
+    # find all reshape sub-graphs that can be removed, which for now means all Reshape
+    # sub-graphs whose output is only consumed by FC.
+    # TODO: to make it safer, we may need the actual value to better determine
+    # if a Reshape before FC is removable.
+    reshape_sub_graphs = identify_reshape_sub_graph(predict_net)
+    sub_graphs_to_remove = []
+    for reshape_sub_graph in reshape_sub_graphs:
+        reshape_op_id = reshape_sub_graph[-1]
+        assert predict_net.op[reshape_op_id].type == "Reshape"
+        ssa, _ = core.get_ssa(predict_net)
+        reshape_output = ssa[reshape_op_id][1][0]
+        consumers = [i for i in range(len(ssa)) if reshape_output in ssa[i][0]]
+        if all(predict_net.op[consumer].type == "FC" for consumer in consumers):
+            # safety check that the sub-graph is isolated; for this reshape sub-graph,
+            # it means having one non-param external input and one external output.
+            ext_inputs, ext_outputs = get_sub_graph_external_input_output(
+                predict_net, reshape_sub_graph
+            )
+            non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0]
+            if len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1:
+                sub_graphs_to_remove.append(reshape_sub_graph)
+
+    # perform the sub-graph removal by:
+    # 1: renaming the Reshape's output to its input, so the graph can be
+    #   seen as an in-place identity, i.e. its external input/output are the same.
+    # 2: simply removing those ops.
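+    # For illustration (hypothetical blob names): a pattern like
+    #   (x) -> shape-computing ops -> (new_shape), (x, new_shape) -> Reshape -> (y) -> FC
+    # collapses to (x) -> FC, since Caffe2's FC can consume the 4D blob directly.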
+ remove_op_ids = [] + params_to_remove = [] + for sub_graph in sub_graphs_to_remove: + logger.info( + "Remove Reshape sub-graph:\n{}".format( + "".join(["(#{:>4})\n{}".format(i, predict_net.op[i]) for i in sub_graph]) + ) + ) + reshape_op_id = sub_graph[-1] + new_reshap_output = predict_net.op[reshape_op_id].input[0] + rename_op_output(predict_net, reshape_op_id, 0, new_reshap_output) + ext_inputs, ext_outputs = get_sub_graph_external_input_output(predict_net, sub_graph) + non_params_ext_inputs = [inp for inp in ext_inputs if inp[1] != 0] + params_ext_inputs = [inp for inp in ext_inputs if inp[1] == 0] + assert len(non_params_ext_inputs) == 1 and len(ext_outputs) == 1 + assert ext_outputs[0][0] == non_params_ext_inputs[0][0] + assert ext_outputs[0][1] == non_params_ext_inputs[0][1] + 1 + remove_op_ids.extend(sub_graph) + params_to_remove.extend(params_ext_inputs) + + predict_net = copy.deepcopy(predict_net) + new_ops = [op for i, op in enumerate(predict_net.op) if i not in remove_op_ids] + del predict_net.op[:] + predict_net.op.extend(new_ops) + for versioned_params in params_to_remove: + name = versioned_params[0] + logger.info("Remove params: {} from init_net and predict_net.external_input".format(name)) + del params[name] + predict_net.external_input.remove(name) + + return predict_net, params + + +def fuse_copy_between_cpu_and_gpu(predict_net: caffe2_pb2.NetDef): + """ + In-place fuse extra copy ops between cpu/gpu for the following case: + a -CopyAToB-> b -CopyBToA> c1 -NextOp1-> d1 + -CopyBToA> c2 -NextOp2-> d2 + The fused network will look like: + a -NextOp1-> d1 + -NextOp2-> d2 + """ + + _COPY_OPS = ["CopyCPUToGPU", "CopyGPUToCPU"] + + def _fuse_once(predict_net): + ssa, blob_versions = core.get_ssa(predict_net) + consumer_map = get_consumer_map(ssa) + versioned_external_output = [ + (name, blob_versions[name]) for name in predict_net.external_output + ] + + for op_id, op in enumerate(predict_net.op): + if op.type in _COPY_OPS: + fw_copy_versioned_output = ssa[op_id][1][0] + consumer_ids = [x[0] for x in consumer_map[fw_copy_versioned_output]] + reverse_op_type = _COPY_OPS[1 - _COPY_OPS.index(op.type)] + + is_fusable = ( + len(consumer_ids) > 0 + and fw_copy_versioned_output not in versioned_external_output + and all( + predict_net.op[_op_id].type == reverse_op_type + and ssa[_op_id][1][0] not in versioned_external_output + for _op_id in consumer_ids + ) + ) + + if is_fusable: + for rv_copy_op_id in consumer_ids: + # making each NextOp uses "a" directly and removing Copy ops + rs_copy_versioned_output = ssa[rv_copy_op_id][1][0] + next_op_id, inp_id = consumer_map[rs_copy_versioned_output][0] + predict_net.op[next_op_id].input[inp_id] = op.input[0] + # remove CopyOps + new_ops = [ + op + for i, op in enumerate(predict_net.op) + if i != op_id and i not in consumer_ids + ] + del predict_net.op[:] + predict_net.op.extend(new_ops) + return True + + return False + + # _fuse_once returns False is nothing can be fused + while _fuse_once(predict_net): + pass + + +def remove_dead_end_ops(net_def: caffe2_pb2.NetDef): + """ remove ops if its output is not used or not in external_output """ + ssa, versions = core.get_ssa(net_def) + versioned_external_output = [(name, versions[name]) for name in net_def.external_output] + consumer_map = get_consumer_map(ssa) + removed_op_ids = set() + + def _is_dead_end(versioned_blob): + return not ( + versioned_blob in versioned_external_output + or ( + len(consumer_map[versioned_blob]) > 0 + and all(x[0] not in removed_op_ids for x in 
consumer_map[versioned_blob]) + ) + ) + + for i, ssa_i in reversed(list(enumerate(ssa))): + versioned_outputs = ssa_i[1] + if all(_is_dead_end(outp) for outp in versioned_outputs): + removed_op_ids.add(i) + + # simply removing those deadend ops should have no effect to external_output + new_ops = [op for i, op in enumerate(net_def.op) if i not in removed_op_ids] + del net_def.op[:] + net_def.op.extend(new_ops) diff --git a/detectron2/layers/__init__.py b/detectron2/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2753739a03659dff5bc5b87f8c8417056d319842 --- /dev/null +++ b/detectron2/layers/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .batch_norm import FrozenBatchNorm2d, get_norm, NaiveSyncBatchNorm +from .deform_conv import DeformConv, ModulatedDeformConv +from .mask_ops import paste_masks_in_image +from .nms import batched_nms, batched_nms_rotated, nms, nms_rotated +from .roi_align import ROIAlign, roi_align +from .roi_align_rotated import ROIAlignRotated, roi_align_rotated +from .shape_spec import ShapeSpec +from .wrappers import BatchNorm2d, Conv2d, ConvTranspose2d, cat, interpolate, Linear +from .blocks import CNNBlockBase + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/detectron2/layers/batch_norm.py b/detectron2/layers/batch_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..1339c6eaedfbc65c9604043234b738382d07fd40 --- /dev/null +++ b/detectron2/layers/batch_norm.py @@ -0,0 +1,242 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import torch +import torch.distributed as dist +from torch import nn +from torch.autograd.function import Function +from torch.nn import functional as F + +from detectron2.utils import comm + +from .wrappers import BatchNorm2d + +TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2]) + + +class FrozenBatchNorm2d(nn.Module): + """ + BatchNorm2d where the batch statistics and the affine parameters are fixed. + + It contains non-trainable buffers called + "weight" and "bias", "running_mean", "running_var", + initialized to perform identity transformation. + + The pre-trained backbone models from Caffe2 only contain "weight" and "bias", + which are computed from the original four parameters of BN. + The affine transform `x * weight + bias` will perform the equivalent + computation of `(x - running_mean) / sqrt(running_var) * weight + bias`. + When loading a backbone model from Caffe2, "running_mean" and "running_var" + will be left unchanged as identity transformation. + + Other pre-trained backbone models may contain all 4 parameters. + + The forward is implemented by `F.batch_norm(..., training=False)`. + """ + + _version = 3 + + def __init__(self, num_features, eps=1e-5): + super().__init__() + self.num_features = num_features + self.eps = eps + self.register_buffer("weight", torch.ones(num_features)) + self.register_buffer("bias", torch.zeros(num_features)) + self.register_buffer("running_mean", torch.zeros(num_features)) + self.register_buffer("running_var", torch.ones(num_features) - eps) + + def forward(self, x): + if x.requires_grad: + # When gradients are needed, F.batch_norm will use extra memory + # because its backward op computes gradients for weight/bias as well. 
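+            # The affine rewrite below is algebraically identical to batch_norm with
+            # fixed statistics:
+            #   (x - running_mean) / sqrt(running_var + eps) * weight + bias
+            #   == x * scale + bias_new, where scale = weight / sqrt(running_var + eps)
+            #   and bias_new = bias - running_mean * scale.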
+ scale = self.weight * (self.running_var + self.eps).rsqrt() + bias = self.bias - self.running_mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + return x * scale + bias + else: + # When gradients are not needed, F.batch_norm is a single fused op + # and provide more optimization opportunities. + return F.batch_norm( + x, + self.running_mean, + self.running_var, + self.weight, + self.bias, + training=False, + eps=self.eps, + ) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + version = local_metadata.get("version", None) + + if version is None or version < 2: + # No running_mean/var in early versions + # This will silent the warnings + if prefix + "running_mean" not in state_dict: + state_dict[prefix + "running_mean"] = torch.zeros_like(self.running_mean) + if prefix + "running_var" not in state_dict: + state_dict[prefix + "running_var"] = torch.ones_like(self.running_var) + + if version is not None and version < 3: + logger = logging.getLogger(__name__) + logger.info("FrozenBatchNorm {} is upgraded to version 3.".format(prefix.rstrip("."))) + # In version < 3, running_var are used without +eps. + state_dict[prefix + "running_var"] -= self.eps + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def __repr__(self): + return "FrozenBatchNorm2d(num_features={}, eps={})".format(self.num_features, self.eps) + + @classmethod + def convert_frozen_batchnorm(cls, module): + """ + Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm. + + Args: + module (torch.nn.Module): + + Returns: + If module is BatchNorm/SyncBatchNorm, returns a new module. + Otherwise, in-place convert module and return it. + + Similar to convert_sync_batchnorm in + https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py + """ + bn_module = nn.modules.batchnorm + bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm) + res = module + if isinstance(module, bn_module): + res = cls(module.num_features) + if module.affine: + res.weight.data = module.weight.data.clone().detach() + res.bias.data = module.bias.data.clone().detach() + res.running_mean.data = module.running_mean.data + res.running_var.data = module.running_var.data + res.eps = module.eps + else: + for name, child in module.named_children(): + new_child = cls.convert_frozen_batchnorm(child) + if new_child is not child: + res.add_module(name, new_child) + return res + + +def get_norm(norm, out_channels): + """ + Args: + norm (str or callable): either one of BN, SyncBN, FrozenBN, GN; + or a callable that takes a channel number and returns + the normalization layer as a nn.Module. + + Returns: + nn.Module or None: the normalization layer + """ + if isinstance(norm, str): + if len(norm) == 0: + return None + norm = { + "BN": BatchNorm2d, + # Fixed in https://github.com/pytorch/pytorch/pull/36382 + "SyncBN": NaiveSyncBatchNorm if TORCH_VERSION <= (1, 5) else nn.SyncBatchNorm, + "FrozenBN": FrozenBatchNorm2d, + "GN": lambda channels: nn.GroupNorm(32, channels), + # for debugging: + "nnSyncBN": nn.SyncBatchNorm, + "naiveSyncBN": NaiveSyncBatchNorm, + }[norm] + return norm(out_channels) + + +class AllReduce(Function): + @staticmethod + def forward(ctx, input): + input_list = [torch.zeros_like(input) for k in range(dist.get_world_size())] + # Use allgather instead of allreduce since I don't trust in-place operations .. 
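+        # Gathering every worker's tensor and summing the stack below is
+        # mathematically equivalent to all_reduce(SUM), while keeping the
+        # computation out-of-place.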
+ dist.all_gather(input_list, input, async_op=False) + inputs = torch.stack(input_list, dim=0) + return torch.sum(inputs, dim=0) + + @staticmethod + def backward(ctx, grad_output): + dist.all_reduce(grad_output, async_op=False) + return grad_output + + +class NaiveSyncBatchNorm(BatchNorm2d): + """ + In PyTorch<=1.5, `nn.SyncBatchNorm` has incorrect gradient + when the batch size on each worker is different. + (e.g., when scale augmentation is used, or when it is applied to mask head). + + This is a slower but correct alternative to `nn.SyncBatchNorm`. + + Note: + There isn't a single definition of Sync BatchNorm. + + When ``stats_mode==""``, this module computes overall statistics by using + statistics of each worker with equal weight. The result is true statistics + of all samples (as if they are all on one worker) only when all workers + have the same (N, H, W). This mode does not support inputs with zero batch size. + + When ``stats_mode=="N"``, this module computes overall statistics by weighting + the statistics of each worker by their ``N``. The result is true statistics + of all samples (as if they are all on one worker) only when all workers + have the same (H, W). It is slower than ``stats_mode==""``. + + Even though the result of this module may not be the true statistics of all samples, + it may still be reasonable because it might be preferrable to assign equal weights + to all workers, regardless of their (H, W) dimension, instead of putting larger weight + on larger images. From preliminary experiments, little difference is found between such + a simplified implementation and an accurate computation of overall mean & variance. + """ + + def __init__(self, *args, stats_mode="", **kwargs): + super().__init__(*args, **kwargs) + assert stats_mode in ["", "N"] + self._stats_mode = stats_mode + + def forward(self, input): + if comm.get_world_size() == 1 or not self.training: + return super().forward(input) + + B, C = input.shape[0], input.shape[1] + + mean = torch.mean(input, dim=[0, 2, 3]) + meansqr = torch.mean(input * input, dim=[0, 2, 3]) + + if self._stats_mode == "": + assert B > 0, 'SyncBatchNorm(stats_mode="") does not support zero batch size.' 
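+            # Equal-weight averaging across workers: packing mean and meansqr into one
+            # vector lets a single all-reduce average both; the variance is recovered
+            # later as var = E[x^2] - E[x]^2.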
+ vec = torch.cat([mean, meansqr], dim=0) + vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) + mean, meansqr = torch.split(vec, C) + momentum = self.momentum + else: + if B == 0: + vec = torch.zeros([2 * C + 1], device=mean.device, dtype=mean.dtype) + vec = vec + input.sum() # make sure there is gradient w.r.t input + else: + vec = torch.cat( + [mean, meansqr, torch.ones([1], device=mean.device, dtype=mean.dtype)], dim=0 + ) + vec = AllReduce.apply(vec * B) + + total_batch = vec[-1].detach() + momentum = total_batch.clamp(max=1) * self.momentum # no update if total_batch is 0 + total_batch = torch.max(total_batch, torch.ones_like(total_batch)) # avoid div-by-zero + mean, meansqr, _ = torch.split(vec / total_batch, C) + + var = meansqr - mean * mean + invstd = torch.rsqrt(var + self.eps) + scale = self.weight * invstd + bias = self.bias - mean * scale + scale = scale.reshape(1, -1, 1, 1) + bias = bias.reshape(1, -1, 1, 1) + + self.running_mean += momentum * (mean.detach() - self.running_mean) + self.running_var += momentum * (var.detach() - self.running_var) + return input * scale + bias diff --git a/detectron2/layers/blocks.py b/detectron2/layers/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..1d06fec22e472febbc960c49f747acddd2ab7208 --- /dev/null +++ b/detectron2/layers/blocks.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from torch import nn + +from .batch_norm import FrozenBatchNorm2d + + +class CNNBlockBase(nn.Module): + """ + A CNN block is assumed to have input channels, output channels and a stride. + The input and output of `forward()` method must be NCHW tensors. + The method can perform arbitrary computation but must match the given + channels and stride specification. + + Attribute: + in_channels (int): + out_channels (int): + stride (int): + """ + + def __init__(self, in_channels, out_channels, stride): + """ + The `__init__` method of any subclass should also contain these arguments. + + Args: + in_channels (int): + out_channels (int): + stride (int): + """ + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stride = stride + + def freeze(self): + """ + Make this block not trainable. + This method sets all parameters to `requires_grad=False`, + and convert all BatchNorm layers to FrozenBatchNorm + + Returns: + the block itself + """ + for p in self.parameters(): + p.requires_grad = False + FrozenBatchNorm2d.convert_frozen_batchnorm(self) + return self diff --git a/detectron2/layers/csrc/README.md b/detectron2/layers/csrc/README.md new file mode 100644 index 0000000000000000000000000000000000000000..778ed3da0bae89820831bcd8a72ff7b9cad8d4dd --- /dev/null +++ b/detectron2/layers/csrc/README.md @@ -0,0 +1,7 @@ + + +To add a new Op: + +1. Create a new directory +2. Implement new ops there +3. Delcare its Python interface in `vision.cpp`. diff --git a/detectron2/layers/csrc/ROIAlign/ROIAlign.h b/detectron2/layers/csrc/ROIAlign/ROIAlign.h new file mode 100644 index 0000000000000000000000000000000000000000..2d95eac6e29d5e5624afbc6c545776d78ebc709c --- /dev/null +++ b/detectron2/layers/csrc/ROIAlign/ROIAlign.h @@ -0,0 +1,130 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#pragma once +#include + +namespace detectron2 { + +at::Tensor ROIAlign_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + bool aligned); + +at::Tensor ROIAlign_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio, + bool aligned); + +#ifdef WITH_CUDA +at::Tensor ROIAlign_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + bool aligned); + +at::Tensor ROIAlign_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio, + bool aligned); +#endif + +// Interface for Python +inline at::Tensor ROIAlign_forward( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + bool aligned) { + if (input.is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_forward_cuda( + input, + rois, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio, + aligned); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlign_forward_cpu( + input, + rois, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio, + aligned); +} + +inline at::Tensor ROIAlign_backward( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio, + bool aligned) { + if (grad.is_cuda()) { +#ifdef WITH_CUDA + return ROIAlign_backward_cuda( + grad, + rois, + spatial_scale, + pooled_height, + pooled_width, + batch_size, + channels, + height, + width, + sampling_ratio, + aligned); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlign_backward_cpu( + grad, + rois, + spatial_scale, + pooled_height, + pooled_width, + batch_size, + channels, + height, + width, + sampling_ratio, + aligned); +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp b/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..52fc83f8140b29de7b2ad3cb490b8cb672959e16 --- /dev/null +++ b/detectron2/layers/csrc/ROIAlign/ROIAlign_cpu.cpp @@ -0,0 +1,508 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#include +#include "ROIAlign.h" + +namespace { + +// implementation taken from Caffe2 +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T x = xx; + T y = yy; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void ROIAlignForward( + const int nthreads, + const T* input, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* rois, + T* output, + bool aligned) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (aligned) { + AT_ASSERTM( + roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlign cannot have non-negative size!"); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + // When the grid is empty, output zeros == 0/1, instead of NaN. + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +template +void bilinear_interpolate_gradient( + const int height, + const int width, + T y, + T x, + T& w1, + T& w2, + T& w3, + T& w4, + int& x_low, + int& x_high, + int& y_low, + int& y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +inline void add(T* address, const T& val) { + *address += val; +} + +template +void ROIAlignBackward( + const int nthreads, + // may not be contiguous, and should be indexed using n_stride, etc + const T* grad_output, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + T* grad_input, + const T* rois, + const int n_stride, + const int c_stride, + const int h_stride, + const int w_stride, + bool aligned) { + for (int index = 0; index < nthreads; index++) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_rois[1] * spatial_scale - offset; + T roi_start_h = offset_rois[2] * spatial_scale - offset; + T roi_end_w = offset_rois[3] * spatial_scale - offset; + T roi_end_h = offset_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (aligned) { + AT_ASSERTM( + roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlign do not have non-negative size!"); + } else { // for backward-compatibility only + roi_width = std::max(roi_width, (T)1.); + roi_height = std::max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + int output_offset = n * n_stride + c * c_stride; + const T* offset_grad_output = grad_output + output_offset; + const T grad_output_this_bin = + offset_grad_output[ph * h_stride + pw * w_stride]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient( + height, + width, + y, + x, + w1, + w2, + w3, + w4, + x_low, + x_high, + y_low, + y_high, + index); + + T g1 = grad_output_this_bin * w1 / count; + T g2 = grad_output_this_bin * w2 / count; + T g3 = grad_output_this_bin * w3 / count; + T g4 = grad_output_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // for +} // ROIAlignBackward + +} // namespace + +namespace detectron2 { + +at::Tensor ROIAlign_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + bool aligned) { + AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); + AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); + + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlign_forward_cpu"; + at::checkAllSameType(c, {input_t, rois_t}); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + at::Tensor output = at::zeros( + {num_rois, channels, pooled_height, pooled_width}, input.options()); + + auto output_size = num_rois * pooled_height * pooled_width * channels; + + if (output.numel() == 0) + return output; + + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "ROIAlign_forward", [&] { + ROIAlignForward( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois_.data_ptr(), + output.data_ptr(), + aligned); + }); + return output; +} + +at::Tensor ROIAlign_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio, + bool aligned) { + AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor"); + AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); + + at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlign_backward_cpu"; + at::checkAllSameType(c, {grad_t, rois_t}); + + at::Tensor grad_input = + at::zeros({batch_size, channels, height, width}, grad.options()); + + // handle possibly empty gradients + if (grad.numel() == 0) { + return grad_input; + } + + // get stride values to ensure indexing into gradients is correct. 
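+  // The incoming gradient may be a non-contiguous view, so element (n, c, ph, pw)
+  // is addressed with these strides rather than assuming an NCHW-contiguous layout.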
+ int n_stride = grad.stride(0); + int c_stride = grad.stride(1); + int h_stride = grad.stride(2); + int w_stride = grad.stride(3); + + auto rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.scalar_type(), "ROIAlign_forward", [&] { + ROIAlignBackward( + grad.numel(), + grad.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data_ptr(), + rois_.data_ptr(), + n_stride, + c_stride, + h_stride, + w_stride, + aligned); + }); + return grad_input; +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu b/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..2e05953b03089203d29bc304726afbca7ee5d464 --- /dev/null +++ b/detectron2/layers/csrc/ROIAlign/ROIAlign_cuda.cu @@ -0,0 +1,430 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + +template +__device__ T bilinear_interpolate( + const T* bottom_data, + const int height, + const int width, + T y, + T x, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + return 0; + } + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + // do bilinear interpolation + T v1 = bottom_data[y_low * width + x_low]; + T v2 = bottom_data[y_low * width + x_high]; + T v3 = bottom_data[y_high * width + x_low]; + T v4 = bottom_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__global__ void RoIAlignForward( + const int nthreads, + const T* bottom_data, + const T spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* bottom_rois, + T* top_data, + bool aligned) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + // When the grid is empty, output zeros == 0/1, instead of NaN. + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +template +__device__ void bilinear_interpolate_gradient( + const int height, + const int width, + T y, + T x, + T& w1, + T& w2, + T& w3, + T& w4, + int& x_low, + int& x_high, + int& y_low, + int& y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + + // reference in forward + // T v1 = bottom_data[y_low * width + x_low]; + // T v2 = bottom_data[y_low * width + x_high]; + // T v3 = bottom_data[y_high * width + x_low]; + // T v4 = bottom_data[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +__global__ void RoIAlignBackwardFeature( + const int nthreads, + const T* top_diff, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + T* bottom_diff, + const T* bottom_rois, + bool aligned) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient( + height, + width, + y, + x, + w1, + w2, + w3, + w4, + x_low, + x_high, + y_low, + y_high, + index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd( + offset_bottom_diff + y_low * width + x_low, static_cast(g1)); + atomicAdd( + offset_bottom_diff + y_low * width + x_high, static_cast(g2)); + atomicAdd( + offset_bottom_diff + y_high * width + x_low, static_cast(g3)); + atomicAdd( + offset_bottom_diff + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + +namespace detectron2 { + +at::Tensor ROIAlign_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + bool aligned) { + AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlign_forward_cuda"; + at::checkAllSameGPU(c, {input_t, rois_t}); + at::checkAllSameType(c, {input_t, rois_t}); + at::cuda::CUDAGuard device_guard(input.device()); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty( + {num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min( + at::cuda::ATenCeilDiv( + static_cast(output_size), static_cast(512)), + static_cast(4096))); + dim3 block(512); + + if (output.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return output; + } + + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { + RoIAlignForward<<>>( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois_.data_ptr(), + output.data_ptr(), + aligned); + }); + cudaDeviceSynchronize(); + AT_CUDA_CHECK(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlign_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio, + bool aligned) { + AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + + at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; + at::CheckedFrom c = "ROIAlign_backward_cuda"; + at::checkAllSameGPU(c, {grad_t, rois_t}); + 
at::checkAllSameType(c, {grad_t, rois_t}); + at::cuda::CUDAGuard device_guard(grad.device()); + + auto num_rois = rois.size(0); + auto grad_input = + at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min( + at::cuda::ATenCeilDiv( + static_cast(grad.numel()), static_cast(512)), + static_cast(4096))); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; + } + + auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] { + RoIAlignBackwardFeature<<>>( + grad.numel(), + grad_.data_ptr(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data_ptr(), + rois_.data_ptr(), + aligned); + }); + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h new file mode 100644 index 0000000000000000000000000000000000000000..a99c8ebddaa4936e26437b42d62e2b8355c655aa --- /dev/null +++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated.h @@ -0,0 +1,115 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#pragma once +#include + +namespace detectron2 { + +at::Tensor ROIAlignRotated_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + +at::Tensor ROIAlignRotated_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio); + +#ifdef WITH_CUDA +at::Tensor ROIAlignRotated_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio); + +at::Tensor ROIAlignRotated_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio); +#endif + +// Interface for Python +inline at::Tensor ROIAlignRotated_forward( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + if (input.is_cuda()) { +#ifdef WITH_CUDA + return ROIAlignRotated_forward_cuda( + input, + rois, + spatial_scale, + pooled_height, + pooled_width, + sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlignRotated_forward_cpu( + input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); +} + +inline at::Tensor ROIAlignRotated_backward( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + if (grad.is_cuda()) { +#ifdef WITH_CUDA + return ROIAlignRotated_backward_cuda( + grad, + rois, + spatial_scale, + pooled_height, + pooled_width, + batch_size, + channels, + 
height, + width, + sampling_ratio); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + return ROIAlignRotated_backward_cpu( + grad, + rois, + spatial_scale, + pooled_height, + pooled_width, + batch_size, + channels, + height, + width, + sampling_ratio); +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e5e1ffdccd0e2ced15fa34b4906388d371bffe2 --- /dev/null +++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cpu.cpp @@ -0,0 +1,522 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include +#include "ROIAlignRotated.h" + +// Note: this implementation originates from the Caffe2 ROIAlignRotated Op +// and PyTorch ROIAlign (non-rotated) Op implementations. +// The key difference between this implementation and those ones is +// we don't do "legacy offset" in this version, as there aren't many previous +// works, if any, using the "legacy" ROIAlignRotated Op. +// This would make the interface a bit cleaner. + +namespace detectron2 { + +namespace { +template +struct PreCalc { + int pos1; + int pos2; + int pos3; + int pos4; + T w1; + T w2; + T w3; + T w4; +}; + +template +void pre_calc_for_bilinear_interpolate( + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int iy_upper, + const int ix_upper, + T roi_start_h, + T roi_start_w, + T bin_size_h, + T bin_size_w, + int roi_bin_grid_h, + int roi_bin_grid_w, + T roi_center_h, + T roi_center_w, + T cos_theta, + T sin_theta, + std::vector>& pre_calc) { + int pre_calc_index = 0; + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + for (int iy = 0; iy < iy_upper; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < ix_upper; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + // In image space, (y, x) is the order for Right Handed System, + // and this is essentially multiplying the point by a rotation matrix + // to rotate it counterclockwise through angle theta. + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + // deal with: inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + PreCalc pc; + pc.pos1 = 0; + pc.pos2 = 0; + pc.pos3 = 0; + pc.pos4 = 0; + pc.w1 = 0; + pc.w2 = 0; + pc.w3 = 0; + pc.w4 = 0; + pre_calc[pre_calc_index] = pc; + pre_calc_index += 1; + continue; + } + + if (y < 0) { + y = 0; + } + if (x < 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + // save weights and indices + PreCalc pc; + pc.pos1 = y_low * width + x_low; + pc.pos2 = y_low * width + x_high; + pc.pos3 = y_high * width + x_low; + pc.pos4 = y_high * width + x_high; + pc.w1 = w1; + pc.w2 = w2; + pc.w3 = w3; + pc.w4 = w4; + pre_calc[pre_calc_index] = pc; + + pre_calc_index += 1; + } + } + } + } +} + +template +void bilinear_interpolate_gradient( + const int height, + const int width, + T y, + T x, + T& w1, + T& w2, + T& w3, + T& w4, + int& x_low, + int& x_high, + int& y_low, + int& y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template +inline void add(T* address, const T& val) { + *address += val; +} + +} // namespace + +template +void ROIAlignRotatedForward( + const int nthreads, + const T* input, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* rois, + T* output) { + int n_rois = nthreads / channels / pooled_width / pooled_height; + // (n, c, ph, pw) is an element in the pooled output + // can be parallelized using omp + // #pragma omp parallel for num_threads(32) + for (int n = 0; n < n_rois; n++) { + int index_n = n * channels * pooled_width * pooled_height; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + AT_ASSERTM( + roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlignRotated do not have non-negative size!"); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + // we want to precalculate indices and weights shared by all channels, + // this is the key point of optimization + std::vector> pre_calc( + roi_bin_grid_h * roi_bin_grid_w * pooled_width * pooled_height); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + pre_calc_for_bilinear_interpolate( + height, + width, + pooled_height, + pooled_width, + roi_bin_grid_h, + roi_bin_grid_w, + roi_start_h, + roi_start_w, + bin_size_h, + bin_size_w, + roi_bin_grid_h, + roi_bin_grid_w, + roi_center_h, + roi_center_w, + cos_theta, + sin_theta, + pre_calc); + + for (int c = 0; c < channels; c++) { + int index_n_c = index_n + c * pooled_width * pooled_height; + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + int pre_calc_index = 0; + + for (int ph = 0; ph < pooled_height; ph++) { + for (int pw = 0; pw < pooled_width; pw++) { + int index = index_n_c + ph * pooled_width + pw; + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + PreCalc pc = pre_calc[pre_calc_index]; + output_val += pc.w1 * offset_input[pc.pos1] + + pc.w2 * offset_input[pc.pos2] + + pc.w3 * offset_input[pc.pos3] + pc.w4 * offset_input[pc.pos4]; + + pre_calc_index += 1; + } + } + output_val /= count; + + output[index] = output_val; + } // for pw + } // for ph + } // for c + } // for n +} + +template +void ROIAlignRotatedBackward( + const int nthreads, + // may not be contiguous. should index using n_stride, etc + const T* grad_output, + const T& spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + T* grad_input, + const T* rois, + const int n_stride, + const int c_stride, + const int h_stride, + const int w_stride) { + for (int index = 0; index < nthreads; index++) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + AT_ASSERTM( + roi_width >= 0 && roi_height >= 0, + "ROIs in ROIAlignRotated do not have non-negative size!"); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_grad_input = + grad_input + ((roi_batch_ind * channels + c) * height * width); + + int output_offset = n * n_stride + c * c_stride; + const T* offset_grad_output = grad_output + output_offset; + const T grad_output_this_bin = + offset_grad_output[ph * h_stride + pw * w_stride]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? 
sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient( + height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); + + T g1 = grad_output_this_bin * w1 / count; + T g2 = grad_output_this_bin * w2 / count; + T g3 = grad_output_this_bin * w3 / count; + T g4 = grad_output_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + // atomic add is not needed for now since it is single threaded + add(offset_grad_input + y_low * width + x_low, static_cast(g1)); + add(offset_grad_input + y_low * width + x_high, static_cast(g2)); + add(offset_grad_input + y_high * width + x_low, static_cast(g3)); + add(offset_grad_input + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // for +} // ROIAlignRotatedBackward + +at::Tensor ROIAlignRotated_forward_cpu( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.device().is_cpu(), "input must be a CPU tensor"); + AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); + + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlign_forward_cpu"; + at::checkAllSameType(c, {input_t, rois_t}); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + at::Tensor output = at::zeros( + {num_rois, channels, pooled_height, pooled_width}, input.options()); + + auto output_size = num_rois * pooled_height * pooled_width * channels; + + if (output.numel() == 0) { + return output; + } + + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "ROIAlignRotated_forward", [&] { + ROIAlignRotatedForward( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois_.data_ptr(), + output.data_ptr()); + }); + return output; +} + +at::Tensor ROIAlignRotated_backward_cpu( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.device().is_cpu(), "grad must be a CPU tensor"); + AT_ASSERTM(rois.device().is_cpu(), "rois must be a CPU tensor"); + + at::TensorArg grad_t{grad, "grad", 1}, 
rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlignRotated_backward_cpu"; + at::checkAllSameType(c, {grad_t, rois_t}); + + at::Tensor grad_input = + at::zeros({batch_size, channels, height, width}, grad.options()); + + // handle possibly empty gradients + if (grad.numel() == 0) { + return grad_input; + } + + // get stride values to ensure indexing into gradients is correct. + int n_stride = grad.stride(0); + int c_stride = grad.stride(1); + int h_stride = grad.stride(2); + int w_stride = grad.stride(3); + + auto rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + grad.scalar_type(), "ROIAlignRotated_forward", [&] { + ROIAlignRotatedBackward( + grad.numel(), + grad.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data_ptr(), + rois_.data_ptr(), + n_stride, + c_stride, + h_stride, + w_stride); + }); + return grad_input; +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..9c376fc6973b75b34967faf870a9f85a3ee430be --- /dev/null +++ b/detectron2/layers/csrc/ROIAlignRotated/ROIAlignRotated_cuda.cu @@ -0,0 +1,443 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + +// Note: this implementation originates from the Caffe2 ROIAlignRotated Op +// and PyTorch ROIAlign (non-rotated) Op implementations. +// The key difference between this implementation and those ones is +// we don't do "legacy offset" in this version, as there aren't many previous +// works, if any, using the "legacy" ROIAlignRotated Op. +// This would make the interface a bit cleaner. + +namespace detectron2 { + +namespace { + +template +__device__ T bilinear_interpolate( + const T* input, + const int height, + const int width, + T y, + T x) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + return 0; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + // do bilinear interpolation + T v1 = input[y_low * width + x_low]; + T v2 = input[y_low * width + x_high]; + T v3 = input[y_high * width + x_low]; + T v4 = input[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template +__device__ void bilinear_interpolate_gradient( + const int height, + const int width, + T y, + T x, + T& w1, + T& w2, + T& w3, + T& w4, + int& x_low, + int& x_high, + int& y_low, + int& y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y < 0) { + y = 0; + } + + if (x < 0) { + x = 0; + } + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + + // reference in forward + // T v1 = input[y_low * width + x_low]; + // T v2 = input[y_low * width + x_high]; + // T v3 = input[y_high * width + x_low]; + // T v4 = input[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +} // namespace + +template +__global__ void RoIAlignRotatedForward( + const int nthreads, + const T* input, + const T spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + const T* rois, + T* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + const T* offset_input = + input + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (inte gral) pooling inside a bin + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T val = bilinear_interpolate(offset_input, height, width, y, x); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +template +__global__ void RoIAlignRotatedBackwardFeature( + const int nthreads, + const T* top_diff, + const int num_rois, + const T spatial_scale, + const int channels, + const int height, + const int width, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, + T* bottom_diff, + const T* rois) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* current_roi = rois + n * 6; + int roi_batch_ind = current_roi[0]; + + // Do not use rounding; this implementation detail is critical + // ROIAlignRotated supports align == true, i.e., continuous coordinate + // by default, thus the 0.5 offset + T offset = (T)0.5; + T roi_center_w = current_roi[1] * spatial_scale - offset; + T roi_center_h = current_roi[2] * spatial_scale - offset; + T roi_width = current_roi[3] * spatial_scale; + T roi_height = current_roi[4] * spatial_scale; + T theta = current_roi[5] * M_PI / 180.0; + T cos_theta = cos(theta); + T sin_theta = sin(theta); + + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + T* offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + T roi_start_h = -roi_height / 2.0; + T roi_start_w = -roi_width / 2.0; + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T yy = roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + T y = yy * cos_theta - xx * sin_theta + roi_center_h; + T x = yy * sin_theta + xx * cos_theta + roi_center_w; + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient( + height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd( + offset_bottom_diff + y_low * width + x_low, static_cast(g1)); + atomicAdd( + offset_bottom_diff + y_low * width + x_high, static_cast(g2)); + atomicAdd( + offset_bottom_diff + y_high * width + x_low, static_cast(g3)); + atomicAdd( + offset_bottom_diff + y_high * width + x_high, static_cast(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignRotatedBackward + +at::Tensor ROIAlignRotated_forward_cuda( + const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio) { + AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlignRotated_forward_cuda"; + at::checkAllSameGPU(c, {input_t, rois_t}); + at::checkAllSameType(c, {input_t, rois_t}); + at::cuda::CUDAGuard device_guard(input.device()); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty( + {num_rois, channels, pooled_height, pooled_width}, input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min( + at::cuda::ATenCeilDiv( + static_cast(output_size), static_cast(512)), + static_cast(4096))); + dim3 block(512); + + if (output.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return output; + } + + auto input_ = input.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES( + input.scalar_type(), "ROIAlignRotated_forward", [&] { + RoIAlignRotatedForward<<>>( + output_size, + input_.data_ptr(), + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + rois_.data_ptr(), + output.data_ptr()); + }); + cudaDeviceSynchronize(); + AT_CUDA_CHECK(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlignRotated_backward_cuda( + const at::Tensor& grad, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int batch_size, + const int channels, + const int height, + const int width, + const int sampling_ratio) { + AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + + at::TensorArg grad_t{grad, "grad", 1}, 
rois_t{rois, "rois", 2}; + at::CheckedFrom c = "ROIAlign_backward_cuda"; + at::checkAllSameGPU(c, {grad_t, rois_t}); + at::checkAllSameType(c, {grad_t, rois_t}); + at::cuda::CUDAGuard device_guard(grad.device()); + + auto num_rois = rois.size(0); + auto grad_input = + at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min( + at::cuda::ATenCeilDiv( + static_cast(grad.numel()), static_cast(512)), + static_cast(4096))); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; + } + + auto grad_ = grad.contiguous(), rois_ = rois.contiguous(); + AT_DISPATCH_FLOATING_TYPES( + grad.scalar_type(), "ROIAlignRotated_backward", [&] { + RoIAlignRotatedBackwardFeature<<>>( + grad.numel(), + grad_.data_ptr(), + num_rois, + spatial_scale, + channels, + height, + width, + pooled_height, + pooled_width, + sampling_ratio, + grad_input.data_ptr(), + rois_.data_ptr()); + }); + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h new file mode 100644 index 0000000000000000000000000000000000000000..7c389c6cbdbefdfb623296b0918c27c634d621bb --- /dev/null +++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated.h @@ -0,0 +1,35 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#pragma once +#include + +namespace detectron2 { + +at::Tensor box_iou_rotated_cpu( + const at::Tensor& boxes1, + const at::Tensor& boxes2); + +#ifdef WITH_CUDA +at::Tensor box_iou_rotated_cuda( + const at::Tensor& boxes1, + const at::Tensor& boxes2); +#endif + +// Interface for Python +// inline is needed to prevent multiple function definitions when this header is +// included by different cpps +inline at::Tensor box_iou_rotated( + const at::Tensor& boxes1, + const at::Tensor& boxes2) { + assert(boxes1.device().is_cuda() == boxes2.device().is_cuda()); + if (boxes1.device().is_cuda()) { +#ifdef WITH_CUDA + return box_iou_rotated_cuda(boxes1.contiguous(), boxes2.contiguous()); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + return box_iou_rotated_cpu(boxes1.contiguous(), boxes2.contiguous()); +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f2b02d171077d96fcaf29b585fa6a678af1f2842 --- /dev/null +++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cpu.cpp @@ -0,0 +1,39 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#include "box_iou_rotated.h" +#include "box_iou_rotated_utils.h" + +namespace detectron2 { + +template +void box_iou_rotated_cpu_kernel( + const at::Tensor& boxes1, + const at::Tensor& boxes2, + at::Tensor& ious) { + auto num_boxes1 = boxes1.size(0); + auto num_boxes2 = boxes2.size(0); + + for (int i = 0; i < num_boxes1; i++) { + for (int j = 0; j < num_boxes2; j++) { + ious[i * num_boxes2 + j] = single_box_iou_rotated( + boxes1[i].data_ptr(), boxes2[j].data_ptr()); + } + } +} + +at::Tensor box_iou_rotated_cpu( + // input must be contiguous: + const at::Tensor& boxes1, + const at::Tensor& boxes2) { + auto num_boxes1 = boxes1.size(0); + auto num_boxes2 = boxes2.size(0); + at::Tensor ious = + at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); + + box_iou_rotated_cpu_kernel(boxes1, boxes2, ious); + + // reshape from 1d array to 2d array + auto shape = std::vector{num_boxes1, num_boxes2}; + return ious.reshape(shape); +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..e3403c11796cb313771b8b6350c793b9fbdfbcaa --- /dev/null +++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu @@ -0,0 +1,130 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include +#include +#include +#include +#include "box_iou_rotated_utils.h" + +namespace detectron2 { + +// 2D block with 32 * 16 = 512 threads per block +const int BLOCK_DIM_X = 32; +const int BLOCK_DIM_Y = 16; + +template +__global__ void box_iou_rotated_cuda_kernel( + const int n_boxes1, + const int n_boxes2, + const T* dev_boxes1, + const T* dev_boxes2, + T* dev_ious) { + const int row_start = blockIdx.x * blockDim.x; + const int col_start = blockIdx.y * blockDim.y; + + const int row_size = min(n_boxes1 - row_start, blockDim.x); + const int col_size = min(n_boxes2 - col_start, blockDim.y); + + __shared__ float block_boxes1[BLOCK_DIM_X * 5]; + __shared__ float block_boxes2[BLOCK_DIM_Y * 5]; + + // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y + if (threadIdx.x < row_size && threadIdx.y == 0) { + block_boxes1[threadIdx.x * 5 + 0] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 0]; + block_boxes1[threadIdx.x * 5 + 1] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 1]; + block_boxes1[threadIdx.x * 5 + 2] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 2]; + block_boxes1[threadIdx.x * 5 + 3] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 3]; + block_boxes1[threadIdx.x * 5 + 4] = + dev_boxes1[(row_start + threadIdx.x) * 5 + 4]; + } + + if (threadIdx.x < col_size && threadIdx.y == 0) { + block_boxes2[threadIdx.x * 5 + 0] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 0]; + block_boxes2[threadIdx.x * 5 + 1] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 1]; + block_boxes2[threadIdx.x * 5 + 2] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 2]; + block_boxes2[threadIdx.x * 5 + 3] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 3]; + block_boxes2[threadIdx.x * 5 + 4] = + dev_boxes2[(col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size && threadIdx.y < col_size) { + int offset = (row_start + threadIdx.x) * n_boxes2 + col_start + threadIdx.y; + dev_ious[offset] = single_box_iou_rotated( + block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); + } +} + +at::Tensor box_iou_rotated_cuda( + // input must be contiguous + const at::Tensor& 
boxes1, + const at::Tensor& boxes2) { + using scalar_t = float; + AT_ASSERTM( + boxes1.scalar_type() == at::kFloat, "boxes1 must be a float tensor"); + AT_ASSERTM( + boxes2.scalar_type() == at::kFloat, "boxes2 must be a float tensor"); + AT_ASSERTM(boxes1.is_cuda(), "boxes1 must be a CUDA tensor"); + AT_ASSERTM(boxes2.is_cuda(), "boxes2 must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(boxes1.device()); + + auto num_boxes1 = boxes1.size(0); + auto num_boxes2 = boxes2.size(0); + + at::Tensor ious = + at::empty({num_boxes1 * num_boxes2}, boxes1.options().dtype(at::kFloat)); + + bool transpose = false; + if (num_boxes1 > 0 && num_boxes2 > 0) { + scalar_t *data1 = boxes1.data_ptr(), + *data2 = boxes2.data_ptr(); + + if (num_boxes2 > 65535 * BLOCK_DIM_Y) { + AT_ASSERTM( + num_boxes1 <= 65535 * BLOCK_DIM_Y, + "Too many boxes for box_iou_rotated_cuda!"); + // x dim is allowed to be large, but y dim cannot, + // so we transpose the two to avoid "invalid configuration argument" + // error. We assume one of them is small. Otherwise the result is hard to + // fit in memory anyway. + std::swap(num_boxes1, num_boxes2); + std::swap(data1, data2); + transpose = true; + } + + const int blocks_x = + at::cuda::ATenCeilDiv(static_cast(num_boxes1), BLOCK_DIM_X); + const int blocks_y = + at::cuda::ATenCeilDiv(static_cast(num_boxes2), BLOCK_DIM_Y); + + dim3 blocks(blocks_x, blocks_y); + dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + box_iou_rotated_cuda_kernel<<>>( + num_boxes1, + num_boxes2, + data1, + data2, + (scalar_t*)ious.data_ptr()); + + AT_CUDA_CHECK(cudaGetLastError()); + } + + // reshape from 1d array to 2d array + auto shape = std::vector{num_boxes1, num_boxes2}; + if (transpose) { + return ious.view(shape).t(); + } else { + return ious.view(shape); + } +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..d8757ec376e8703e1edc5f76bf5ef214620bd69f --- /dev/null +++ b/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h @@ -0,0 +1,363 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#pragma once + +#include +#include + +#ifdef __CUDACC__ +// Designates functions callable from the host (CPU) and the device (GPU) +#define HOST_DEVICE __host__ __device__ +#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ +#else +#include +#define HOST_DEVICE +#define HOST_DEVICE_INLINE HOST_DEVICE inline +#endif + +namespace detectron2 { + +namespace { + +template +struct RotatedBox { + T x_ctr, y_ctr, w, h, a; +}; + +template +struct Point { + T x, y; + HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {} + HOST_DEVICE_INLINE Point operator+(const Point& p) const { + return Point(x + p.x, y + p.y); + } + HOST_DEVICE_INLINE Point& operator+=(const Point& p) { + x += p.x; + y += p.y; + return *this; + } + HOST_DEVICE_INLINE Point operator-(const Point& p) const { + return Point(x - p.x, y - p.y); + } + HOST_DEVICE_INLINE Point operator*(const T coeff) const { + return Point(x * coeff, y * coeff); + } +}; + +template +HOST_DEVICE_INLINE T dot_2d(const Point& A, const Point& B) { + return A.x * B.x + A.y * B.y; +} + +// R: result type. 
can be different from input type +template +HOST_DEVICE_INLINE R cross_2d(const Point& A, const Point& B) { + return static_cast(A.x) * static_cast(B.y) - + static_cast(B.x) * static_cast(A.y); +} + +template +HOST_DEVICE_INLINE void get_rotated_vertices( + const RotatedBox& box, + Point (&pts)[4]) { + // M_PI / 180. == 0.01745329251 + double theta = box.a * 0.01745329251; + T cosTheta2 = (T)cos(theta) * 0.5f; + T sinTheta2 = (T)sin(theta) * 0.5f; + + // y: top --> down; x: left --> right + pts[0].x = box.x_ctr + sinTheta2 * box.h + cosTheta2 * box.w; + pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; + pts[1].x = box.x_ctr - sinTheta2 * box.h + cosTheta2 * box.w; + pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; + pts[2].x = 2 * box.x_ctr - pts[0].x; + pts[2].y = 2 * box.y_ctr - pts[0].y; + pts[3].x = 2 * box.x_ctr - pts[1].x; + pts[3].y = 2 * box.y_ctr - pts[1].y; +} + +template +HOST_DEVICE_INLINE int get_intersection_points( + const Point (&pts1)[4], + const Point (&pts2)[4], + Point (&intersections)[24]) { + // Line vector + // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] + Point vec1[4], vec2[4]; + for (int i = 0; i < 4; i++) { + vec1[i] = pts1[(i + 1) % 4] - pts1[i]; + vec2[i] = pts2[(i + 1) % 4] - pts2[i]; + } + + // Line test - test all line combos for intersection + int num = 0; // number of intersections + for (int i = 0; i < 4; i++) { + for (int j = 0; j < 4; j++) { + // Solve for 2x2 Ax=b + T det = cross_2d(vec2[j], vec1[i]); + + // This takes care of parallel lines + if (fabs(det) <= 1e-14) { + continue; + } + + auto vec12 = pts2[j] - pts1[i]; + + T t1 = cross_2d(vec2[j], vec12) / det; + T t2 = cross_2d(vec1[i], vec12) / det; + + if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { + intersections[num++] = pts1[i] + vec1[i] * t1; + } + } + } + + // Check for vertices of rect1 inside rect2 + { + const auto& AB = vec2[0]; + const auto& DA = vec2[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + // assume ABCD is the rectangle, and P is the point to be judged + // P is inside ABCD iff. P's projection on AB lies within AB + // and P's projection on AD lies within AD + + auto AP = pts1[i] - pts2[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts1[i]; + } + } + } + + // Reverse the check - check for vertices of rect2 inside rect1 + { + const auto& AB = vec1[0]; + const auto& DA = vec1[3]; + auto ABdotAB = dot_2d(AB, AB); + auto ADdotAD = dot_2d(DA, DA); + for (int i = 0; i < 4; i++) { + auto AP = pts2[i] - pts1[0]; + + auto APdotAB = dot_2d(AP, AB); + auto APdotAD = -dot_2d(AP, DA); + + if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && + (APdotAD <= ADdotAD)) { + intersections[num++] = pts2[i]; + } + } + } + + return num; +} + +template +HOST_DEVICE_INLINE int convex_hull_graham( + const Point (&p)[24], + const int& num_in, + Point (&q)[24], + bool shift_to_zero = false) { + assert(num_in >= 2); + + // Step 1: + // Find point with minimum y + // if more than 1 points have the same minimum y, + // pick the one with the minimum x. 
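+  // (This is the standard Graham-scan pivot: the lowest point, with ties
+  // broken by the smaller x, is guaranteed to lie on the hull and serves as
+  // the origin for the angular sort in Step 3.)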
+ int t = 0; + for (int i = 1; i < num_in; i++) { + if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { + t = i; + } + } + auto& start = p[t]; // starting point + + // Step 2: + // Subtract starting point from every points (for sorting in the next step) + for (int i = 0; i < num_in; i++) { + q[i] = p[i] - start; + } + + // Swap the starting point to position 0 + auto tmp = q[0]; + q[0] = q[t]; + q[t] = tmp; + + // Step 3: + // Sort point 1 ~ num_in according to their relative cross-product values + // (essentially sorting according to angles) + // If the angles are the same, sort according to their distance to origin + T dist[24]; +#ifdef __CUDACC__ + // compute distance to origin before sort, and sort them together with the + // points + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } + + // CUDA version + // In the future, we can potentially use thrust + // for sorting here to improve speed (though not guaranteed) + for (int i = 1; i < num_in - 1; i++) { + for (int j = i + 1; j < num_in; j++) { + T crossProduct = cross_2d(q[i], q[j]); + if ((crossProduct < -1e-6) || + (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { + auto q_tmp = q[i]; + q[i] = q[j]; + q[j] = q_tmp; + auto dist_tmp = dist[i]; + dist[i] = dist[j]; + dist[j] = dist_tmp; + } + } + } +#else + // CPU version + std::sort( + q + 1, q + num_in, [](const Point& A, const Point& B) -> bool { + T temp = cross_2d(A, B); + if (fabs(temp) < 1e-6) { + return dot_2d(A, A) < dot_2d(B, B); + } else { + return temp > 0; + } + }); + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } +#endif + + // Step 4: + // Make sure there are at least 2 points (that don't overlap with each other) + // in the stack + int k; // index of the non-overlapped second point + for (k = 1; k < num_in; k++) { + if (dist[k] > 1e-8) { + break; + } + } + if (k == num_in) { + // We reach the end, which means the convex hull is just one point + q[0] = p[t]; + return 1; + } + q[1] = q[k]; + int m = 2; // 2 points in the stack + // Step 5: + // Finally we can start the scanning process. + // When a non-convex relationship between the 3 points is found + // (either concave shape or duplicated points), + // we pop the previous point from the stack + // until the 3-point relationship is convex again, or + // until the stack only contains two points + for (int i = k + 1; i < num_in; i++) { + while (m > 1) { + auto q1 = q[i] - q[m - 2], q2 = q[m - 1] - q[m - 2]; + // cross_2d() uses FMA and therefore computes round(round(q1.x*q2.y) - + // q2.x*q1.y) So it may not return 0 even when q1==q2. Therefore we + // compare round(q1.x*q2.y) and round(q2.x*q1.y) directly. (round means + // round to nearest floating point). + if (q1.x * q2.y >= q2.x * q1.y) + m--; + else + break; + } + // Using double also helps, but float can solve the issue for now. + // while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) + // >= 0) { + // m--; + // } + q[m++] = q[i]; + } + + // Step 6 (Optional): + // In general sense we need the original coordinates, so we + // need to shift the points back (reverting Step 2) + // But if we're only interested in getting the area/perimeter of the shape + // We can simply return. 
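+  // In other words: when shift_to_zero is true the hull stays in the
+  // pivot-centered frame (sufficient for polygon_area below, which is
+  // translation-invariant); otherwise the points are translated back by
+  // `start` into the original coordinates.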
+ if (!shift_to_zero) { + for (int i = 0; i < m; i++) { + q[i] += start; + } + } + + return m; +} + +template +HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int& m) { + if (m <= 2) { + return 0; + } + + T area = 0; + for (int i = 1; i < m - 1; i++) { + area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); + } + + return area / 2.0; +} + +template +HOST_DEVICE_INLINE T rotated_boxes_intersection( + const RotatedBox& box1, + const RotatedBox& box2) { + // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned + // from rotated_rect_intersection_pts + Point intersectPts[24], orderedPts[24]; + + Point pts1[4]; + Point pts2[4]; + get_rotated_vertices(box1, pts1); + get_rotated_vertices(box2, pts2); + + int num = get_intersection_points(pts1, pts2, intersectPts); + + if (num <= 2) { + return 0.0; + } + + // Convex Hull to order the intersection points in clockwise order and find + // the contour area. + int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); + return polygon_area(orderedPts, num_convex); +} + +} // namespace + +template +HOST_DEVICE_INLINE T +single_box_iou_rotated(T const* const box1_raw, T const* const box2_raw) { + // shift center to the middle point to achieve higher precision in result + RotatedBox box1, box2; + auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; + auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; + box1.x_ctr = box1_raw[0] - center_shift_x; + box1.y_ctr = box1_raw[1] - center_shift_y; + box1.w = box1_raw[2]; + box1.h = box1_raw[3]; + box1.a = box1_raw[4]; + box2.x_ctr = box2_raw[0] - center_shift_x; + box2.y_ctr = box2_raw[1] - center_shift_y; + box2.w = box2_raw[2]; + box2.h = box2_raw[3]; + box2.a = box2_raw[4]; + + T area1 = box1.w * box1.h; + T area2 = box2.w * box2.h; + if (area1 < 1e-14 || area2 < 1e-14) { + return 0.f; + } + + T intersection = rotated_boxes_intersection(box1, box2); + T iou = intersection / (area1 + area2 - intersection); + return iou; +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/cuda_version.cu b/detectron2/layers/csrc/cuda_version.cu new file mode 100644 index 0000000000000000000000000000000000000000..af088e7572f6f27b9d653b4d7178f4e03de6befc --- /dev/null +++ b/detectron2/layers/csrc/cuda_version.cu @@ -0,0 +1,9 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +#include + +namespace detectron2 { +int get_cudart_version() { + return CUDART_VERSION; +} +} // namespace detectron2 diff --git a/detectron2/layers/csrc/deformable/deform_conv.h b/detectron2/layers/csrc/deformable/deform_conv.h new file mode 100644 index 0000000000000000000000000000000000000000..49ccd868ace8fd79f6fcbde6fe41f2b95873c414 --- /dev/null +++ b/detectron2/layers/csrc/deformable/deform_conv.h @@ -0,0 +1,377 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#pragma once +#include + +namespace detectron2 { + +#ifdef WITH_CUDA +int deform_conv_forward_cuda( + at::Tensor input, + at::Tensor weight, + at::Tensor offset, + at::Tensor output, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step); + +int deform_conv_backward_input_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradInput, + at::Tensor gradOffset, + at::Tensor weight, + at::Tensor columns, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step); + +int deform_conv_backward_parameters_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + float scale, + int im2col_step); + +void modulated_deform_conv_cuda_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor output, + at::Tensor columns, + int kernel_h, + int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int group, + const int deformable_group, + const bool with_bias); + +void modulated_deform_conv_cuda_backward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor columns, + at::Tensor grad_input, + at::Tensor grad_weight, + at::Tensor grad_bias, + at::Tensor grad_offset, + at::Tensor grad_mask, + at::Tensor grad_output, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w, + int group, + int deformable_group, + const bool with_bias); + +#endif + +inline int deform_conv_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor offset, + at::Tensor output, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + if (input.is_cuda()) { +#ifdef WITH_CUDA + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return deform_conv_forward_cuda( + input, + weight, + offset, + output, + columns, + ones, + kW, + kH, + dW, + dH, + padW, + padH, + dilationW, + dilationH, + group, + deformable_group, + im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +inline int deform_conv_backward_input( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradInput, + at::Tensor gradOffset, + at::Tensor weight, + at::Tensor columns, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + if (gradOutput.is_cuda()) { +#ifdef WITH_CUDA + TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return deform_conv_backward_input_cuda( + input, + 
offset, + gradOutput, + gradInput, + gradOffset, + weight, + columns, + kW, + kH, + dW, + dH, + padW, + padH, + dilationW, + dilationH, + group, + deformable_group, + im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +inline int deform_conv_backward_filter( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + float scale, + int im2col_step) { + if (gradOutput.is_cuda()) { +#ifdef WITH_CUDA + TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return deform_conv_backward_parameters_cuda( + input, + offset, + gradOutput, + gradWeight, + columns, + ones, + kW, + kH, + dW, + dH, + padW, + padH, + dilationW, + dilationH, + group, + deformable_group, + scale, + im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +inline void modulated_deform_conv_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor output, + at::Tensor columns, + int kernel_h, + int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int group, + const int deformable_group, + const bool with_bias) { + if (input.is_cuda()) { +#ifdef WITH_CUDA + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return modulated_deform_conv_cuda_forward( + input, + weight, + bias, + ones, + offset, + mask, + output, + columns, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + group, + deformable_group, + with_bias); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +inline void modulated_deform_conv_backward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor columns, + at::Tensor grad_input, + at::Tensor grad_weight, + at::Tensor grad_bias, + at::Tensor grad_offset, + at::Tensor grad_mask, + at::Tensor grad_output, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w, + int group, + int deformable_group, + const bool with_bias) { + if (grad_output.is_cuda()) { +#ifdef WITH_CUDA + TORCH_CHECK(input.is_cuda(), "input tensor is not on GPU!"); + TORCH_CHECK(weight.is_cuda(), "weight tensor is not on GPU!"); + TORCH_CHECK(bias.is_cuda(), "bias tensor is not on GPU!"); + TORCH_CHECK(offset.is_cuda(), "offset tensor is not on GPU!"); + return modulated_deform_conv_cuda_backward( + input, + weight, + bias, + ones, + offset, + mask, + columns, + grad_input, + grad_weight, + grad_bias, + grad_offset, + grad_mask, + grad_output, + kernel_h, + kernel_w, + stride_h, + stride_w, + pad_h, + pad_w, + dilation_h, + dilation_w, + group, + deformable_group, + with_bias); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +} // namespace detectron2 diff --git 
a/detectron2/layers/csrc/deformable/deform_conv_cuda.cu b/detectron2/layers/csrc/deformable/deform_conv_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..5376db0cc4d93e245cfc9fea0f3b5715a1f88db2 --- /dev/null +++ b/detectron2/layers/csrc/deformable/deform_conv_cuda.cu @@ -0,0 +1,1131 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +// modified from +// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp +// Original license: Apache 2.0 + +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c +// Original license: Apache 2.0 + +#include + +#include "deform_conv.h" + +#include +#include + +namespace detectron2 { + +void deformable_im2col( + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor data_col); + +void deformable_col2im( + const at::Tensor data_col, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_im); + +void deformable_col2im_coord( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_offset); + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor data_col); + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor grad_im); + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + 
at::Tensor grad_offset, + at::Tensor grad_mask); + +void shape_check( + at::Tensor input, + at::Tensor offset, + at::Tensor* gradOutput, + at::Tensor weight, + int kH, + int kW, + int dH, + int dW, + int padH, + int padW, + int dilationH, + int dilationW, + int group, + int deformable_group) { + TORCH_CHECK( + weight.ndimension() == 4, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s", + weight.ndimension()); + + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + TORCH_CHECK( + kW > 0 && kH > 0, + "kernel size should be greater than zero, but got kH: %d kW: %d", + kH, + kW); + + TORCH_CHECK( + (weight.size(2) == kH && weight.size(3) == kW), + "kernel size should be consistent with weight, ", + "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", + kH, + kW, + weight.size(2), + weight.size(3)); + + TORCH_CHECK( + dW > 0 && dH > 0, + "stride should be greater than zero, but got dH: %d dW: %d", + dH, + dW); + + TORCH_CHECK( + dilationW > 0 && dilationH > 0, + "dilation should be greater than 0, but got dilationH: %d dilationW: %d", + dilationH, + dilationW); + + int ndim = input.ndimension(); + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + TORCH_CHECK( + ndim == 3 || ndim == 4, + "3D or 4D input tensor expected but got: %s", + ndim); + + long nInputPlane = weight.size(1) * group; + long inputHeight = input.size(dimh); + long inputWidth = input.size(dimw); + long nOutputPlane = weight.size(0); + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + TORCH_CHECK( + nInputPlane % deformable_group == 0, + "input channels must divide deformable group size"); + + if (outputWidth < 1 || outputHeight < 1) + AT_ERROR( + "Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", + nInputPlane, + inputHeight, + inputWidth, + nOutputPlane, + outputHeight, + outputWidth); + + TORCH_CHECK( + input.size(1) == nInputPlane, + "invalid number of input planes, expected: %d, but got: %d", + nInputPlane, + input.size(1)); + + TORCH_CHECK( + (inputHeight >= kH && inputWidth >= kW), + "input image is smaller than kernel"); + + TORCH_CHECK( + (offset.size(2) == outputHeight && offset.size(3) == outputWidth), + "invalid spatial size of offset, expected height: %d width: %d, but " + "got height: %d width: %d", + outputHeight, + outputWidth, + offset.size(2), + offset.size(3)); + + TORCH_CHECK( + (offset.size(1) == deformable_group * 2 * kH * kW), + "invalid number of channels of offset"); + + if (gradOutput != NULL) { + TORCH_CHECK( + gradOutput->size(dimf) == nOutputPlane, + "invalid number of gradOutput planes, expected: %d, but got: %d", + nOutputPlane, + gradOutput->size(dimf)); + + TORCH_CHECK( + (gradOutput->size(dimh) == outputHeight && + gradOutput->size(dimw) == outputWidth), + "invalid size of gradOutput, expected height: %d width: %d , but " + "got height: %d width: %d", + outputHeight, + outputWidth, + gradOutput->size(dimh), + gradOutput->size(dimw)); + } +} + +int deform_conv_forward_cuda( + at::Tensor input, + at::Tensor weight, + at::Tensor offset, + at::Tensor output, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + // todo: resize columns to include im2col: done + // todo: add im2col_step as input + // todo: add new output buffer and transpose it to output (or directly + // transpose output) todo: possibly change data indexing because of + // parallel_imgs + + shape_check( + input, + offset, + NULL, + weight, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + group, + deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input.unsqueeze_(0); + offset.unsqueeze_(0); + } + + // todo: assert batchsize dividable by im2col_step + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + output = output.view({batchSize / im2col_step, + im2col_step, + nOutputPlane, + outputHeight, + outputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < outputHeight * outputWidth) { + ones = at::ones({outputHeight, outputWidth}, input.options()); + } + + input = input.view({batchSize / im2col_step, + im2col_step, + nInputPlane, + inputHeight, + inputWidth}); + offset = offset.view({batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + + at::Tensor output_buffer = at::zeros( + {batchSize / im2col_step, + nOutputPlane, + im2col_step * outputHeight, + outputWidth}, + output.options()); + + output_buffer = output_buffer.view({output_buffer.size(0), + group, + output_buffer.size(1) / group, + 
output_buffer.size(2), + output_buffer.size(3)}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col( + input[elt], + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + + for (int g = 0; g < group; g++) { + output_buffer[elt][g] = output_buffer[elt][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output_buffer[elt][g]); + } + } + + output_buffer = + output_buffer.view({output_buffer.size(0), + output_buffer.size(1) * output_buffer.size(2), + output_buffer.size(3), + output_buffer.size(4)}); + + output_buffer = output_buffer.view({batchSize / im2col_step, + nOutputPlane, + im2col_step, + outputHeight, + outputWidth}); + output_buffer.transpose_(1, 2); + output.copy_(output_buffer); + output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + output = output.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_input_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradInput, + at::Tensor gradOffset, + at::Tensor weight, + at::Tensor columns, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + int im2col_step) { + shape_check( + input, + offset, + &gradOutput, + weight, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + group, + deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + weight = weight.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view({1, input.size(0), input.size(1), input.size(2)}); + offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)}); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = weight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + // change order of grad output + gradOutput = gradOutput.view({batchSize / im2col_step, + im2col_step, + nOutputPlane, + outputHeight, + outputWidth}); + gradOutput.transpose_(1, 2); + + gradInput = gradInput.view({batchSize / im2col_step, + im2col_step, + nInputPlane, + inputHeight, + inputWidth}); + input = input.view({batchSize / im2col_step, + im2col_step, + nInputPlane, 
+ inputHeight, + inputWidth}); + gradOffset = gradOffset.view({batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + offset = offset.view({batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + // divide into groups + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + gradOutput = gradOutput.view({gradOutput.size(0), + group, + gradOutput.size(1) / group, + gradOutput.size(2), + gradOutput.size(3), + gradOutput.size(4)}); + + for (int g = 0; g < group; g++) { + columns[g] = columns[g].addmm_( + weight[g].flatten(1).transpose(0, 1), + gradOutput[elt][g].flatten(1), + 0.0f, + 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradOutput = gradOutput.view({gradOutput.size(0), + gradOutput.size(1) * gradOutput.size(2), + gradOutput.size(3), + gradOutput.size(4), + gradOutput.size(5)}); + + deformable_col2im_coord( + columns, + input[elt], + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + gradOffset[elt]); + + deformable_col2im( + columns, + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + gradInput[elt]); + } + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + gradOffset = gradOffset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth}); + offset = offset.view({offset.size(1), offset.size(2), offset.size(3)}); + gradOffset = + gradOffset.view({offset.size(1), offset.size(2), offset.size(3)}); + } + + return 1; +} + +int deform_conv_backward_parameters_cuda( + at::Tensor input, + at::Tensor offset, + at::Tensor gradOutput, + at::Tensor gradWeight, // at::Tensor gradBias, + at::Tensor columns, + at::Tensor ones, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + int group, + int deformable_group, + float scale, + int im2col_step) { + // todo: transpose and reshape outGrad + // todo: reshape columns + // todo: add im2col_step as input + + shape_check( + input, + offset, + &gradOutput, + gradWeight, + kH, + kW, + dH, + dW, + padH, + padW, + dilationH, + dilationW, + group, + deformable_group); + + input = input.contiguous(); + offset = offset.contiguous(); + gradOutput = gradOutput.contiguous(); + + int batch = 1; + + if (input.ndimension() == 3) { + // Force batch + batch = 0; + input = input.view( + at::IntList({1, input.size(0), input.size(1), input.size(2)})); + gradOutput = gradOutput.view( + {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)}); + } + + long batchSize = input.size(0); + long 
nInputPlane = input.size(1); + long inputHeight = input.size(2); + long inputWidth = input.size(3); + + long nOutputPlane = gradWeight.size(0); + + long outputWidth = + (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = + (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + + columns = at::zeros( + {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, + input.options()); + + gradOutput = gradOutput.view({batchSize / im2col_step, + im2col_step, + nOutputPlane, + outputHeight, + outputWidth}); + gradOutput.transpose_(1, 2); + + at::Tensor gradOutputBuffer = at::zeros_like(gradOutput); + gradOutputBuffer = gradOutputBuffer.view({batchSize / im2col_step, + nOutputPlane, + im2col_step, + outputHeight, + outputWidth}); + gradOutputBuffer.copy_(gradOutput); + // gradOutput is not contiguous, so we do reshape (instead of view) next + gradOutputBuffer = gradOutputBuffer.reshape({batchSize / im2col_step, + nOutputPlane, + im2col_step * outputHeight, + outputWidth}); + + gradOutput.transpose_(1, 2); + gradOutput = + gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth}); + + input = input.view({batchSize / im2col_step, + im2col_step, + nInputPlane, + inputHeight, + inputWidth}); + offset = offset.view({batchSize / im2col_step, + im2col_step, + deformable_group * 2 * kH * kW, + outputHeight, + outputWidth}); + + for (int elt = 0; elt < batchSize / im2col_step; elt++) { + deformable_im2col( + input[elt], + offset[elt], + nInputPlane, + inputHeight, + inputWidth, + kH, + kW, + padH, + padW, + dH, + dW, + dilationH, + dilationW, + im2col_step, + deformable_group, + columns); + + // divide into group + gradOutputBuffer = gradOutputBuffer.view({gradOutputBuffer.size(0), + group, + gradOutputBuffer.size(1) / group, + gradOutputBuffer.size(2), + gradOutputBuffer.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + gradWeight = gradWeight.view({group, + gradWeight.size(0) / group, + gradWeight.size(1), + gradWeight.size(2), + gradWeight.size(3)}); + + for (int g = 0; g < group; g++) { + gradWeight[g] = gradWeight[g] + .flatten(1) + .addmm_( + gradOutputBuffer[elt][g].flatten(1), + columns[g].transpose(1, 0), + 1.0, + scale) + .view_as(gradWeight[g]); + } + gradOutputBuffer = gradOutputBuffer.view( + {gradOutputBuffer.size(0), + gradOutputBuffer.size(1) * gradOutputBuffer.size(2), + gradOutputBuffer.size(3), + gradOutputBuffer.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1), + gradWeight.size(2), + gradWeight.size(3), + gradWeight.size(4)}); + } + + input = input.view({batchSize, nInputPlane, inputHeight, inputWidth}); + offset = offset.view( + {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth}); + + if (batch == 0) { + gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth}); + input = input.view({nInputPlane, inputHeight, inputWidth}); + } + + return 1; +} + +void modulated_deform_conv_cuda_forward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor output, + at::Tensor columns, + int kernel_h, + int kernel_w, + const int stride_h, + const int stride_w, + const int pad_h, + const int pad_w, + const int dilation_h, + const int dilation_w, + const int group, + const int deformable_group, + 
const bool with_bias) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_out = weight.size(0); + const int channels_kernel = weight.size(1); + const int kernel_h_ = weight.size(2); + const int kernel_w_ = weight.size(3); + + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR( + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, + kernel_w, + kernel_h_, + kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR( + "Input shape and kernel channels wont match: (%d vs %d).", + channels, + channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + // resize output + output = output.view({batch, channels_out, height_out, width_out}).zero_(); + // resize temporary columns + columns = at::zeros( + {channels * kernel_h * kernel_w, 1 * height_out * width_out}, + input.options()); + + output = output.view({output.size(0), + group, + output.size(1) / group, + output.size(2), + output.size(3)}); + + for (int b = 0; b < batch; b++) { + modulated_deformable_im2col_cuda( + input[b], + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + columns); + + // divide into group + weight = weight.view({group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + + for (int g = 0; g < group; g++) { + output[b][g] = output[b][g] + .flatten(1) + .addmm_(weight[g].flatten(1), columns[g]) + .view_as(output[b][g]); + } + + weight = weight.view({weight.size(0) * weight.size(1), + weight.size(2), + weight.size(3), + weight.size(4)}); + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + } + + output = output.view({output.size(0), + output.size(1) * output.size(2), + output.size(3), + output.size(4)}); + + if (with_bias) { + output += bias.view({1, bias.size(0), 1, 1}); + } +} + +void modulated_deform_conv_cuda_backward( + at::Tensor input, + at::Tensor weight, + at::Tensor bias, + at::Tensor ones, + at::Tensor offset, + at::Tensor mask, + at::Tensor columns, + at::Tensor grad_input, + at::Tensor grad_weight, + at::Tensor grad_bias, + at::Tensor grad_offset, + at::Tensor grad_mask, + at::Tensor grad_output, + int kernel_h, + int kernel_w, + int stride_h, + int stride_w, + int pad_h, + int pad_w, + int dilation_h, + int dilation_w, + int group, + int deformable_group, + const bool with_bias) { + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + + const int batch = input.size(0); + const int channels = input.size(1); + const int height = input.size(2); + const int width = input.size(3); + + const int channels_kernel = weight.size(1); + const int kernel_h_ = 
weight.size(2); + const int kernel_w_ = weight.size(3); + if (kernel_h_ != kernel_h || kernel_w_ != kernel_w) + AT_ERROR( + "Input shape and kernel shape wont match: (%d x %d vs %d x %d).", + kernel_h_, + kernel_w, + kernel_h_, + kernel_w_); + if (channels != channels_kernel * group) + AT_ERROR( + "Input shape and kernel channels wont match: (%d vs %d).", + channels, + channels_kernel * group); + + const int height_out = + (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_out = + (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + + if (ones.ndimension() != 2 || + ones.size(0) * ones.size(1) < height_out * width_out) { + // Resize plane and fill with ones... + ones = at::ones({height_out, width_out}, input.options()); + } + + grad_input = grad_input.view({batch, channels, height, width}); + columns = at::zeros( + {channels * kernel_h * kernel_w, height_out * width_out}, + input.options()); + + grad_output = grad_output.view({grad_output.size(0), + group, + grad_output.size(1) / group, + grad_output.size(2), + grad_output.size(3)}); + + for (int b = 0; b < batch; b++) { + // divide int group + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + weight = weight.view({group, + weight.size(0) / group, + weight.size(1), + weight.size(2), + weight.size(3)}); + + for (int g = 0; g < group; g++) { + columns[g].addmm_( + weight[g].flatten(1).transpose(0, 1), + grad_output[b][g].flatten(1), + 0.0f, + 1.0f); + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + weight = weight.view({weight.size(0) * weight.size(1), + weight.size(2), + weight.size(3), + weight.size(4)}); + + // gradient w.r.t. input coordinate data + modulated_deformable_col2im_coord_cuda( + columns, + input[b], + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + grad_offset[b], + grad_mask[b]); + // gradient w.r.t. input data + modulated_deformable_col2im_cuda( + columns, + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + grad_input[b]); + + // gradient w.r.t. 
weight, dWeight should accumulate across the batch and + // group + modulated_deformable_im2col_cuda( + input[b], + offset[b], + mask[b], + 1, + channels, + height, + width, + height_out, + width_out, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + deformable_group, + columns); + + columns = columns.view({group, columns.size(0) / group, columns.size(1)}); + grad_weight = grad_weight.view({group, + grad_weight.size(0) / group, + grad_weight.size(1), + grad_weight.size(2), + grad_weight.size(3)}); + if (with_bias) + grad_bias = grad_bias.view({group, grad_bias.size(0) / group}); + + for (int g = 0; g < group; g++) { + grad_weight[g] = + grad_weight[g] + .flatten(1) + .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1)) + .view_as(grad_weight[g]); + if (with_bias) { + grad_bias[g] = + grad_bias[g] + .view({-1, 1}) + .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1})) + .view(-1); + } + } + + columns = + columns.view({columns.size(0) * columns.size(1), columns.size(2)}); + grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1), + grad_weight.size(2), + grad_weight.size(3), + grad_weight.size(4)}); + if (with_bias) + grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)}); + } + grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1), + grad_output.size(2), + grad_output.size(3), + grad_output.size(4)}); +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu b/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..841f3166c902e7f1c17fe58137d42a58e4f66d69 --- /dev/null +++ b/detectron2/layers/csrc/deformable/deform_conv_cuda_kernel.cu @@ -0,0 +1,1288 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +// modified from +// https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu +// Original license: Apache 2.0 +// clang-format off + +// modify from +// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu + +/*! + ******************* BEGIN Caffe Copyright Notice and Disclaimer ***************** + * + * COPYRIGHT + * + * All contributions by the University of California: + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * + * All other contributions: + * Copyright (c) 2014-2017, the respective contributors + * All rights reserved. + * + * Caffe uses a shared copyright model: each contributor holds copyright over + * their contributions to Caffe. The project versioning records all such + * contribution and copyright details. If a contributor wants to further mark + * their specific copyright on a particular contribution, they should indicate + * their copyright solely in the commit message of the change when it is + * committed. + * + * LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * CONTRIBUTION AGREEMENT + * + * By contributing to the BVLC/caffe repository through pull-request, comment, + * or otherwise, the contributor releases their content to the + * license and copyright terms herein. + * + ***************** END Caffe Copyright Notice and Disclaimer ********************* + * + * Copyright (c) 2018 Microsoft + * Licensed under The MIT License [see LICENSE for details] + * \file modulated_deformable_im2col.cuh + * \brief Function definitions of converting an image to + * column matrix based on kernel, padding, dilation, and offset. + * These functions are mainly used in deformable convolution operators. + * \ref: https://arxiv.org/abs/1703.06211 + * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng + */ + +#include +#include +#include +#include +#include +#include + +using namespace at; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + + +namespace { + +const int CUDA_NUM_THREADS = 1024; +const int kMaxGridNum = 65535; + +inline int GET_BLOCKS(const int N) { + return std::min(kMaxGridNum, (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS); +} + +} + +template +__device__ scalar_t deformable_im2col_bilinear( + const scalar_t* bottom_data, + const int data_width, + const int height, + const int width, + scalar_t h, + scalar_t w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t get_gradient_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int h, + const int w, + const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == 
argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t get_coordinate_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int height, + const int width, + const scalar_t* im_data, + const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void deformable_im2col_gpu_kernel( + const int n, + const scalar_t* data_im, + const scalar_t* data_offset, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* data_col) { + CUDA_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + scalar_t* data_col_ptr = data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + // const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * + // height + h_in) * width + w_in; + const scalar_t* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t* data_offset_ptr = data_offset + + 
(b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t val = static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + // const scalar_t map_h = i * dilation_h + offset_h; + // const scalar_t map_w = j * dilation_w + offset_w; + // const int cur_height = height - h_in; + // const int cur_width = width - w_in; + // val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, + // cur_width, map_h, map_w); + val = deformable_im2col_bilinear( + data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + + +template +__global__ void deformable_col2im_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_offset, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_im) { + CUDA_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index]; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = get_gradient_weight( + cur_inv_h_data, + cur_inv_w_data, + 
cur_h + dy, + cur_w + dx, + height, + width); + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + + +template +__global__ void deformable_col2im_coord_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_im, + const scalar_t* data_offset, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int offset_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_offset) { + CUDA_KERNEL_LOOP(index, n) { + scalar_t val = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t* data_col_ptr = data_col + + deformable_group_index * channel_per_deformable_group * batch_size * + width_col * height_col; + const scalar_t* data_im_ptr = data_im + + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } + const scalar_t weight = get_coordinate_weight( + inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + + grad_offset[index] = val; + } +} + + +namespace detectron2 { + +void deformable_im2col( + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor data_col) { + // num_axes should be smaller than block size + // todo: check parallel_imgs is correctly passed in + 
int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = channels * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + at::cuda::CUDAGuard device_guard(data_im.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "deformable_im2col_gpu", ([&] { + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + scalar_t* data_col_ = data_col.data_ptr(); + + deformable_im2col_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_im_, + data_offset_, + height, + width, + ksize_h, + ksize_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + parallel_imgs, + channels, + deformable_group, + height_col, + width_col, + data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in deformable_im2col: %s\n", cudaGetErrorString(err)); + } +} + + +void deformable_col2im( + const at::Tensor data_col, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_im) { + // todo: make sure parallel_imgs is passed in correctly + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int num_kernels = + channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs; + int channel_per_deformable_group = channels / deformable_group; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + scalar_t* grad_im_ = grad_im.data_ptr(); + + deformable_col2im_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_offset_, + channels, + height, + width, + ksize_h, + ksize_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + parallel_imgs, + deformable_group, + height_col, + width_col, + grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf("error in deformable_col2im: %s\n", cudaGetErrorString(err)); + } +} + + +void deformable_col2im_coord( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const int channels, + const int height, + const int width, + const int ksize_h, + const int ksize_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int parallel_imgs, + const int deformable_group, + at::Tensor grad_offset) { + int height_col = + (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1; + int width_col = + (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1; + int 
num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * + deformable_group * parallel_imgs; + int channel_per_deformable_group = + channels * ksize_h * ksize_w / deformable_group; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + scalar_t* grad_offset_ = grad_offset.data_ptr(); + + deformable_col2im_coord_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_im_, + data_offset_, + channels, + height, + width, + ksize_h, + ksize_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + parallel_imgs, + 2 * ksize_h * ksize_w * deformable_group, + deformable_group, + height_col, + width_col, + grad_offset_); + })); +} + +} // namespace detectron2 + + +template +__device__ scalar_t dmcn_im2col_bilinear( + const scalar_t* bottom_data, + const int data_width, + const int height, + const int width, + scalar_t h, + scalar_t w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + scalar_t lh = h - h_low; + scalar_t lw = w - w_low; + scalar_t hh = 1 - lh, hw = 1 - lw; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + v1 = bottom_data[h_low * data_width + w_low]; + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__device__ scalar_t dmcn_get_gradient_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int h, + const int w, + const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__device__ scalar_t dmcn_get_coordinate_weight( + scalar_t argmax_h, + scalar_t argmax_w, + const int height, + const int width, + const scalar_t* im_data, + const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + scalar_t weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low 
+ 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +template +__global__ void modulated_deformable_im2col_gpu_kernel( + const int n, + const scalar_t* data_im, + const scalar_t* data_offset, + const scalar_t* data_mask, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int num_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* data_col) { + CUDA_KERNEL_LOOP(index, n) { + // index index of output matrix + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + scalar_t* data_col_ptr = data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + // const float* data_im_ptr = data_im + ((b_col * num_channels + c_im) * + // height + h_in) * width + w_in; + const scalar_t* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const scalar_t* data_offset_ptr = data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + + const scalar_t* data_mask_ptr = data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t val = 
static_cast(0); + const scalar_t h_im = h_in + i * dilation_h + offset_h; + const scalar_t w_im = w_in + j * dilation_w + offset_w; + // if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) { + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + // const float map_h = i * dilation_h + offset_h; + // const float map_w = j * dilation_w + offset_w; + // const int cur_height = height - h_in; + // const int cur_width = width - w_in; + // val = dmcn_im2col_bilinear(data_im_ptr, width, cur_height, + // cur_width, map_h, map_w); + val = dmcn_im2col_bilinear( + data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + // data_col_ptr += height_col * width_col; + } + } + } +} + +template +__global__ void modulated_deformable_col2im_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_offset, + const scalar_t* data_mask, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_im) { + CUDA_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const scalar_t* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * + height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h; + const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const scalar_t cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + scalar_t weight = dmcn_get_gradient_weight( + cur_inv_h_data, + cur_inv_w_data, + cur_h + dy, + cur_w + dx, + height, + width); + atomicAdd(grad_im + cur_bottom_grad_pos, 
weight * cur_top_grad); + } + } + } + } +} + +template +__global__ void modulated_deformable_col2im_coord_gpu_kernel( + const int n, + const scalar_t* data_col, + const scalar_t* data_im, + const scalar_t* data_offset, + const scalar_t* data_mask, + const int channels, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int channel_per_deformable_group, + const int batch_size, + const int offset_channels, + const int deformable_group, + const int height_col, + const int width_col, + scalar_t* grad_offset, + scalar_t* grad_mask) { + CUDA_KERNEL_LOOP(index, n) { + scalar_t val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const scalar_t* data_col_ptr = data_col + + deformable_group_index * channel_per_deformable_group * batch_size * + width_col * height_col; + const scalar_t* data_im_ptr = data_im + + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * height * width; + const scalar_t* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const scalar_t* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * + height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr]; + const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr]; + const scalar_t mask = data_mask_ptr[data_mask_hw_ptr]; + scalar_t inv_h = h_in + i * dilation_h + offset_h; + scalar_t inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + dmcn_im2col_bilinear( + data_im_ptr + cnt * height * width, + width, + height, + width, + inv_h, + inv_w); + } + const scalar_t weight = dmcn_get_coordinate_weight( + inv_h, + inv_w, + height, + width, + data_im_ptr + cnt * height * width, + width, + bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + grad_offset[index] = val; + if (offset_c % 2 == 0) + // 
KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + grad_mask + [(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + + +namespace detectron2 { + +void modulated_deformable_im2col_cuda( + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kenerl_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor data_col) { + // num_axes should be smaller than block size + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = channels * batch_size * height_col * width_col; + + at::cuda::CUDAGuard device_guard(data_im.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + const scalar_t* data_mask_ = data_mask.data_ptr(); + scalar_t* data_col_ = data_col.data_ptr(); + + modulated_deformable_im2col_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_im_, + data_offset_, + data_mask_, + height_im, + width_im, + kernel_h, + kenerl_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + channels, + deformable_group, + height_col, + width_col, + data_col_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf( + "error in modulated_deformable_im2col_cuda: %s\n", + cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_cuda( + const at::Tensor data_col, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor grad_im) { + const int channel_per_deformable_group = channels / deformable_group; + const int num_kernels = + channels * kernel_h * kernel_w * batch_size * height_col * width_col; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + const scalar_t* data_mask_ = data_mask.data_ptr(); + scalar_t* grad_im_ = grad_im.data_ptr(); + + modulated_deformable_col2im_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_offset_, + data_mask_, + channels, + height_im, + width_im, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + deformable_group, + height_col, + 
width_col, + grad_im_); + })); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf( + "error in modulated_deformable_col2im_cuda: %s\n", + cudaGetErrorString(err)); + } +} + +void modulated_deformable_col2im_coord_cuda( + const at::Tensor data_col, + const at::Tensor data_im, + const at::Tensor data_offset, + const at::Tensor data_mask, + const int batch_size, + const int channels, + const int height_im, + const int width_im, + const int height_col, + const int width_col, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int dilation_h, + const int dilation_w, + const int deformable_group, + at::Tensor grad_offset, + at::Tensor grad_mask) { + const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * + kernel_w * deformable_group; + const int channel_per_deformable_group = + channels * kernel_h * kernel_w / deformable_group; + + at::cuda::CUDAGuard device_guard(data_col.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { + const scalar_t* data_col_ = data_col.data_ptr(); + const scalar_t* data_im_ = data_im.data_ptr(); + const scalar_t* data_offset_ = data_offset.data_ptr(); + const scalar_t* data_mask_ = data_mask.data_ptr(); + scalar_t* grad_offset_ = grad_offset.data_ptr(); + scalar_t* grad_mask_ = grad_mask.data_ptr(); + + modulated_deformable_col2im_coord_gpu_kernel<<< + GET_BLOCKS(num_kernels), + CUDA_NUM_THREADS, + 0, + stream>>>( + num_kernels, + data_col_, + data_im_, + data_offset_, + data_mask_, + channels, + height_im, + width_im, + kernel_h, + kernel_w, + pad_h, + pad_w, + stride_h, + stride_w, + dilation_h, + dilation_w, + channel_per_deformable_group, + batch_size, + 2 * kernel_h * kernel_w * deformable_group, + deformable_group, + height_col, + width_col, + grad_offset_, + grad_mask_); + })); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + printf( + "error in modulated_deformable_col2im_coord_cuda: %s\n", + cudaGetErrorString(err)); + } +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated.h b/detectron2/layers/csrc/nms_rotated/nms_rotated.h new file mode 100644 index 0000000000000000000000000000000000000000..9c86c8d55cd24fb5322657b9d2f676fc3e1373ba --- /dev/null +++ b/detectron2/layers/csrc/nms_rotated/nms_rotated.h @@ -0,0 +1,39 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#pragma once +#include + +namespace detectron2 { + +at::Tensor nms_rotated_cpu( + const at::Tensor& dets, + const at::Tensor& scores, + const float iou_threshold); + +#ifdef WITH_CUDA +at::Tensor nms_rotated_cuda( + const at::Tensor& dets, + const at::Tensor& scores, + const float iou_threshold); +#endif + +// Interface for Python +// inline is needed to prevent multiple function definitions when this header is +// included by different cpps +inline at::Tensor nms_rotated( + const at::Tensor& dets, + const at::Tensor& scores, + const float iou_threshold) { + assert(dets.device().is_cuda() == scores.device().is_cuda()); + if (dets.device().is_cuda()) { +#ifdef WITH_CUDA + return nms_rotated_cuda( + dets.contiguous(), scores.contiguous(), iou_threshold); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + + return nms_rotated_cpu(dets.contiguous(), scores.contiguous(), iou_threshold); +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp b/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0658e388df005748c358dcbf3a1ad2a59da6cac8 --- /dev/null +++ b/detectron2/layers/csrc/nms_rotated/nms_rotated_cpu.cpp @@ -0,0 +1,75 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include "../box_iou_rotated/box_iou_rotated_utils.h" +#include "nms_rotated.h" + +namespace detectron2 { + +template +at::Tensor nms_rotated_cpu_kernel( + const at::Tensor& dets, + const at::Tensor& scores, + const float iou_threshold) { + // nms_rotated_cpu_kernel is modified from torchvision's nms_cpu_kernel, + // however, the code in this function is much shorter because + // we delegate the IoU computation for rotated boxes to + // the single_box_iou_rotated function in box_iou_rotated_utils.h + AT_ASSERTM(dets.device().is_cpu(), "dets must be a CPU tensor"); + AT_ASSERTM(scores.device().is_cpu(), "scores must be a CPU tensor"); + AT_ASSERTM( + dets.scalar_type() == scores.scalar_type(), + "dets should have the same type as scores"); + + if (dets.numel() == 0) { + return at::empty({0}, dets.options().dtype(at::kLong)); + } + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + + auto ndets = dets.size(0); + at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte)); + at::Tensor keep_t = at::zeros({ndets}, dets.options().dtype(at::kLong)); + + auto suppressed = suppressed_t.data_ptr(); + auto keep = keep_t.data_ptr(); + auto order = order_t.data_ptr(); + + int64_t num_to_keep = 0; + + for (int64_t _i = 0; _i < ndets; _i++) { + auto i = order[_i]; + if (suppressed[i] == 1) { + continue; + } + + keep[num_to_keep++] = i; + + for (int64_t _j = _i + 1; _j < ndets; _j++) { + auto j = order[_j]; + if (suppressed[j] == 1) { + continue; + } + + auto ovr = single_box_iou_rotated( + dets[i].data_ptr(), dets[j].data_ptr()); + if (ovr >= iou_threshold) { + suppressed[j] = 1; + } + } + } + return keep_t.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep); +} + +at::Tensor nms_rotated_cpu( + // input must be contiguous + const at::Tensor& dets, + const at::Tensor& scores, + const float iou_threshold) { + auto result = at::empty({0}, dets.options()); + + AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms_rotated", [&] { + result = nms_rotated_cpu_kernel(dets, scores, iou_threshold); + }); + return result; +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu 
b/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..40977a0da1761fe807205fbcf8029d56bf75786c --- /dev/null +++ b/detectron2/layers/csrc/nms_rotated/nms_rotated_cuda.cu @@ -0,0 +1,139 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include +#include +#include +#include +#include "../box_iou_rotated/box_iou_rotated_utils.h" + +using namespace detectron2; + +namespace { +int const threadsPerBlock = sizeof(unsigned long long) * 8; +} + +template +__global__ void nms_rotated_cuda_kernel( + const int n_boxes, + const float iou_threshold, + const T* dev_boxes, + unsigned long long* dev_mask) { + // nms_rotated_cuda_kernel is modified from torchvision's nms_cuda_kernel + + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + // Compared to nms_cuda_kernel, where each box is represented with 4 values + // (x1, y1, x2, y2), each rotated box is represented with 5 values + // (x_center, y_center, width, height, angle_degrees) here. + __shared__ T block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const T* cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + // Instead of devIoU used by original horizontal nms, here + // we use the single_box_iou_rotated function from box_iou_rotated_utils.h + if (single_box_iou_rotated(cur_box, block_boxes + i * 5) > + iou_threshold) { + t |= 1ULL << i; + } + } + const int col_blocks = at::cuda::ATenCeilDiv(n_boxes, threadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +namespace detectron2 { + +at::Tensor nms_rotated_cuda( + // input must be contiguous + const at::Tensor& dets, + const at::Tensor& scores, + float iou_threshold) { + // using scalar_t = float; + AT_ASSERTM(dets.is_cuda(), "dets must be a CUDA tensor"); + AT_ASSERTM(scores.is_cuda(), "scores must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(dets.device()); + + auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); + auto dets_sorted = dets.index_select(0, order_t); + + auto dets_num = dets.size(0); + + const int col_blocks = + at::cuda::ATenCeilDiv(static_cast(dets_num), threadsPerBlock); + + at::Tensor mask = + at::empty({dets_num * col_blocks}, dets.options().dtype(at::kLong)); + + dim3 blocks(col_blocks, col_blocks); + dim3 threads(threadsPerBlock); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + AT_DISPATCH_FLOATING_TYPES( + dets_sorted.scalar_type(), "nms_rotated_kernel_cuda", [&] { + nms_rotated_cuda_kernel<<>>( 
+ dets_num, + iou_threshold, + dets_sorted.data_ptr(), + (unsigned long long*)mask.data_ptr()); + }); + + at::Tensor mask_cpu = mask.to(at::kCPU); + unsigned long long* mask_host = + (unsigned long long*)mask_cpu.data_ptr(); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); + + at::Tensor keep = + at::empty({dets_num}, dets.options().dtype(at::kLong).device(at::kCPU)); + int64_t* keep_out = keep.data_ptr(); + + int num_to_keep = 0; + for (int i = 0; i < dets_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + keep_out[num_to_keep++] = i; + unsigned long long* p = mask_host + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + + AT_CUDA_CHECK(cudaGetLastError()); + return order_t.index( + {keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep) + .to(order_t.device(), keep.scalar_type())}); +} + +} // namespace detectron2 diff --git a/detectron2/layers/csrc/vision.cpp b/detectron2/layers/csrc/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fa7942e881af704d33a79e8b2ecd1ac5b6f3a7ef --- /dev/null +++ b/detectron2/layers/csrc/vision.cpp @@ -0,0 +1,102 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +#include +#include "ROIAlign/ROIAlign.h" +#include "ROIAlignRotated/ROIAlignRotated.h" +#include "box_iou_rotated/box_iou_rotated.h" +#include "deformable/deform_conv.h" +#include "nms_rotated/nms_rotated.h" + +namespace detectron2 { + +#ifdef WITH_CUDA +extern int get_cudart_version(); +#endif + +std::string get_cuda_version() { +#ifdef WITH_CUDA + std::ostringstream oss; + + // copied from + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 + auto printCudaStyleVersion = [&](int v) { + oss << (v / 1000) << "." << (v / 10 % 100); + if (v % 10 != 0) { + oss << "." << (v % 10); + } + }; + printCudaStyleVersion(get_cudart_version()); + return oss.str(); +#else + return std::string("not available"); +#endif +} + +// similar to +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp +std::string get_compiler_version() { + std::ostringstream ss; +#if defined(__GNUC__) +#ifndef __clang__ + +#if ((__GNUC__ <= 4) && (__GNUC_MINOR__ <= 8)) +#error "GCC >= 4.9 is required!" +#endif + + { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } +#endif +#endif + +#if defined(__clang_major__) + { + ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
+ << __clang_patchlevel__; + } +#endif + +#if defined(_MSC_VER) + { ss << "MSVC " << _MSC_FULL_VER; } +#endif + return ss.str(); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("get_compiler_version", &get_compiler_version, "get_compiler_version"); + m.def("get_cuda_version", &get_cuda_version, "get_cuda_version"); + + m.def("box_iou_rotated", &box_iou_rotated, "IoU for rotated boxes"); + + m.def("deform_conv_forward", &deform_conv_forward, "deform_conv_forward"); + m.def( + "deform_conv_backward_input", + &deform_conv_backward_input, + "deform_conv_backward_input"); + m.def( + "deform_conv_backward_filter", + &deform_conv_backward_filter, + "deform_conv_backward_filter"); + m.def( + "modulated_deform_conv_forward", + &modulated_deform_conv_forward, + "modulated_deform_conv_forward"); + m.def( + "modulated_deform_conv_backward", + &modulated_deform_conv_backward, + "modulated_deform_conv_backward"); + + m.def("nms_rotated", &nms_rotated, "NMS for rotated boxes"); + + m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); + m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); + + m.def( + "roi_align_rotated_forward", + &ROIAlignRotated_forward, + "Forward pass for Rotated ROI-Align Operator"); + m.def( + "roi_align_rotated_backward", + &ROIAlignRotated_backward, + "Backward pass for Rotated ROI-Align Operator"); +} + +} // namespace detectron2 diff --git a/detectron2/layers/deform_conv.py b/detectron2/layers/deform_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..ba8c6498ffdfffa281e1f02037d40cbbb6e66164 --- /dev/null +++ b/detectron2/layers/deform_conv.py @@ -0,0 +1,494 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +from functools import lru_cache +import torch +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from detectron2 import _C + +from .wrappers import _NewEmptyTensorOp + + +class _DeformConv(Function): + @staticmethod + def forward( + ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=64, + ): + if input is not None and input.dim() != 4: + raise ValueError( + "Expected 4D tensor as input, got {}D tensor instead.".format(input.dim()) + ) + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.im2col_step = im2col_step + + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + _DeformConv._output_size(input, weight, ctx.padding, ctx.dilation, ctx.stride) + ) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + if not input.is_cuda: + raise NotImplementedError + else: + cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step) + assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize" + + _C.deform_conv_forward( + input, + weight, + offset, + output, + ctx.bufs_[0], + ctx.bufs_[1], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + cur_im2col_step, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + if not grad_output.is_cuda: + 
raise NotImplementedError + else: + cur_im2col_step = _DeformConv._cal_im2col_step(input.shape[0], ctx.im2col_step) + assert (input.shape[0] % cur_im2col_step) == 0, "im2col step must divide batchsize" + + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + _C.deform_conv_backward_input( + input, + offset, + grad_output, + grad_input, + grad_offset, + weight, + ctx.bufs_[0], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + cur_im2col_step, + ) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + _C.deform_conv_backward_filter( + input, + offset, + grad_output, + grad_weight, + ctx.bufs_[0], + ctx.bufs_[1], + weight.size(3), + weight.size(2), + ctx.stride[1], + ctx.stride[0], + ctx.padding[1], + ctx.padding[0], + ctx.dilation[1], + ctx.dilation[0], + ctx.groups, + ctx.deformable_groups, + 1, + cur_im2col_step, + ) + + return grad_input, grad_offset, grad_weight, None, None, None, None, None, None + + @staticmethod + def _output_size(input, weight, padding, dilation, stride): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = padding[d] + kernel = dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1,) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + "convolution input is too small (output would be {})".format( + "x".join(map(str, output_size)) + ) + ) + return output_size + + @staticmethod + @lru_cache(maxsize=128) + def _cal_im2col_step(input_size, default_size): + """ + Calculate proper im2col step size, which should be divisible by input_size and not larger + than prefer_size. Meanwhile the step size should be as large as possible to be more + efficient. So we choose the largest one among all divisors of input_size which are smaller + than prefer_size. + :param input_size: input batch size . + :param default_size: default preferred im2col step size. + :return: the largest proper step size. 
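+
+        Illustrative examples (derived from the selection logic below; this is an
+        internal helper, not public API)::
+
+            >>> _DeformConv._cal_im2col_step(64, 64)
+            64
+            >>> _DeformConv._cal_im2col_step(100, 64)
+            50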
+ """ + if input_size <= default_size: + return input_size + best_step = 1 + for step in range(2, min(int(math.sqrt(input_size)) + 1, default_size)): + if input_size % step == 0: + if input_size // step <= default_size: + return input_size // step + best_step = step + + return best_step + + +class _ModulatedDeformConv(Function): + @staticmethod + def forward( + ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + ): + ctx.stride = stride + ctx.padding = padding + ctx.dilation = dilation + ctx.groups = groups + ctx.deformable_groups = deformable_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(1) # fake tensor + if not input.is_cuda: + raise NotImplementedError + if ( + weight.requires_grad + or mask.requires_grad + or offset.requires_grad + or input.requires_grad + ): + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty(_ModulatedDeformConv._infer_shape(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + _C.modulated_deform_conv_forward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + output, + ctx._bufs[1], + weight.shape[2], + weight.shape[3], + ctx.stride, + ctx.stride, + ctx.padding, + ctx.padding, + ctx.dilation, + ctx.dilation, + ctx.groups, + ctx.deformable_groups, + ctx.with_bias, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + if not grad_output.is_cuda: + raise NotImplementedError + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + _C.modulated_deform_conv_backward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + ctx._bufs[1], + grad_input, + grad_weight, + grad_bias, + grad_offset, + grad_mask, + grad_output, + weight.shape[2], + weight.shape[3], + ctx.stride, + ctx.stride, + ctx.padding, + ctx.padding, + ctx.dilation, + ctx.dilation, + ctx.groups, + ctx.deformable_groups, + ctx.with_bias, + ) + if not ctx.with_bias: + grad_bias = None + + return ( + grad_input, + grad_offset, + grad_mask, + grad_weight, + grad_bias, + None, + None, + None, + None, + None, + ) + + @staticmethod + def _infer_shape(ctx, input, weight): + n = input.size(0) + channels_out = weight.size(0) + height, width = input.shape[2:4] + kernel_h, kernel_w = weight.shape[2:4] + height_out = ( + height + 2 * ctx.padding - (ctx.dilation * (kernel_h - 1) + 1) + ) // ctx.stride + 1 + width_out = ( + width + 2 * ctx.padding - (ctx.dilation * (kernel_w - 1) + 1) + ) // ctx.stride + 1 + return n, channels_out, height_out, width_out + + +deform_conv = _DeformConv.apply +modulated_deform_conv = _ModulatedDeformConv.apply + + +class DeformConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=False, + norm=None, + activation=None, + ): + """ + Deformable convolution from :paper:`deformconv`. + + Arguments are similar to :class:`Conv2D`. Extra arguments: + + Args: + deformable_groups (int): number of groups used in deformable convolution. 
+ norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + """ + super(DeformConv, self).__init__() + + assert not bias + assert in_channels % groups == 0, "in_channels {} cannot be divisible by groups {}".format( + in_channels, groups + ) + assert ( + out_channels % groups == 0 + ), "out_channels {} cannot be divisible by groups {}".format(out_channels, groups) + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deformable_groups = deformable_groups + self.norm = norm + self.activation = activation + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, *self.kernel_size) + ) + self.bias = None + + nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") + + def forward(self, x, offset): + if x.numel() == 0: + # When input is empty, we want to return a empty tensor with "correct" shape, + # So that the following operations will not panic + # if they check for the shape of the tensor. + # This computes the height and width of the output tensor + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // s + 1 + for i, p, di, k, s in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + x = deform_conv( + x, + offset, + self.weight, + self.stride, + self.padding, + self.dilation, + self.groups, + self.deformable_groups, + ) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + def extra_repr(self): + tmpstr = "in_channels=" + str(self.in_channels) + tmpstr += ", out_channels=" + str(self.out_channels) + tmpstr += ", kernel_size=" + str(self.kernel_size) + tmpstr += ", stride=" + str(self.stride) + tmpstr += ", padding=" + str(self.padding) + tmpstr += ", dilation=" + str(self.dilation) + tmpstr += ", groups=" + str(self.groups) + tmpstr += ", deformable_groups=" + str(self.deformable_groups) + tmpstr += ", bias=False" + return tmpstr + + +class ModulatedDeformConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + bias=True, + norm=None, + activation=None, + ): + """ + Modulated deformable convolution from :paper:`deformconv2`. + + Arguments are similar to :class:`Conv2D`. Extra arguments: + + Args: + deformable_groups (int): number of groups used in deformable convolution. 
+ norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + """ + super(ModulatedDeformConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = stride + self.padding = padding + self.dilation = dilation + self.groups = groups + self.deformable_groups = deformable_groups + self.with_bias = bias + self.norm = norm + self.activation = activation + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, *self.kernel_size) + ) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.bias = None + + nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") + if self.bias is not None: + nn.init.constant_(self.bias, 0) + + def forward(self, x, offset, mask): + if x.numel() == 0: + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // s + 1 + for i, p, di, k, s in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + return _NewEmptyTensorOp.apply(x, output_shape) + + x = modulated_deform_conv( + x, + offset, + mask, + self.weight, + self.bias, + self.stride, + self.padding, + self.dilation, + self.groups, + self.deformable_groups, + ) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + def extra_repr(self): + tmpstr = "in_channels=" + str(self.in_channels) + tmpstr += ", out_channels=" + str(self.out_channels) + tmpstr += ", kernel_size=" + str(self.kernel_size) + tmpstr += ", stride=" + str(self.stride) + tmpstr += ", padding=" + str(self.padding) + tmpstr += ", dilation=" + str(self.dilation) + tmpstr += ", groups=" + str(self.groups) + tmpstr += ", deformable_groups=" + str(self.deformable_groups) + tmpstr += ", bias=" + str(self.with_bias) + return tmpstr diff --git a/detectron2/layers/mask_ops.py b/detectron2/layers/mask_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..0fe115dbbe15c354575c67d7d10f055eab0bdf91 --- /dev/null +++ b/detectron2/layers/mask_ops.py @@ -0,0 +1,248 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +import torch +from PIL import Image +from torch.nn import functional as F + +__all__ = ["paste_masks_in_image"] + + +BYTES_PER_FLOAT = 4 +# TODO: This memory limit may be too much or too little. It would be better to +# determine it based on available resources. +GPU_MEM_LIMIT = 1024 ** 3 # 1 GB memory limit + + +def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True): + """ + Args: + masks: N, 1, H, W + boxes: N, 4 + img_h, img_w (int): + skip_empty (bool): only paste masks within the region that + tightly bound all boxes, and returns the results this region only. + An important optimization for CPU. + + Returns: + if skip_empty == False, a mask of shape (N, img_h, img_w) + if skip_empty == True, a mask of shape (N, h', w'), and the slice + object for the corresponding region. + """ + # On GPU, paste all masks together (up to chunk size) + # by using the entire image to sample the masks + # Compared to pasting them one by one, + # this has more operations but is faster on COCO-scale dataset. 
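+    # A rough sketch of what follows: each box (x0, y0, x1, y1) is mapped onto the
+    # normalized [-1, 1] sampling grid that F.grid_sample expects, so a single
+    # grid_sample call resamples every mask in the chunk onto its image-space
+    # footprint, instead of looping over instances in Python.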
+ device = masks.device + if skip_empty: + x0_int, y0_int = torch.clamp(boxes.min(dim=0).values.floor()[:2] - 1, min=0).to( + dtype=torch.int32 + ) + x1_int = torch.clamp(boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32) + y1_int = torch.clamp(boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32) + else: + x0_int, y0_int = 0, 0 + x1_int, y1_int = img_w, img_h + x0, y0, x1, y1 = torch.split(boxes, 1, dim=1) # each is Nx1 + + N = masks.shape[0] + + img_y = torch.arange(y0_int, y1_int, device=device, dtype=torch.float32) + 0.5 + img_x = torch.arange(x0_int, x1_int, device=device, dtype=torch.float32) + 0.5 + img_y = (img_y - y0) / (y1 - y0) * 2 - 1 + img_x = (img_x - x0) / (x1 - x0) * 2 - 1 + # img_x, img_y have shapes (N, w), (N, h) + + gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1)) + gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1)) + grid = torch.stack([gx, gy], dim=3) + + img_masks = F.grid_sample(masks.to(dtype=torch.float32), grid, align_corners=False) + + if skip_empty: + return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int)) + else: + return img_masks[:, 0], () + + +def paste_masks_in_image(masks, boxes, image_shape, threshold=0.5): + """ + Paste a set of masks that are of a fixed resolution (e.g., 28 x 28) into an image. + The location, height, and width for pasting each mask is determined by their + corresponding bounding boxes in boxes. + + Note: + This is a complicated but more accurate implementation. In actual deployment, it is + often enough to use a faster but less accurate implementation. + See :func:`paste_mask_in_image_old` in this file for an alternative implementation. + + Args: + masks (tensor): Tensor of shape (Bimg, Hmask, Wmask), where Bimg is the number of + detected object instances in the image and Hmask, Wmask are the mask width and mask + height of the predicted mask (e.g., Hmask = Wmask = 28). Values are in [0, 1]. + boxes (Boxes or Tensor): A Boxes of length Bimg or Tensor of shape (Bimg, 4). + boxes[i] and masks[i] correspond to the same object instance. + image_shape (tuple): height, width + threshold (float): A threshold in [0, 1] for converting the (soft) masks to + binary masks. + + Returns: + img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the + number of detected object instances and Himage, Wimage are the image width + and height. img_masks[i] is a binary mask for object instance i. + """ + + assert masks.shape[-1] == masks.shape[-2], "Only square mask predictions are supported" + N = len(masks) + if N == 0: + return masks.new_empty((0,) + image_shape, dtype=torch.uint8) + if not isinstance(boxes, torch.Tensor): + boxes = boxes.tensor + device = boxes.device + assert len(boxes) == N, boxes.shape + + img_h, img_w = image_shape + + # The actual implementation split the input into chunks, + # and paste them chunk by chunk. + if device.type == "cpu": + # CPU is most efficient when they are pasted one by one with skip_empty=True + # so that it performs minimal number of operations. 
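+        # i.e. one instance per chunk; skip_empty then limits each paste to the
+        # tight bounding region of that instance's box.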
+ num_chunks = N + else: + # GPU benefits from parallelism for larger chunks, but may have memory issue + # int(img_h) because shape may be tensors in tracing + num_chunks = int(np.ceil(N * int(img_h) * int(img_w) * BYTES_PER_FLOAT / GPU_MEM_LIMIT)) + assert ( + num_chunks <= N + ), "Default GPU_MEM_LIMIT in mask_ops.py is too small; try increasing it" + chunks = torch.chunk(torch.arange(N, device=device), num_chunks) + + img_masks = torch.zeros( + N, img_h, img_w, device=device, dtype=torch.bool if threshold >= 0 else torch.uint8 + ) + for inds in chunks: + masks_chunk, spatial_inds = _do_paste_mask( + masks[inds, None, :, :], boxes[inds], img_h, img_w, skip_empty=device.type == "cpu" + ) + + if threshold >= 0: + masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool) + else: + # for visualization and debugging + masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8) + + img_masks[(inds,) + spatial_inds] = masks_chunk + return img_masks + + +# The below are the original paste function (from Detectron1) which has +# larger quantization error. +# It is faster on CPU, while the aligned one is faster on GPU thanks to grid_sample. + + +def paste_mask_in_image_old(mask, box, img_h, img_w, threshold): + """ + Paste a single mask in an image. + This is a per-box implementation of :func:`paste_masks_in_image`. + This function has larger quantization error due to incorrect pixel + modeling and is not used any more. + + Args: + mask (Tensor): A tensor of shape (Hmask, Wmask) storing the mask of a single + object instance. Values are in [0, 1]. + box (Tensor): A tensor of shape (4, ) storing the x0, y0, x1, y1 box corners + of the object instance. + img_h, img_w (int): Image height and width. + threshold (float): Mask binarization threshold in [0, 1]. + + Returns: + im_mask (Tensor): + The resized and binarized object mask pasted into the original + image plane (a tensor of shape (img_h, img_w)). + """ + # Conversion from continuous box coordinates to discrete pixel coordinates + # via truncation (cast to int32). This determines which pixels to paste the + # mask onto. + box = box.to(dtype=torch.int32) # Continuous to discrete coordinate conversion + # An example (1D) box with continuous coordinates (x0=0.7, x1=4.3) will map to + # a discrete coordinates (x0=0, x1=4). Note that box is mapped to 5 = x1 - x0 + 1 + # pixels (not x1 - x0 pixels). + samples_w = box[2] - box[0] + 1 # Number of pixel samples, *not* geometric width + samples_h = box[3] - box[1] + 1 # Number of pixel samples, *not* geometric height + + # Resample the mask from it's original grid to the new samples_w x samples_h grid + mask = Image.fromarray(mask.cpu().numpy()) + mask = mask.resize((samples_w, samples_h), resample=Image.BILINEAR) + mask = np.array(mask, copy=False) + + if threshold >= 0: + mask = np.array(mask > threshold, dtype=np.uint8) + mask = torch.from_numpy(mask) + else: + # for visualization and debugging, we also + # allow it to return an unmodified mask + mask = torch.from_numpy(mask * 255).to(torch.uint8) + + im_mask = torch.zeros((img_h, img_w), dtype=torch.uint8) + x_0 = max(box[0], 0) + x_1 = min(box[2] + 1, img_w) + y_0 = max(box[1], 0) + y_1 = min(box[3] + 1, img_h) + + im_mask[y_0:y_1, x_0:x_1] = mask[ + (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0]) + ] + return im_mask + + +# Our pixel modeling requires extrapolation for any continuous +# coordinate < 0.5 or > length - 0.5. 
When sampling pixels on the masks, +# we would like this extrapolation to be an interpolation between boundary values and zero, +# instead of using absolute zero or boundary values. +# Therefore `paste_mask_in_image_old` is often used with zero padding around the masks like this: +# masks, scale = pad_masks(masks[:, 0, :, :], 1) +# boxes = scale_boxes(boxes.tensor, scale) + + +def pad_masks(masks, padding): + """ + Args: + masks (tensor): A tensor of shape (B, M, M) representing B masks. + padding (int): Number of cells to pad on all sides. + + Returns: + The padded masks and the scale factor of the padding size / original size. + """ + B = masks.shape[0] + M = masks.shape[-1] + pad2 = 2 * padding + scale = float(M + pad2) / M + padded_masks = masks.new_zeros((B, M + pad2, M + pad2)) + padded_masks[:, padding:-padding, padding:-padding] = masks + return padded_masks, scale + + +def scale_boxes(boxes, scale): + """ + Args: + boxes (tensor): A tensor of shape (B, 4) representing B boxes with 4 + coords representing the corners x0, y0, x1, y1, + scale (float): The box scaling factor. + + Returns: + Scaled boxes. + """ + w_half = (boxes[:, 2] - boxes[:, 0]) * 0.5 + h_half = (boxes[:, 3] - boxes[:, 1]) * 0.5 + x_c = (boxes[:, 2] + boxes[:, 0]) * 0.5 + y_c = (boxes[:, 3] + boxes[:, 1]) * 0.5 + + w_half *= scale + h_half *= scale + + scaled_boxes = torch.zeros_like(boxes) + scaled_boxes[:, 0] = x_c - w_half + scaled_boxes[:, 2] = x_c + w_half + scaled_boxes[:, 1] = y_c - h_half + scaled_boxes[:, 3] = y_c + h_half + return scaled_boxes diff --git a/detectron2/layers/nms.py b/detectron2/layers/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..aafe29b3aa551caeeda769dd17b8834b08c7f11c --- /dev/null +++ b/detectron2/layers/nms.py @@ -0,0 +1,146 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import torch +from torchvision.ops import boxes as box_ops +from torchvision.ops import nms # BC-compat + + +def batched_nms(boxes, scores, idxs, iou_threshold): + """ + Same as torchvision.ops.boxes.batched_nms, but safer. + """ + assert boxes.shape[-1] == 4 + # TODO may need better strategy. + # Investigate after having a fully-cuda NMS op. + if len(boxes) < 40000: + return box_ops.batched_nms(boxes, scores, idxs, iou_threshold) + + result_mask = scores.new_zeros(scores.size(), dtype=torch.bool) + for id in torch.unique(idxs).cpu().tolist(): + mask = (idxs == id).nonzero().view(-1) + keep = nms(boxes[mask], scores[mask], iou_threshold) + result_mask[mask[keep]] = True + keep = result_mask.nonzero().view(-1) + keep = keep[scores[keep].argsort(descending=True)] + return keep + + +# Note: this function (nms_rotated) might be moved into +# torchvision/ops/boxes.py in the future +def nms_rotated(boxes, scores, iou_threshold): + """ + Performs non-maximum suppression (NMS) on the rotated boxes according + to their intersection-over-union (IoU). + + Rotated NMS iteratively removes lower scoring rotated boxes which have an + IoU greater than iou_threshold with another (higher scoring) rotated box. + + Note that RotatedBox (5, 3, 4, 2, -90) covers exactly the same region as + RotatedBox (5, 3, 4, 2, 90) does, and their IoU will be 1. However, they + can be representing completely different objects in certain tasks, e.g., OCR. + + As for the question of whether rotated-NMS should treat them as faraway boxes + even though their IOU is 1, it depends on the application and/or ground truth annotation. 
+ + As an extreme example, consider a single character v and the square box around it. + + If the angle is 0 degree, the object (text) would be read as 'v'; + + If the angle is 90 degrees, the object (text) would become '>'; + + If the angle is 180 degrees, the object (text) would become '^'; + + If the angle is 270/-90 degrees, the object (text) would become '<' + + All of these cases have IoU of 1 to each other, and rotated NMS that only + uses IoU as criterion would only keep one of them with the highest score - + which, practically, still makes sense in most cases because typically + only one of theses orientations is the correct one. Also, it does not matter + as much if the box is only used to classify the object (instead of transcribing + them with a sequential OCR recognition model) later. + + On the other hand, when we use IoU to filter proposals that are close to the + ground truth during training, we should definitely take the angle into account if + we know the ground truth is labeled with the strictly correct orientation (as in, + upside-down words are annotated with -180 degrees even though they can be covered + with a 0/90/-90 degree box, etc.) + + The way the original dataset is annotated also matters. For example, if the dataset + is a 4-point polygon dataset that does not enforce ordering of vertices/orientation, + we can estimate a minimum rotated bounding box to this polygon, but there's no way + we can tell the correct angle with 100% confidence (as shown above, there could be 4 different + rotated boxes, with angles differed by 90 degrees to each other, covering the exactly + same region). In that case we have to just use IoU to determine the box + proximity (as many detection benchmarks (even for text) do) unless there're other + assumptions we can make (like width is always larger than height, or the object is not + rotated by more than 90 degrees CCW/CW, etc.) + + In summary, not considering angles in rotated NMS seems to be a good option for now, + but we should be aware of its implications. + + Args: + boxes (Tensor[N, 5]): Rotated boxes to perform NMS on. They are expected to be in + (x_center, y_center, width, height, angle_degrees) format. + scores (Tensor[N]): Scores for each one of the rotated boxes + iou_threshold (float): Discards all overlapping rotated boxes with IoU < iou_threshold + + Returns: + keep (Tensor): int64 tensor with the indices of the elements that have been kept + by Rotated NMS, sorted in decreasing order of scores + """ + from detectron2 import _C + + return _C.nms_rotated(boxes, scores, iou_threshold) + + +# Note: this function (batched_nms_rotated) might be moved into +# torchvision/ops/boxes.py in the future +def batched_nms_rotated(boxes, scores, idxs, iou_threshold): + """ + Performs non-maximum suppression in a batched fashion. + + Each index value correspond to a category, and NMS + will not be applied between elements of different categories. + + Args: + boxes (Tensor[N, 5]): + boxes where NMS will be performed. They + are expected to be in (x_ctr, y_ctr, width, height, angle_degrees) format + scores (Tensor[N]): + scores for each one of the boxes + idxs (Tensor[N]): + indices of the categories for each one of the boxes. 
+ iou_threshold (float): + discards all overlapping boxes + with IoU < iou_threshold + + Returns: + Tensor: + int64 tensor with the indices of the elements that have been kept + by NMS, sorted in decreasing order of scores + """ + assert boxes.shape[-1] == 5 + + if boxes.numel() == 0: + return torch.empty((0,), dtype=torch.int64, device=boxes.device) + # Strategy: in order to perform NMS independently per class, + # we add an offset to all the boxes. The offset is dependent + # only on the class idx, and is large enough so that boxes + # from different classes do not overlap + + # Note that batched_nms in torchvision/ops/boxes.py only uses max_coordinate, + # which won't handle negative coordinates correctly. + # Here by using min_coordinate we can make sure the negative coordinates are + # correctly handled. + max_coordinate = ( + torch.max(boxes[:, 0], boxes[:, 1]) + torch.max(boxes[:, 2], boxes[:, 3]) / 2 + ).max() + min_coordinate = ( + torch.min(boxes[:, 0], boxes[:, 1]) - torch.max(boxes[:, 2], boxes[:, 3]) / 2 + ).min() + offsets = idxs.to(boxes) * (max_coordinate - min_coordinate + 1) + boxes_for_nms = boxes.clone() # avoid modifying the original values in boxes + boxes_for_nms[:, :2] += offsets[:, None] + keep = nms_rotated(boxes_for_nms, scores, iou_threshold) + return keep diff --git a/detectron2/layers/roi_align.py b/detectron2/layers/roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..f8c4ce1d747ec77329fab34436f5efa0e958ef32 --- /dev/null +++ b/detectron2/layers/roi_align.py @@ -0,0 +1,105 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from detectron2 import _C + + +class _ROIAlign(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio, aligned): + ctx.save_for_backward(roi) + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + ctx.input_shape = input.size() + ctx.aligned = aligned + output = _C.roi_align_forward( + input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + (rois,) = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + sampling_ratio = ctx.sampling_ratio + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_align_backward( + grad_output, + rois, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + sampling_ratio, + ctx.aligned, + ) + return grad_input, None, None, None, None, None + + +roi_align = _ROIAlign.apply + + +class ROIAlign(nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio, aligned=True): + """ + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each output + sample. 0 to take samples densely. + aligned (bool): if False, use the legacy implementation in + Detectron. If True, align the results more perfectly. + + Note: + The meaning of aligned=True: + + Given a continuous coordinate c, its two neighboring pixel indices (in our + pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). 
For example, + c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled + from the underlying signal at continuous coordinates 0.5 and 1.5). But the original + roi_align (aligned=False) does not subtract the 0.5 when computing neighboring + pixel indices and therefore it uses pixels with a slightly incorrect alignment + (relative to our pixel model) when performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; see + detectron2/tests/test_roi_align.py for verification. + + The difference does not make a difference to the model's performance if + ROIAlign is used together with conv layers. + """ + super(ROIAlign, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + self.aligned = aligned + + def forward(self, input, rois): + """ + Args: + input: NCHW images + rois: Bx5 boxes. First column is the index into N. The other 4 columns are xyxy. + """ + assert rois.dim() == 2 and rois.size(1) == 5 + return roi_align( + input, rois, self.output_size, self.spatial_scale, self.sampling_ratio, self.aligned + ) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) + tmpstr += ", aligned=" + str(self.aligned) + tmpstr += ")" + return tmpstr diff --git a/detectron2/layers/roi_align_rotated.py b/detectron2/layers/roi_align_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..6ed87e69d5e738f8dbaa7c73c5c8de65343de0fd --- /dev/null +++ b/detectron2/layers/roi_align_rotated.py @@ -0,0 +1,88 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from detectron2 import _C + + +class _ROIAlignRotated(Function): + @staticmethod + def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): + ctx.save_for_backward(roi) + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + ctx.input_shape = input.size() + output = _C.roi_align_rotated_forward( + input, roi, spatial_scale, output_size[0], output_size[1], sampling_ratio + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + (rois,) = ctx.saved_tensors + output_size = ctx.output_size + spatial_scale = ctx.spatial_scale + sampling_ratio = ctx.sampling_ratio + bs, ch, h, w = ctx.input_shape + grad_input = _C.roi_align_rotated_backward( + grad_output, + rois, + spatial_scale, + output_size[0], + output_size[1], + bs, + ch, + h, + w, + sampling_ratio, + ) + return grad_input, None, None, None, None, None + + +roi_align_rotated = _ROIAlignRotated.apply + + +class ROIAlignRotated(nn.Module): + def __init__(self, output_size, spatial_scale, sampling_ratio): + """ + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each output + sample. 0 to take samples densely. 
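+
+        Example (illustrative only; the feature map, boxes, and scale are made-up
+        values)::
+
+            pooler = ROIAlignRotated((7, 7), spatial_scale=1.0 / 16, sampling_ratio=0)
+            # rois is a Bx6 tensor: (batch_index, x_ctr, y_ctr, width, height, angle_degrees)
+            crops = pooler(features, rois)  # -> (B, C, 7, 7)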
+ + Note: + ROIAlignRotated supports continuous coordinate by default: + Given a continuous coordinate c, its two neighboring pixel indices (in our + pixel model) are computed by floor(c - 0.5) and ceil(c - 0.5). For example, + c=1.3 has pixel neighbors with discrete indices [0] and [1] (which are sampled + from the underlying signal at continuous coordinates 0.5 and 1.5). + """ + super(ROIAlignRotated, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + self.sampling_ratio = sampling_ratio + + def forward(self, input, rois): + """ + Args: + input: NCHW images + rois: Bx6 boxes. First column is the index into N. + The other 5 columns are (x_ctr, y_ctr, width, height, angle_degrees). + """ + assert rois.dim() == 2 and rois.size(1) == 6 + return roi_align_rotated( + input, rois, self.output_size, self.spatial_scale, self.sampling_ratio + ) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "output_size=" + str(self.output_size) + tmpstr += ", spatial_scale=" + str(self.spatial_scale) + tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) + tmpstr += ")" + return tmpstr diff --git a/detectron2/layers/rotated_boxes.py b/detectron2/layers/rotated_boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..ea9b08583da79aae871b500bcffc19f8a352da6e --- /dev/null +++ b/detectron2/layers/rotated_boxes.py @@ -0,0 +1,22 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from __future__ import absolute_import, division, print_function, unicode_literals + +from detectron2 import _C + + +def pairwise_iou_rotated(boxes1, boxes2): + """ + Return intersection-over-union (Jaccard index) of boxes. + + Both sets of boxes are expected to be in + (x_center, y_center, width, height, angle) format. + + Arguments: + boxes1 (Tensor[N, 5]) + boxes2 (Tensor[M, 5]) + + Returns: + iou (Tensor[N, M]): the NxM matrix containing the pairwise + IoU values for every element in boxes1 and boxes2 + """ + return _C.box_iou_rotated(boxes1, boxes2) diff --git a/detectron2/layers/shape_spec.py b/detectron2/layers/shape_spec.py new file mode 100644 index 0000000000000000000000000000000000000000..ed7f0d08268a2342cfb8246cc032686f2343ef8f --- /dev/null +++ b/detectron2/layers/shape_spec.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from collections import namedtuple + + +class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): + """ + A simple structure that contains basic shape specification about a tensor. + It is often used as the auxiliary inputs/outputs of models, + to obtain the shape inference ability among pytorch modules. + + Attributes: + channels: + height: + width: + stride: + """ + + def __new__(cls, *, channels=None, height=None, width=None, stride=None): + return super().__new__(cls, channels, height, width, stride) diff --git a/detectron2/layers/wrappers.py b/detectron2/layers/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..7e3935e90c61f02e000568af79ed458dd491fed7 --- /dev/null +++ b/detectron2/layers/wrappers.py @@ -0,0 +1,215 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Wrappers around on some nn functions, mainly to support empty tensors. + +Ideally, add support directly in PyTorch to empty tensors in those functions. 
+ +These can be removed once https://github.com/pytorch/pytorch/issues/12013 +is implemented +""" + +import math +import torch +from torch.nn.modules.utils import _ntuple + +TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2]) + + +def cat(tensors, dim=0): + """ + Efficient version of torch.cat that avoids a copy if there is only a single element in a list + """ + assert isinstance(tensors, (list, tuple)) + if len(tensors) == 1: + return tensors[0] + return torch.cat(tensors, dim) + + +class _NewEmptyTensorOp(torch.autograd.Function): + @staticmethod + def forward(ctx, x, new_shape): + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad): + shape = ctx.shape + return _NewEmptyTensorOp.apply(grad, shape), None + + +class Conv2d(torch.nn.Conv2d): + """ + A wrapper around :class:`torch.nn.Conv2d` to support empty inputs and more features. + """ + + def __init__(self, *args, **kwargs): + """ + Extra keyword arguments supported in addition to those in `torch.nn.Conv2d`: + + Args: + norm (nn.Module, optional): a normalization layer + activation (callable(Tensor) -> Tensor): a callable activation function + + It assumes that norm layer is used before activation. + """ + norm = kwargs.pop("norm", None) + activation = kwargs.pop("activation", None) + super().__init__(*args, **kwargs) + + self.norm = norm + self.activation = activation + + def forward(self, x): + if x.numel() == 0 and self.training: + # https://github.com/pytorch/pytorch/issues/12013 + assert not isinstance( + self.norm, torch.nn.SyncBatchNorm + ), "SyncBatchNorm does not support empty inputs!" + + if x.numel() == 0 and TORCH_VERSION <= (1, 4): + assert not isinstance( + self.norm, torch.nn.GroupNorm + ), "GroupNorm does not support empty inputs in PyTorch <=1.4!" + # When input is empty, we want to return a empty tensor with "correct" shape, + # So that the following operations will not panic + # if they check for the shape of the tensor. + # This computes the height and width of the output tensor + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // s + 1 + for i, p, di, k, s in zip( + x.shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [x.shape[0], self.weight.shape[0]] + output_shape + empty = _NewEmptyTensorOp.apply(x, output_shape) + if self.training: + # This is to make DDP happy. + # DDP expects all workers to have gradient w.r.t the same set of parameters. + _dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + _dummy + else: + return empty + + x = super().forward(x) + if self.norm is not None: + x = self.norm(x) + if self.activation is not None: + x = self.activation(x) + return x + + +if TORCH_VERSION > (1, 4): + ConvTranspose2d = torch.nn.ConvTranspose2d +else: + + class ConvTranspose2d(torch.nn.ConvTranspose2d): + """ + A wrapper around :class:`torch.nn.ConvTranspose2d` to support zero-size tensor. + """ + + def forward(self, x): + if x.numel() > 0: + return super(ConvTranspose2d, self).forward(x) + # get output shape + + # When input is empty, we want to return a empty tensor with "correct" shape, + # So that the following operations will not panic + # if they check for the shape of the tensor. 
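+            # For reference, each spatial dimension below follows the standard
+            # transposed-convolution size formula:
+            # (in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1 + output_padding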
+ # This computes the height and width of the output tensor + output_shape = [ + (i - 1) * d - 2 * p + (di * (k - 1) + 1) + op + for i, p, di, k, d, op in zip( + x.shape[-2:], + self.padding, + self.dilation, + self.kernel_size, + self.stride, + self.output_padding, + ) + ] + output_shape = [x.shape[0], self.out_channels] + output_shape + # This is to make DDP happy. + # DDP expects all workers to have gradient w.r.t the same set of parameters. + _dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return _NewEmptyTensorOp.apply(x, output_shape) + _dummy + + +if TORCH_VERSION > (1, 4): + BatchNorm2d = torch.nn.BatchNorm2d +else: + + class BatchNorm2d(torch.nn.BatchNorm2d): + """ + A wrapper around :class:`torch.nn.BatchNorm2d` to support zero-size tensor. + """ + + def forward(self, x): + if x.numel() > 0: + return super(BatchNorm2d, self).forward(x) + # get output shape + output_shape = x.shape + return _NewEmptyTensorOp.apply(x, output_shape) + + +if TORCH_VERSION > (1, 5): + Linear = torch.nn.Linear +else: + + class Linear(torch.nn.Linear): + """ + A wrapper around :class:`torch.nn.Linear` to support empty inputs and more features. + Because of https://github.com/pytorch/pytorch/issues/34202 + """ + + def forward(self, x): + if x.numel() == 0: + output_shape = [x.shape[0], self.weight.shape[0]] + + empty = _NewEmptyTensorOp.apply(x, output_shape) + if self.training: + # This is to make DDP happy. + # DDP expects all workers to have gradient w.r.t the same set of parameters. + _dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + _dummy + else: + return empty + + x = super().forward(x) + return x + + +def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): + """ + A wrapper around :func:`torch.nn.functional.interpolate` to support zero-size tensor. + """ + if TORCH_VERSION > (1, 4) or input.numel() > 0: + return torch.nn.functional.interpolate( + input, size, scale_factor, mode, align_corners=align_corners + ) + + def _check_size_scale_factor(dim): + if size is None and scale_factor is None: + raise ValueError("either size or scale_factor should be defined") + if size is not None and scale_factor is not None: + raise ValueError("only one of size or scale_factor should be defined") + if ( + scale_factor is not None + and isinstance(scale_factor, tuple) + and len(scale_factor) != dim + ): + raise ValueError( + "scale_factor shape must match input shape. " + "Input is {}D, scale_factor size is {}".format(dim, len(scale_factor)) + ) + + def _output_size(dim): + _check_size_scale_factor(dim) + if size is not None: + return size + scale_factors = _ntuple(dim)(scale_factor) + # math.floor might return float in py2.7 + return [int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim)] + + output_shape = tuple(_output_size(2)) + output_shape = input.shape[:-2] + output_shape + return _NewEmptyTensorOp.apply(input, output_shape) diff --git a/detectron2/model_zoo/__init__.py b/detectron2/model_zoo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..886616f8e11ef31ea85d7a7ba9a75308befceedf --- /dev/null +++ b/detectron2/model_zoo/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Model Zoo API for Detectron2: a collection of functions to create common model architectures and +optionally load pre-trained weights as released in +`MODEL_ZOO.md `_. 
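+
+Example (illustrative)::
+
+    from detectron2 import model_zoo
+    cfg_file = model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml")
+    url = model_zoo.get_checkpoint_url("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml")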
+""" +from .model_zoo import get, get_config_file, get_checkpoint_url + +__all__ = ["get_checkpoint_url", "get", "get_config_file"] diff --git a/detectron2/model_zoo/model_zoo.py b/detectron2/model_zoo/model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..68d0ce5dc442864474bb1086bf04d6e40708c190 --- /dev/null +++ b/detectron2/model_zoo/model_zoo.py @@ -0,0 +1,150 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import os +import pkg_resources +import torch + +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.modeling import build_model + + +class _ModelZooUrls(object): + """ + Mapping from names to officially released Detectron2 pre-trained models. + """ + + S3_PREFIX = "https://dl.fbaipublicfiles.com/detectron2/" + + # format: {config_path.yaml} -> model_id/model_final_{commit}.pkl + CONFIG_PATH_TO_URL_SUFFIX = { + # COCO Detection with Faster R-CNN + "COCO-Detection/faster_rcnn_R_50_C4_1x.yaml": "137257644/model_final_721ade.pkl", + "COCO-Detection/faster_rcnn_R_50_DC5_1x.yaml": "137847829/model_final_51d356.pkl", + "COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml": "137257794/model_final_b275ba.pkl", + "COCO-Detection/faster_rcnn_R_50_C4_3x.yaml": "137849393/model_final_f97cb7.pkl", + "COCO-Detection/faster_rcnn_R_50_DC5_3x.yaml": "137849425/model_final_68d202.pkl", + "COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml": "137849458/model_final_280758.pkl", + "COCO-Detection/faster_rcnn_R_101_C4_3x.yaml": "138204752/model_final_298dad.pkl", + "COCO-Detection/faster_rcnn_R_101_DC5_3x.yaml": "138204841/model_final_3e0943.pkl", + "COCO-Detection/faster_rcnn_R_101_FPN_3x.yaml": "137851257/model_final_f6e8b1.pkl", + "COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml": "139173657/model_final_68b088.pkl", + # COCO Detection with RetinaNet + "COCO-Detection/retinanet_R_50_FPN_1x.yaml": "137593951/model_final_b796dc.pkl", + "COCO-Detection/retinanet_R_50_FPN_3x.yaml": "137849486/model_final_4cafe0.pkl", + "COCO-Detection/retinanet_R_101_FPN_3x.yaml": "138363263/model_final_59f53c.pkl", + # COCO Detection with RPN and Fast R-CNN + "COCO-Detection/rpn_R_50_C4_1x.yaml": "137258005/model_final_450694.pkl", + "COCO-Detection/rpn_R_50_FPN_1x.yaml": "137258492/model_final_02ce48.pkl", + "COCO-Detection/fast_rcnn_R_50_FPN_1x.yaml": "137635226/model_final_e5f7ce.pkl", + # COCO Instance Segmentation Baselines with Mask R-CNN + "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_1x.yaml": "137259246/model_final_9243eb.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_1x.yaml": "137260150/model_final_4f86c3.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "137260431/model_final_a54504.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x.yaml": "137849525/model_final_4ce675.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x.yaml": "137849551/model_final_84107b.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml": "137849600/model_final_f10217.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_101_C4_3x.yaml": "138363239/model_final_a2914c.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_101_DC5_3x.yaml": "138363294/model_final_0464b7.pkl", + "COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml": "138205316/model_final_a3ec72.pkl", + "COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x.yaml": "139653917/model_final_2d9806.pkl", # noqa + # COCO Person Keypoint Detection Baselines with Keypoint R-CNN + "COCO-Keypoints/keypoint_rcnn_R_50_FPN_1x.yaml": 
"137261548/model_final_04e291.pkl", + "COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x.yaml": "137849621/model_final_a6e10b.pkl", + "COCO-Keypoints/keypoint_rcnn_R_101_FPN_3x.yaml": "138363331/model_final_997cc7.pkl", + "COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x.yaml": "139686956/model_final_5ad38f.pkl", + # COCO Panoptic Segmentation Baselines with Panoptic FPN + "COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml": "139514544/model_final_dbfeb4.pkl", + "COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml": "139514569/model_final_c10459.pkl", + "COCO-PanopticSegmentation/panoptic_fpn_R_101_3x.yaml": "139514519/model_final_cafdb1.pkl", + # LVIS Instance Segmentation Baselines with Mask R-CNN + "LVIS-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml": "144219072/model_final_571f7c.pkl", + "LVIS-InstanceSegmentation/mask_rcnn_R_101_FPN_1x.yaml": "144219035/model_final_824ab5.pkl", + "LVIS-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml": "144219108/model_final_5e3439.pkl", # noqa + # Cityscapes & Pascal VOC Baselines + "Cityscapes/mask_rcnn_R_50_FPN.yaml": "142423278/model_final_af9cf5.pkl", + "PascalVOC-Detection/faster_rcnn_R_50_C4.yaml": "142202221/model_final_b1acc2.pkl", + # Other Settings + "Misc/mask_rcnn_R_50_FPN_1x_dconv_c3-c5.yaml": "138602867/model_final_65c703.pkl", + "Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5.yaml": "144998336/model_final_821d0b.pkl", + "Misc/cascade_mask_rcnn_R_50_FPN_1x.yaml": "138602847/model_final_e9d89b.pkl", + "Misc/cascade_mask_rcnn_R_50_FPN_3x.yaml": "144998488/model_final_480dd8.pkl", + "Misc/mask_rcnn_R_50_FPN_3x_syncbn.yaml": "169527823/model_final_3b3c51.pkl", + "Misc/mask_rcnn_R_50_FPN_3x_gn.yaml": "138602888/model_final_dc5d9e.pkl", + "Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml": "138602908/model_final_01ca85.pkl", + "Misc/panoptic_fpn_R_101_dconv_cascade_gn_3x.yaml": "139797668/model_final_be35db.pkl", + "Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv.yaml": "18131413/model_0039999_e76410.pkl", # noqa + # D1 Comparisons + "Detectron1-Comparisons/faster_rcnn_R_50_FPN_noaug_1x.yaml": "137781054/model_final_7ab50c.pkl", # noqa + "Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml": "137781281/model_final_62ca52.pkl", # noqa + "Detectron1-Comparisons/keypoint_rcnn_R_50_FPN_1x.yaml": "137781195/model_final_cce136.pkl", + } + + +def get_checkpoint_url(config_path): + """ + Returns the URL to the model trained using the given config + + Args: + config_path (str): config file name relative to detectron2's "configs/" + directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" + + Returns: + str: a URL to the model + """ + name = config_path.replace(".yaml", "") + if config_path in _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX: + suffix = _ModelZooUrls.CONFIG_PATH_TO_URL_SUFFIX[config_path] + return _ModelZooUrls.S3_PREFIX + name + "/" + suffix + raise RuntimeError("{} not available in Model Zoo!".format(name)) + + +def get_config_file(config_path): + """ + Returns path to a builtin config file. + + Args: + config_path (str): config file name relative to detectron2's "configs/" + directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" + + Returns: + str: the real path to the config file. 
+ """ + cfg_file = pkg_resources.resource_filename( + "detectron2.model_zoo", os.path.join("configs", config_path) + ) + if not os.path.exists(cfg_file): + raise RuntimeError("{} not available in Model Zoo!".format(config_path)) + return cfg_file + + +def get(config_path, trained: bool = False): + """ + Get a model specified by relative path under Detectron2's official ``configs/`` directory. + + Args: + config_path (str): config file name relative to detectron2's "configs/" + directory, e.g., "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" + trained (bool): If True, will initialize the model with the trained model zoo weights. + If False, the checkpoint specified in the config file's ``MODEL.WEIGHTS`` is used + instead; this will typically (though not always) initialize a subset of weights using + an ImageNet pre-trained model, while randomly initializing the other weights. + + Example: + + .. code-block:: python + + from detectron2 import model_zoo + model = model_zoo.get("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml", trained=True) + """ + cfg_file = get_config_file(config_path) + + cfg = get_cfg() + cfg.merge_from_file(cfg_file) + if trained: + cfg.MODEL.WEIGHTS = get_checkpoint_url(config_path) + if not torch.cuda.is_available(): + cfg.MODEL.DEVICE = "cpu" + + model = build_model(cfg) + DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) + return model diff --git a/detectron2/modeling/__init__.py b/detectron2/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9e23fe4a7037c8ece8f4c553b4cfda1631b79c9c --- /dev/null +++ b/detectron2/modeling/__init__.py @@ -0,0 +1,56 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import torch + +from detectron2.layers import ShapeSpec + +from .anchor_generator import build_anchor_generator, ANCHOR_GENERATOR_REGISTRY +from .backbone import ( + BACKBONE_REGISTRY, + FPN, + Backbone, + ResNet, + ResNetBlockBase, + build_backbone, + build_resnet_backbone, + make_stage, +) +from .meta_arch import ( + META_ARCH_REGISTRY, + SEM_SEG_HEADS_REGISTRY, + GeneralizedRCNN, + PanopticFPN, + ProposalNetwork, + RetinaNet, + SemanticSegmentor, + build_model, + build_sem_seg_head, +) +from .postprocessing import detector_postprocess +from .proposal_generator import ( + PROPOSAL_GENERATOR_REGISTRY, + build_proposal_generator, + RPN_HEAD_REGISTRY, + build_rpn_head, +) +from .roi_heads import ( + ROI_BOX_HEAD_REGISTRY, + ROI_HEADS_REGISTRY, + ROI_KEYPOINT_HEAD_REGISTRY, + ROI_MASK_HEAD_REGISTRY, + ROIHeads, + StandardROIHeads, + BaseMaskRCNNHead, + BaseKeypointRCNNHead, + build_box_head, + build_keypoint_head, + build_mask_head, + build_roi_heads, +) +from .test_time_augmentation import DatasetMapperTTA, GeneralizedRCNNWithTTA + +_EXCLUDE = {"torch", "ShapeSpec"} +__all__ = [k for k in globals().keys() if k not in _EXCLUDE and not k.startswith("_")] + +assert ( + torch.Tensor([1]) == torch.Tensor([2]) +).dtype == torch.bool, "Your Pytorch is too old. Please update to contain https://github.com/pytorch/pytorch/pull/21113" diff --git a/detectron2/modeling/anchor_generator.py b/detectron2/modeling/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..93927bc1c16106710bc1ca1da4d186f7710e1606 --- /dev/null +++ b/detectron2/modeling/anchor_generator.py @@ -0,0 +1,382 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import math +from typing import List +import torch +from torch import nn + +from detectron2.config import configurable +from detectron2.layers import ShapeSpec +from detectron2.structures import Boxes, RotatedBoxes +from detectron2.utils.registry import Registry + +ANCHOR_GENERATOR_REGISTRY = Registry("ANCHOR_GENERATOR") +ANCHOR_GENERATOR_REGISTRY.__doc__ = """ +Registry for modules that creates object detection anchors for feature maps. + +The registered object will be called with `obj(cfg, input_shape)`. +""" + + +class BufferList(nn.Module): + """ + Similar to nn.ParameterList, but for buffers + """ + + def __init__(self, buffers=None): + super(BufferList, self).__init__() + if buffers is not None: + self.extend(buffers) + + def extend(self, buffers): + offset = len(self) + for i, buffer in enumerate(buffers): + self.register_buffer(str(offset + i), buffer) + return self + + def __len__(self): + return len(self._buffers) + + def __iter__(self): + return iter(self._buffers.values()) + + +def _create_grid_offsets(size: List[int], stride: int, offset: float, device: torch.device): + grid_height, grid_width = size + shifts_x = torch.arange( + offset * stride, grid_width * stride, step=stride, dtype=torch.float32, device=device + ) + shifts_y = torch.arange( + offset * stride, grid_height * stride, step=stride, dtype=torch.float32, device=device + ) + + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shift_x = shift_x.reshape(-1) + shift_y = shift_y.reshape(-1) + return shift_x, shift_y + + +def _broadcast_params(params, num_features, name): + """ + If one size (or aspect ratio) is specified and there are multiple feature + maps, we "broadcast" anchors of that single size (or aspect ratio) + over all feature maps. + + If params is list[float], or list[list[float]] with len(params) == 1, repeat + it num_features time. + + Returns: + list[list[float]]: param for each feature + """ + assert isinstance( + params, (list, tuple) + ), f"{name} in anchor generator has to be a list! Got {params}." + assert len(params), f"{name} in anchor generator cannot be empty!" + if not isinstance(params[0], (list, tuple)): # list[float] + return [params] * num_features + if len(params) == 1: + return list(params) * num_features + assert len(params) == num_features, ( + f"Got {name} of length {len(params)} in anchor generator, " + f"but the number of input features is {num_features}!" + ) + return params + + +@ANCHOR_GENERATOR_REGISTRY.register() +class DefaultAnchorGenerator(nn.Module): + """ + Compute anchors in the standard ways described in + "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks". + """ + + box_dim: int = 4 + """ + the dimension of each anchor box. + """ + + @configurable + def __init__(self, *, sizes, aspect_ratios, strides, offset=0.5): + """ + This interface is experimental. + + Args: + sizes (list[list[float]] or list[float]): + If sizes is list[list[float]], sizes[i] is the list of anchor sizes + (i.e. sqrt of anchor area) to use for the i-th feature map. + If sizes is list[float], the sizes are used for all feature maps. + Anchor sizes are given in absolute lengths in units of + the input image; they do not dynamically scale if the input image size changes. + aspect_ratios (list[list[float]] or list[float]): list of aspect ratios + (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies. + strides (list[int]): stride of each input feature. 
+ offset (float): Relative offset between the center of the first anchor and the top-left + corner of the image. Value has to be in [0, 1). + Recommend to use 0.5, which means half stride. + """ + super().__init__() + + self.strides = strides + self.num_features = len(self.strides) + sizes = _broadcast_params(sizes, self.num_features, "sizes") + aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios") + self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios) + + self.offset = offset + assert 0.0 <= self.offset < 1.0, self.offset + + @classmethod + def from_config(cls, cfg, input_shape: List[ShapeSpec]): + return { + "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES, + "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS, + "strides": [x.stride for x in input_shape], + "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET, + } + + def _calculate_anchors(self, sizes, aspect_ratios): + cell_anchors = [ + self.generate_cell_anchors(s, a).float() for s, a in zip(sizes, aspect_ratios) + ] + return BufferList(cell_anchors) + + @property + def num_cell_anchors(self): + """ + Alias of `num_anchors`. + """ + return self.num_anchors + + @property + def num_anchors(self): + """ + Returns: + list[int]: Each int is the number of anchors at every pixel + location, on that feature map. + For example, if at every pixel we use anchors of 3 aspect + ratios and 5 sizes, the number of anchors is 15. + (See also ANCHOR_GENERATOR.SIZES and ANCHOR_GENERATOR.ASPECT_RATIOS in config) + + In standard RPN models, `num_anchors` on every feature map is the same. + """ + return [len(cell_anchors) for cell_anchors in self.cell_anchors] + + def _grid_anchors(self, grid_sizes: List[List[int]]): + """ + Returns: + list[Tensor]: #featuremap tensors, each is (#locations x #cell_anchors) x 4 + """ + anchors = [] + for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): + shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors.device) + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1) + + anchors.append((shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)) + + return anchors + + def generate_cell_anchors(self, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)): + """ + Generate a tensor storing canonical anchor boxes, which are all anchor + boxes of different sizes and aspect_ratios centered at (0, 0). + We can later build the set of anchors for a full feature map by + shifting and tiling these tensors (see `meth:_grid_anchors`). + + Args: + sizes (tuple[float]): + aspect_ratios (tuple[float]]): + + Returns: + Tensor of shape (len(sizes) * len(aspect_ratios), 4) storing anchor boxes + in XYXY format. + """ + + # This is different from the anchor generator defined in the original Faster R-CNN + # code or Detectron. They yield the same AP, however the old version defines cell + # anchors in a less natural way with a shift relative to the feature grid and + # quantization that results in slightly different sizes for different aspect ratios. + # See also https://github.com/facebookresearch/Detectron/issues/227 + + anchors = [] + for size in sizes: + area = size ** 2.0 + for aspect_ratio in aspect_ratios: + # s * s = w * h + # a = h / w + # ... some algebra ... 
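+                # (substituting h = a * w into s * s = w * h gives s * s = a * w * w)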
+ # w = sqrt(s * s / a) + # h = a * w + w = math.sqrt(area / aspect_ratio) + h = aspect_ratio * w + x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 + anchors.append([x0, y0, x1, y1]) + return torch.tensor(anchors) + + def forward(self, features): + """ + Args: + features (list[Tensor]): list of backbone feature maps on which to generate anchors. + + Returns: + list[Boxes]: a list of Boxes containing all the anchors for each feature map + (i.e. the cell anchors repeated over all locations in the feature map). + The number of anchors of each feature map is Hi x Wi x num_cell_anchors, + where Hi, Wi are resolution of the feature map divided by anchor stride. + """ + grid_sizes = [feature_map.shape[-2:] for feature_map in features] + anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) + return [Boxes(x) for x in anchors_over_all_feature_maps] + + +@ANCHOR_GENERATOR_REGISTRY.register() +class RotatedAnchorGenerator(nn.Module): + """ + Compute rotated anchors used by Rotated RPN (RRPN), described in + "Arbitrary-Oriented Scene Text Detection via Rotation Proposals". + """ + + box_dim: int = 5 + """ + the dimension of each anchor box. + """ + + @configurable + def __init__(self, *, sizes, aspect_ratios, strides, angles, offset=0.5): + """ + This interface is experimental. + + Args: + sizes (list[list[float]] or list[float]): + If sizes is list[list[float]], sizes[i] is the list of anchor sizes + (i.e. sqrt of anchor area) to use for the i-th feature map. + If sizes is list[float], the sizes are used for all feature maps. + Anchor sizes are given in absolute lengths in units of + the input image; they do not dynamically scale if the input image size changes. + aspect_ratios (list[list[float]] or list[float]): list of aspect ratios + (i.e. height / width) to use for anchors. Same "broadcast" rule for `sizes` applies. + strides (list[int]): stride of each input feature. + angles (list[list[float]] or list[float]): list of angles (in degrees CCW) + to use for anchors. Same "broadcast" rule for `sizes` applies. + offset (float): Relative offset between the center of the first anchor and the top-left + corner of the image. Value has to be in [0, 1). + Recommend to use 0.5, which means half stride. + """ + super().__init__() + + self.strides = strides + self.num_features = len(self.strides) + sizes = _broadcast_params(sizes, self.num_features, "sizes") + aspect_ratios = _broadcast_params(aspect_ratios, self.num_features, "aspect_ratios") + angles = _broadcast_params(angles, self.num_features, "angles") + self.cell_anchors = self._calculate_anchors(sizes, aspect_ratios, angles) + + self.offset = offset + assert 0.0 <= self.offset < 1.0, self.offset + + @classmethod + def from_config(cls, cfg, input_shape: List[ShapeSpec]): + return { + "sizes": cfg.MODEL.ANCHOR_GENERATOR.SIZES, + "aspect_ratios": cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS, + "strides": [x.stride for x in input_shape], + "offset": cfg.MODEL.ANCHOR_GENERATOR.OFFSET, + "angles": cfg.MODEL.ANCHOR_GENERATOR.ANGLES, + } + + def _calculate_anchors(self, sizes, aspect_ratios, angles): + cell_anchors = [ + self.generate_cell_anchors(size, aspect_ratio, angle).float() + for size, aspect_ratio, angle in zip(sizes, aspect_ratios, angles) + ] + return BufferList(cell_anchors) + + @property + def num_cell_anchors(self): + """ + Alias of `num_anchors`. + """ + return self.num_anchors + + @property + def num_anchors(self): + """ + Returns: + list[int]: Each int is the number of anchors at every pixel + location, on that feature map. 
+ For example, if at every pixel we use anchors of 3 aspect + ratios, 2 sizes and 5 angles, the number of anchors is 30. + (See also ANCHOR_GENERATOR.SIZES, ANCHOR_GENERATOR.ASPECT_RATIOS + and ANCHOR_GENERATOR.ANGLES in config) + + In standard RRPN models, `num_anchors` on every feature map is the same. + """ + return [len(cell_anchors) for cell_anchors in self.cell_anchors] + + def _grid_anchors(self, grid_sizes): + anchors = [] + for size, stride, base_anchors in zip(grid_sizes, self.strides, self.cell_anchors): + shift_x, shift_y = _create_grid_offsets(size, stride, self.offset, base_anchors.device) + zeros = torch.zeros_like(shift_x) + shifts = torch.stack((shift_x, shift_y, zeros, zeros, zeros), dim=1) + + anchors.append((shifts.view(-1, 1, 5) + base_anchors.view(1, -1, 5)).reshape(-1, 5)) + + return anchors + + def generate_cell_anchors( + self, + sizes=(32, 64, 128, 256, 512), + aspect_ratios=(0.5, 1, 2), + angles=(-90, -60, -30, 0, 30, 60, 90), + ): + """ + Generate a tensor storing canonical anchor boxes, which are all anchor + boxes of different sizes, aspect_ratios, angles centered at (0, 0). + We can later build the set of anchors for a full feature map by + shifting and tiling these tensors (see `meth:_grid_anchors`). + + Args: + sizes (tuple[float]): + aspect_ratios (tuple[float]]): + angles (tuple[float]]): + + Returns: + Tensor of shape (len(sizes) * len(aspect_ratios) * len(angles), 5) + storing anchor boxes in (x_ctr, y_ctr, w, h, angle) format. + """ + anchors = [] + for size in sizes: + area = size ** 2.0 + for aspect_ratio in aspect_ratios: + # s * s = w * h + # a = h / w + # ... some algebra ... + # w = sqrt(s * s / a) + # h = a * w + w = math.sqrt(area / aspect_ratio) + h = aspect_ratio * w + anchors.extend([0, 0, w, h, a] for a in angles) + + return torch.tensor(anchors) + + def forward(self, features): + """ + Args: + features (list[Tensor]): list of backbone feature maps on which to generate anchors. + + Returns: + list[RotatedBoxes]: a list of Boxes containing all the anchors for each feature map + (i.e. the cell anchors repeated over all locations in the feature map). + The number of anchors of each feature map is Hi x Wi x num_cell_anchors, + where Hi, Wi are resolution of the feature map divided by anchor stride. + """ + grid_sizes = [feature_map.shape[-2:] for feature_map in features] + anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) + return [RotatedBoxes(x) for x in anchors_over_all_feature_maps] + + +def build_anchor_generator(cfg, input_shape): + """ + Built an anchor generator from `cfg.MODEL.ANCHOR_GENERATOR.NAME`. + """ + anchor_generator = cfg.MODEL.ANCHOR_GENERATOR.NAME + return ANCHOR_GENERATOR_REGISTRY.get(anchor_generator)(cfg, input_shape) diff --git a/detectron2/modeling/backbone/__init__.py b/detectron2/modeling/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d477fb1e596f77b4c24f2b2c66b528bf2f83b00e --- /dev/null +++ b/detectron2/modeling/backbone/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from .build import build_backbone, BACKBONE_REGISTRY # noqa F401 isort:skip + +from .backbone import Backbone +from .fpn import FPN +from .resnet import ResNet, ResNetBlockBase, build_resnet_backbone, make_stage + +__all__ = [k for k in globals().keys() if not k.startswith("_")] +# TODO can expose more resnet blocks after careful consideration diff --git a/detectron2/modeling/backbone/backbone.py b/detectron2/modeling/backbone/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..66dee4a6565e6c45ed17d0880fcc37eac8f75c3a --- /dev/null +++ b/detectron2/modeling/backbone/backbone.py @@ -0,0 +1,53 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from abc import ABCMeta, abstractmethod +import torch.nn as nn + +from detectron2.layers import ShapeSpec + +__all__ = ["Backbone"] + + +class Backbone(nn.Module, metaclass=ABCMeta): + """ + Abstract base class for network backbones. + """ + + def __init__(self): + """ + The `__init__` method of any subclass can specify its own set of arguments. + """ + super().__init__() + + @abstractmethod + def forward(self): + """ + Subclasses must override this method, but adhere to the same return type. + + Returns: + dict[str->Tensor]: mapping from feature name (e.g., "res2") to tensor + """ + pass + + @property + def size_divisibility(self): + """ + Some backbones require the input height and width to be divisible by a + specific integer. This is typically true for encoder / decoder type networks + with lateral connection (e.g., FPN) for which feature maps need to match + dimension in the "bottom up" and "top down" paths. Set to 0 if no specific + input size divisibility is required. + """ + return 0 + + def output_shape(self): + """ + Returns: + dict[str->ShapeSpec] + """ + # this is a backward-compatible default + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } diff --git a/detectron2/modeling/backbone/build.py b/detectron2/modeling/backbone/build.py new file mode 100644 index 0000000000000000000000000000000000000000..3d2ecae783257418708b572e298a23e167dabb26 --- /dev/null +++ b/detectron2/modeling/backbone/build.py @@ -0,0 +1,33 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from detectron2.layers import ShapeSpec +from detectron2.utils.registry import Registry + +from .backbone import Backbone + +BACKBONE_REGISTRY = Registry("BACKBONE") +BACKBONE_REGISTRY.__doc__ = """ +Registry for backbones, which extract feature maps from images + +The registered object must be a callable that accepts two arguments: + +1. A :class:`detectron2.config.CfgNode` +2. A :class:`detectron2.layers.ShapeSpec`, which contains the input shape specification. + +It must returns an instance of :class:`Backbone`. +""" + + +def build_backbone(cfg, input_shape=None): + """ + Build a backbone from `cfg.MODEL.BACKBONE.NAME`. 
+ + Returns: + an instance of :class:`Backbone` + """ + if input_shape is None: + input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) + + backbone_name = cfg.MODEL.BACKBONE.NAME + backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape) + assert isinstance(backbone, Backbone) + return backbone diff --git a/detectron2/modeling/backbone/fpn.py b/detectron2/modeling/backbone/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..338b5f5286ce233f17aa41f50a5a0a8fb819b8d3 --- /dev/null +++ b/detectron2/modeling/backbone/fpn.py @@ -0,0 +1,245 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +import fvcore.nn.weight_init as weight_init +import torch.nn.functional as F +from torch import nn + +from detectron2.layers import Conv2d, ShapeSpec, get_norm + +from .backbone import Backbone +from .build import BACKBONE_REGISTRY +from .resnet import build_resnet_backbone + +__all__ = ["build_resnet_fpn_backbone", "build_retinanet_resnet_fpn_backbone", "FPN"] + + +class FPN(Backbone): + """ + This module implements :paper:`FPN`. + It creates pyramid features built on top of some input feature maps. + """ + + def __init__( + self, bottom_up, in_features, out_channels, norm="", top_block=None, fuse_type="sum" + ): + """ + Args: + bottom_up (Backbone): module representing the bottom up subnetwork. + Must be a subclass of :class:`Backbone`. The multi-scale feature + maps generated by the bottom up network, and listed in `in_features`, + are used to generate FPN levels. + in_features (list[str]): names of the input feature maps coming + from the backbone to which FPN is attached. For example, if the + backbone produces ["res2", "res3", "res4"], any *contiguous* sublist + of these may be used; order must be from high to low resolution. + out_channels (int): number of channels in the output feature maps. + norm (str): the normalization to use. + top_block (nn.Module or None): if provided, an extra operation will + be performed on the output of the last (smallest resolution) + FPN output, and the result will extend the result list. The top_block + further downsamples the feature map. It must have an attribute + "num_levels", meaning the number of extra FPN levels added by + this block, and "in_feature", which is a string representing + its input feature (e.g., p5). + fuse_type (str): types for fusing the top down features and the lateral + ones. It can be "sum" (default), which sums up element-wise; or "avg", + which takes the element-wise mean of the two. + """ + super(FPN, self).__init__() + assert isinstance(bottom_up, Backbone) + + # Feature map strides and channels from the bottom up network (e.g. 
ResNet) + input_shapes = bottom_up.output_shape() + in_strides = [input_shapes[f].stride for f in in_features] + in_channels = [input_shapes[f].channels for f in in_features] + + _assert_strides_are_log2_contiguous(in_strides) + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(in_channels): + lateral_norm = get_norm(norm, out_channels) + output_norm = get_norm(norm, out_channels) + + lateral_conv = Conv2d( + in_channels, out_channels, kernel_size=1, bias=use_bias, norm=lateral_norm + ) + output_conv = Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + ) + weight_init.c2_xavier_fill(lateral_conv) + weight_init.c2_xavier_fill(output_conv) + stage = int(math.log2(in_strides[idx])) + self.add_module("fpn_lateral{}".format(stage), lateral_conv) + self.add_module("fpn_output{}".format(stage), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. + self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + self.top_block = top_block + self.in_features = in_features + self.bottom_up = bottom_up + # Return feature names are "p", like ["p2", "p3", ..., "p6"] + self._out_feature_strides = {"p{}".format(int(math.log2(s))): s for s in in_strides} + # top block output feature maps. + if self.top_block is not None: + for s in range(stage, stage + self.top_block.num_levels): + self._out_feature_strides["p{}".format(s + 1)] = 2 ** (s + 1) + + self._out_features = list(self._out_feature_strides.keys()) + self._out_feature_channels = {k: out_channels for k in self._out_features} + self._size_divisibility = in_strides[-1] + assert fuse_type in {"avg", "sum"} + self._fuse_type = fuse_type + + @property + def size_divisibility(self): + return self._size_divisibility + + def forward(self, x): + """ + Args: + input (dict[str->Tensor]): mapping feature map name (e.g., "res5") to + feature map tensor for each feature level in high to low resolution order. + + Returns: + dict[str->Tensor]: + mapping from feature map name to FPN feature map tensor + in high to low resolution order. Returned feature names follow the FPN + paper convention: "p", where stage has stride = 2 ** stage e.g., + ["p2", "p3", ..., "p6"]. 
+ """ + # Reverse feature maps into top-down order (from low to high resolution) + bottom_up_features = self.bottom_up(x) + x = [bottom_up_features[f] for f in self.in_features[::-1]] + results = [] + prev_features = self.lateral_convs[0](x[0]) + results.append(self.output_convs[0](prev_features)) + for features, lateral_conv, output_conv in zip( + x[1:], self.lateral_convs[1:], self.output_convs[1:] + ): + top_down_features = F.interpolate(prev_features, scale_factor=2, mode="nearest") + lateral_features = lateral_conv(features) + prev_features = lateral_features + top_down_features + if self._fuse_type == "avg": + prev_features /= 2 + results.insert(0, output_conv(prev_features)) + + if self.top_block is not None: + top_block_in_feature = bottom_up_features.get(self.top_block.in_feature, None) + if top_block_in_feature is None: + top_block_in_feature = results[self._out_features.index(self.top_block.in_feature)] + results.extend(self.top_block(top_block_in_feature)) + assert len(self._out_features) == len(results) + return dict(zip(self._out_features, results)) + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + +def _assert_strides_are_log2_contiguous(strides): + """ + Assert that each stride is 2x times its preceding stride, i.e. "contiguous in log2". + """ + for i, stride in enumerate(strides[1:], 1): + assert stride == 2 * strides[i - 1], "Strides {} {} are not log2 contiguous".format( + stride, strides[i - 1] + ) + + +class LastLevelMaxPool(nn.Module): + """ + This module is used in the original FPN to generate a downsampled + P6 feature from P5. + """ + + def __init__(self): + super().__init__() + self.num_levels = 1 + self.in_feature = "p5" + + def forward(self, x): + return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)] + + +class LastLevelP6P7(nn.Module): + """ + This module is used in RetinaNet to generate extra layers, P6 and P7 from + C5 feature. + """ + + def __init__(self, in_channels, out_channels, in_feature="res5"): + super().__init__() + self.num_levels = 2 + self.in_feature = in_feature + self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1) + self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1) + for module in [self.p6, self.p7]: + weight_init.c2_xavier_fill(module) + + def forward(self, c5): + p6 = self.p6(c5) + p7 = self.p7(F.relu(p6)) + return [p6, p7] + + +@BACKBONE_REGISTRY.register() +def build_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. + """ + bottom_up = build_resnet_backbone(cfg, input_shape) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone + + +@BACKBONE_REGISTRY.register() +def build_retinanet_resnet_fpn_backbone(cfg, input_shape: ShapeSpec): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 
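+
+    Example (a sketch of the usual registry-based call path; the config file shown is
+    illustrative and assumed to set ``MODEL.BACKBONE.NAME`` to this builder):
+
+    .. code-block:: python
+
+        from detectron2.config import get_cfg
+        from detectron2.modeling import build_backbone
+
+        cfg = get_cfg()
+        cfg.merge_from_file("configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml")
+        backbone = build_backbone(cfg)  # dispatched through cfg.MODEL.BACKBONE.NAME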
+ """ + bottom_up = build_resnet_backbone(cfg, input_shape) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + in_channels_p6p7 = bottom_up.output_shape()["res5"].channels + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelP6P7(in_channels_p6p7, out_channels), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/detectron2/modeling/backbone/resnet.py b/detectron2/modeling/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f1faae012f346166a311902826fb9e4b61e24e54 --- /dev/null +++ b/detectron2/modeling/backbone/resnet.py @@ -0,0 +1,591 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +import fvcore.nn.weight_init as weight_init +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.layers import ( + CNNBlockBase, + Conv2d, + DeformConv, + ModulatedDeformConv, + ShapeSpec, + get_norm, +) + +from .backbone import Backbone +from .build import BACKBONE_REGISTRY + +__all__ = [ + "ResNetBlockBase", + "BasicBlock", + "BottleneckBlock", + "DeformBottleneckBlock", + "BasicStem", + "ResNet", + "make_stage", + "build_resnet_backbone", +] + + +ResNetBlockBase = CNNBlockBase +""" +Alias for backward compatibiltiy. +""" + + +class BasicBlock(CNNBlockBase): + """ + The basic residual block for ResNet-18 and ResNet-34 defined in :paper:`ResNet`, + with two 3x3 conv layers and a projection shortcut if needed. + """ + + def __init__(self, in_channels, out_channels, *, stride=1, norm="BN"): + """ + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + stride (int): Stride for the first conv. + norm (str or callable): normalization for all conv layers. + See :func:`layers.get_norm` for supported format. + """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + self.conv1 = Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + self.conv2 = Conv2d( + out_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + for layer in [self.conv1, self.conv2, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + out = self.conv2(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class BottleneckBlock(CNNBlockBase): + """ + The standard bottleneck residual block used by ResNet-50, 101 and 152 + defined in :paper:`ResNet`. It contains 3 conv layers with kernels + 1x1, 3x3, 1x1, and a projection shortcut if needed. + """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + dilation=1, + ): + """ + Args: + bottleneck_channels (int): number of output channels for the 3x3 + "bottleneck" conv layers. + num_groups (int): number of groups for the 3x3 conv layer. + norm (str or callable): normalization for all conv layers. 
+ See :func:`layers.get_norm` for supported format. + stride_in_1x1 (bool): when stride>1, whether to put stride in the + first 1x1 convolution or the bottleneck 3x3 convolution. + dilation (int): the dilation rate of the 3x3 conv layer. + """ + super().__init__(in_channels, out_channels, stride) + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + # The original MSRA ResNet models have stride in the first 1x1 conv + # The subsequent fb.torch.resnet and Caffe2 ResNe[X]t implementations have + # stride in the 3x3 conv + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv2 = Conv2d( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + bias=False, + groups=num_groups, + dilation=dilation, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + # Zero-initialize the last normalization in each residual branch, + # so that at the beginning, the residual branch starts with zeros, + # and each residual block behaves like an identity. + # See Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "For BN layers, the learnable scaling coefficient γ is initialized + # to be 1, except for each residual block's last BN + # where γ is initialized to be 0." + + # nn.init.constant_(self.conv3.norm.weight, 0) + # TODO this somehow hurts performance when training GN models from scratch. + # Add it as an option when we need to use this code to train a backbone. + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + + out = self.conv2(out) + out = F.relu_(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +class DeformBottleneckBlock(ResNetBlockBase): + """ + Similar to :class:`BottleneckBlock`, but with :paper:`deformable conv ` + in the 3x3 convolution. 
+ """ + + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + dilation=1, + deform_modulated=False, + deform_num_groups=1, + ): + super().__init__(in_channels, out_channels, stride) + self.deform_modulated = deform_modulated + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + norm=get_norm(norm, bottleneck_channels), + ) + + if deform_modulated: + deform_conv_op = ModulatedDeformConv + # offset channels are 2 or 3 (if with modulated) * kernel_size * kernel_size + offset_channels = 27 + else: + deform_conv_op = DeformConv + offset_channels = 18 + + self.conv2_offset = Conv2d( + bottleneck_channels, + offset_channels * deform_num_groups, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + dilation=dilation, + ) + self.conv2 = deform_conv_op( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + padding=1 * dilation, + bias=False, + groups=num_groups, + dilation=dilation, + deformable_groups=deform_num_groups, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + nn.init.constant_(self.conv2_offset.weight, 0) + nn.init.constant_(self.conv2_offset.bias, 0) + + def forward(self, x): + out = self.conv1(x) + out = F.relu_(out) + + if self.deform_modulated: + offset_mask = self.conv2_offset(out) + offset_x, offset_y, mask = torch.chunk(offset_mask, 3, dim=1) + offset = torch.cat((offset_x, offset_y), dim=1) + mask = mask.sigmoid() + out = self.conv2(out, offset, mask) + else: + offset = self.conv2_offset(out) + out = self.conv2(out, offset) + out = F.relu_(out) + + out = self.conv3(out) + + if self.shortcut is not None: + shortcut = self.shortcut(x) + else: + shortcut = x + + out += shortcut + out = F.relu_(out) + return out + + +def make_stage(block_class, num_blocks, first_stride, *, in_channels, out_channels, **kwargs): + """ + Create a list of blocks just like those in a ResNet stage. + + Args: + block_class (type): a subclass of ResNetBlockBase + num_blocks (int): + first_stride (int): the stride of the first block. The other blocks will have stride=1. + in_channels (int): input channels of the entire stage. + out_channels (int): output channels of **every block** in the stage. + kwargs: other arguments passed to the constructor of every block. + + Returns: + list[nn.Module]: a list of block module. + """ + assert "stride" not in kwargs, "Stride of blocks in make_stage cannot be changed." + blocks = [] + for i in range(num_blocks): + blocks.append( + block_class( + in_channels=in_channels, + out_channels=out_channels, + stride=first_stride if i == 0 else 1, + **kwargs, + ) + ) + in_channels = out_channels + return blocks + + +class BasicStem(CNNBlockBase): + """ + The standard ResNet stem (layers before the first residual block). 
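+    Its overall stride is 4: a stride-2 7x7 convolution followed by a stride-2 max pool.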
+ """ + + def __init__(self, in_channels=3, out_channels=64, norm="BN"): + """ + Args: + norm (str or callable): norm after the first conv layer. + See :func:`layers.get_norm` for supported format. + """ + super().__init__(in_channels, out_channels, 4) + self.in_channels = in_channels + self.conv1 = Conv2d( + in_channels, + out_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False, + norm=get_norm(norm, out_channels), + ) + weight_init.c2_msra_fill(self.conv1) + + def forward(self, x): + x = self.conv1(x) + x = F.relu_(x) + x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1) + return x + + +class ResNet(Backbone): + """ + Implement :paper:`ResNet`. + """ + + def __init__(self, stem, stages, num_classes=None, out_features=None): + """ + Args: + stem (nn.Module): a stem module + stages (list[list[CNNBlockBase]]): several (typically 4) stages, + each contains multiple :class:`CNNBlockBase`. + num_classes (None or int): if None, will not perform classification. + Otherwise, will create a linear layer. + out_features (list[str]): name of the layers whose outputs should + be returned in forward. Can be anything in "stem", "linear", or "res2" ... + If None, will return the output of the last layer. + """ + super(ResNet, self).__init__() + self.stem = stem + self.num_classes = num_classes + + current_stride = self.stem.stride + self._out_feature_strides = {"stem": current_stride} + self._out_feature_channels = {"stem": self.stem.out_channels} + + self.stages_and_names = [] + for i, blocks in enumerate(stages): + assert len(blocks) > 0, len(blocks) + for block in blocks: + assert isinstance(block, CNNBlockBase), block + + name = "res" + str(i + 2) + stage = nn.Sequential(*blocks) + + self.add_module(name, stage) + self.stages_and_names.append((stage, name)) + + self._out_feature_strides[name] = current_stride = int( + current_stride * np.prod([k.stride for k in blocks]) + ) + self._out_feature_channels[name] = curr_channels = blocks[-1].out_channels + + if num_classes is not None: + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.linear = nn.Linear(curr_channels, num_classes) + + # Sec 5.1 in "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour": + # "The 1000-way fully-connected layer is initialized by + # drawing weights from a zero-mean Gaussian with standard deviation of 0.01." + nn.init.normal_(self.linear.weight, std=0.01) + name = "linear" + + if out_features is None: + out_features = [name] + self._out_features = out_features + assert len(self._out_features) + children = [x[0] for x in self.named_children()] + for out_feature in self._out_features: + assert out_feature in children, "Available children: {}".format(", ".join(children)) + + def forward(self, x): + outputs = {} + x = self.stem(x) + if "stem" in self._out_features: + outputs["stem"] = x + for stage, name in self.stages_and_names: + x = stage(x) + if name in self._out_features: + outputs[name] = x + if self.num_classes is not None: + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.linear(x) + if "linear" in self._out_features: + outputs["linear"] = x + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + def freeze(self, freeze_at=0): + """ + Freeze the first several stages of the ResNet. Commonly used in + fine-tuning. + + Layers that produce the same feature map spatial size are defined as one + "stage" by :paper:`FPN`. 
+ + Args: + freeze_at (int): number of stages to freeze. + `1` means freezing the stem. `2` means freezing the stem and + one residual stage, etc. + + Returns: + nn.Module: this ResNet itself + """ + if freeze_at >= 1: + self.stem.freeze() + for idx, (stage, _) in enumerate(self.stages_and_names, start=2): + if freeze_at >= idx: + for block in stage.children(): + block.freeze() + return self + + +@BACKBONE_REGISTRY.register() +def build_resnet_backbone(cfg, input_shape): + """ + Create a ResNet instance from config. + + Returns: + ResNet: a :class:`ResNet` instance. + """ + # need registration of new blocks/stems? + norm = cfg.MODEL.RESNETS.NORM + stem = BasicStem( + in_channels=input_shape.channels, + out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, + norm=norm, + ) + + # fmt: off + freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT + out_features = cfg.MODEL.RESNETS.OUT_FEATURES + depth = cfg.MODEL.RESNETS.DEPTH + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + bottleneck_channels = num_groups * width_per_group + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 + res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION + deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE + deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED + deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS + # fmt: on + assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) + + num_blocks_per_stage = { + 18: [2, 2, 2, 2], + 34: [3, 4, 6, 3], + 50: [3, 4, 6, 3], + 101: [3, 4, 23, 3], + 152: [3, 8, 36, 3], + }[depth] + + if depth in [18, 34]: + assert out_channels == 64, "Must set MODEL.RESNETS.RES2_OUT_CHANNELS = 64 for R18/R34" + assert not any( + deform_on_per_stage + ), "MODEL.RESNETS.DEFORM_ON_PER_STAGE unsupported for R18/R34" + assert res5_dilation == 1, "Must set MODEL.RESNETS.RES5_DILATION = 1 for R18/R34" + assert num_groups == 1, "Must set MODEL.RESNETS.NUM_GROUPS = 1 for R18/R34" + + stages = [] + + # Avoid creating variables without gradients + # It consumes extra memory and may cause allreduce to fail + out_stage_idx = [{"res2": 2, "res3": 3, "res4": 4, "res5": 5}[f] for f in out_features] + max_stage_idx = max(out_stage_idx) + for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): + dilation = res5_dilation if stage_idx == 5 else 1 + first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 + stage_kargs = { + "num_blocks": num_blocks_per_stage[idx], + "first_stride": first_stride, + "in_channels": in_channels, + "out_channels": out_channels, + "norm": norm, + } + # Use BasicBlock for R18 and R34. 
+ if depth in [18, 34]: + stage_kargs["block_class"] = BasicBlock + else: + stage_kargs["bottleneck_channels"] = bottleneck_channels + stage_kargs["stride_in_1x1"] = stride_in_1x1 + stage_kargs["dilation"] = dilation + stage_kargs["num_groups"] = num_groups + if deform_on_per_stage[idx]: + stage_kargs["block_class"] = DeformBottleneckBlock + stage_kargs["deform_modulated"] = deform_modulated + stage_kargs["deform_num_groups"] = deform_num_groups + else: + stage_kargs["block_class"] = BottleneckBlock + blocks = make_stage(**stage_kargs) + in_channels = out_channels + out_channels *= 2 + bottleneck_channels *= 2 + stages.append(blocks) + return ResNet(stem, stages, out_features=out_features).freeze(freeze_at) diff --git a/detectron2/modeling/box_regression.py b/detectron2/modeling/box_regression.py new file mode 100644 index 0000000000000000000000000000000000000000..88426fddf36812f33def8fb434bebce53db3a4b4 --- /dev/null +++ b/detectron2/modeling/box_regression.py @@ -0,0 +1,247 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +from typing import Tuple +import torch + +# Value for clamping large dw and dh predictions. The heuristic is that we clamp +# such that dw and dh are no larger than what would transform a 16px box into a +# 1000px box (based on a small anchor, 16px, and a typical image size, 1000px). +_DEFAULT_SCALE_CLAMP = math.log(1000.0 / 16) + + +__all__ = ["Box2BoxTransform", "Box2BoxTransformRotated"] + + +def apply_deltas_broadcast(box2box_transform, deltas, boxes): + """ + Apply transform deltas to boxes. Similar to `box2box_transform.apply_deltas`, + but allow broadcasting boxes when the second dimension of deltas is a multiple + of box dimension. + + Args: + box2box_transform (Box2BoxTransform or Box2BoxTransformRotated): the transform to apply + deltas (Tensor): tensor of shape (N,B) or (N,KxB) + boxes (Tensor): tensor of shape (N,B) + + Returns: + Tensor: same shape as deltas. + """ + assert deltas.dim() == boxes.dim() == 2, f"{deltas.shape}, {boxes.shape}" + N, B = boxes.shape + assert ( + deltas.shape[1] % B == 0 + ), f"Second dim of deltas should be a multiple of {B}. Got {deltas.shape}" + K = deltas.shape[1] // B + ret = box2box_transform.apply_deltas( + deltas.view(N * K, B), boxes.unsqueeze(1).expand(N, K, B).reshape(N * K, B) + ) + return ret.view(N, K * B) + + +@torch.jit.script +class Box2BoxTransform(object): + """ + The box-to-box transform defined in R-CNN. The transformation is parameterized + by 4 deltas: (dx, dy, dw, dh). The transformation scales the box's width and height + by exp(dw), exp(dh) and shifts a box's center by the offset (dx * width, dy * height). + """ + + def __init__( + self, weights: Tuple[float, float, float, float], scale_clamp: float = _DEFAULT_SCALE_CLAMP + ): + """ + Args: + weights (4-element tuple): Scaling factors that are applied to the + (dx, dy, dw, dh) deltas. In Fast R-CNN, these were originally set + such that the deltas have unit variance; now they are treated as + hyperparameters of the system. + scale_clamp (float): When predicting deltas, the predicted box scaling + factors (dw and dh) are clamped such that they are <= scale_clamp. + """ + self.weights = weights + self.scale_clamp = scale_clamp + + def get_deltas(self, src_boxes, target_boxes): + """ + Get box regression transformation deltas (dx, dy, dw, dh) that can be used + to transform the `src_boxes` into the `target_boxes`. 
That is, the relation + ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless + any delta is too large and is clamped). + + Args: + src_boxes (Tensor): source boxes, e.g., object proposals + target_boxes (Tensor): target of the transformation, e.g., ground-truth + boxes. + """ + assert isinstance(src_boxes, torch.Tensor), type(src_boxes) + assert isinstance(target_boxes, torch.Tensor), type(target_boxes) + + src_widths = src_boxes[:, 2] - src_boxes[:, 0] + src_heights = src_boxes[:, 3] - src_boxes[:, 1] + src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths + src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights + + target_widths = target_boxes[:, 2] - target_boxes[:, 0] + target_heights = target_boxes[:, 3] - target_boxes[:, 1] + target_ctr_x = target_boxes[:, 0] + 0.5 * target_widths + target_ctr_y = target_boxes[:, 1] + 0.5 * target_heights + + wx, wy, ww, wh = self.weights + dx = wx * (target_ctr_x - src_ctr_x) / src_widths + dy = wy * (target_ctr_y - src_ctr_y) / src_heights + dw = ww * torch.log(target_widths / src_widths) + dh = wh * torch.log(target_heights / src_heights) + + deltas = torch.stack((dx, dy, dw, dh), dim=1) + assert (src_widths > 0).all().item(), "Input boxes to Box2BoxTransform are not valid!" + return deltas + + def apply_deltas(self, deltas, boxes): + """ + Apply transformation `deltas` (dx, dy, dw, dh) to `boxes`. + + Args: + deltas (Tensor): transformation deltas of shape (N, k*4), where k >= 1. + deltas[i] represents k potentially different class-specific + box transformations for the single box boxes[i]. + boxes (Tensor): boxes to transform, of shape (N, 4) + """ + boxes = boxes.to(deltas.dtype) + + widths = boxes[:, 2] - boxes[:, 0] + heights = boxes[:, 3] - boxes[:, 1] + ctr_x = boxes[:, 0] + 0.5 * widths + ctr_y = boxes[:, 1] + 0.5 * heights + + wx, wy, ww, wh = self.weights + dx = deltas[:, 0::4] / wx + dy = deltas[:, 1::4] / wy + dw = deltas[:, 2::4] / ww + dh = deltas[:, 3::4] / wh + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=self.scale_clamp) + dh = torch.clamp(dh, max=self.scale_clamp) + + pred_ctr_x = dx * widths[:, None] + ctr_x[:, None] + pred_ctr_y = dy * heights[:, None] + ctr_y[:, None] + pred_w = torch.exp(dw) * widths[:, None] + pred_h = torch.exp(dh) * heights[:, None] + + pred_boxes = torch.zeros_like(deltas) + pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w # x1 + pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h # y1 + pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w # x2 + pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h # y2 + return pred_boxes + + +@torch.jit.script +class Box2BoxTransformRotated(object): + """ + The box-to-box transform defined in Rotated R-CNN. The transformation is parameterized + by 5 deltas: (dx, dy, dw, dh, da). The transformation scales the box's width and height + by exp(dw), exp(dh), shifts a box's center by the offset (dx * width, dy * height), + and rotate a box's angle by da (radians). + Note: angles of deltas are in radians while angles of boxes are in degrees. + """ + + def __init__( + self, + weights: Tuple[float, float, float, float, float], + scale_clamp: float = _DEFAULT_SCALE_CLAMP, + ): + """ + Args: + weights (5-element tuple): Scaling factors that are applied to the + (dx, dy, dw, dh, da) deltas. These are treated as + hyperparameters of the system. + scale_clamp (float): When predicting deltas, the predicted box scaling + factors (dw and dh) are clamped such that they are <= scale_clamp. 
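+
+        Example (an illustrative round trip; ``src_boxes`` and ``target_boxes`` stand for
+        any Nx5 box tensors in (x_ctr, y_ctr, w, h, angle) format, and the weights are
+        placeholder values)::
+
+            transform = Box2BoxTransformRotated(weights=(10.0, 10.0, 5.0, 5.0, 1.0))
+            deltas = transform.get_deltas(src_boxes, target_boxes)
+            recovered = transform.apply_deltas(deltas, src_boxes)  # approximately target_boxes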
+ """ + self.weights = weights + self.scale_clamp = scale_clamp + + def get_deltas(self, src_boxes, target_boxes): + """ + Get box regression transformation deltas (dx, dy, dw, dh, da) that can be used + to transform the `src_boxes` into the `target_boxes`. That is, the relation + ``target_boxes == self.apply_deltas(deltas, src_boxes)`` is true (unless + any delta is too large and is clamped). + + Args: + src_boxes (Tensor): Nx5 source boxes, e.g., object proposals + target_boxes (Tensor): Nx5 target of the transformation, e.g., ground-truth + boxes. + """ + assert isinstance(src_boxes, torch.Tensor), type(src_boxes) + assert isinstance(target_boxes, torch.Tensor), type(target_boxes) + + src_ctr_x, src_ctr_y, src_widths, src_heights, src_angles = torch.unbind(src_boxes, dim=1) + + target_ctr_x, target_ctr_y, target_widths, target_heights, target_angles = torch.unbind( + target_boxes, dim=1 + ) + + wx, wy, ww, wh, wa = self.weights + dx = wx * (target_ctr_x - src_ctr_x) / src_widths + dy = wy * (target_ctr_y - src_ctr_y) / src_heights + dw = ww * torch.log(target_widths / src_widths) + dh = wh * torch.log(target_heights / src_heights) + # Angles of deltas are in radians while angles of boxes are in degrees. + # the conversion to radians serve as a way to normalize the values + da = target_angles - src_angles + da = (da + 180.0) % 360.0 - 180.0 # make it in [-180, 180) + da *= wa * math.pi / 180.0 + + deltas = torch.stack((dx, dy, dw, dh, da), dim=1) + assert ( + (src_widths > 0).all().item() + ), "Input boxes to Box2BoxTransformRotated are not valid!" + return deltas + + def apply_deltas(self, deltas, boxes): + """ + Apply transformation `deltas` (dx, dy, dw, dh, da) to `boxes`. + + Args: + deltas (Tensor): transformation deltas of shape (N, 5). + deltas[i] represents box transformation for the single box boxes[i]. + boxes (Tensor): boxes to transform, of shape (N, 5) + """ + assert deltas.shape[1] == 5 and boxes.shape[1] == 5 + + boxes = boxes.to(deltas.dtype) + + ctr_x = boxes[:, 0] + ctr_y = boxes[:, 1] + widths = boxes[:, 2] + heights = boxes[:, 3] + angles = boxes[:, 4] + + wx, wy, ww, wh, wa = self.weights + + dx = deltas[:, 0] / wx + dy = deltas[:, 1] / wy + dw = deltas[:, 2] / ww + dh = deltas[:, 3] / wh + da = deltas[:, 4] / wa + + # Prevent sending too large values into torch.exp() + dw = torch.clamp(dw, max=self.scale_clamp) + dh = torch.clamp(dh, max=self.scale_clamp) + + pred_boxes = torch.zeros_like(deltas) + pred_boxes[:, 0] = dx * widths + ctr_x # x_ctr + pred_boxes[:, 1] = dy * heights + ctr_y # y_ctr + pred_boxes[:, 2] = torch.exp(dw) * widths # width + pred_boxes[:, 3] = torch.exp(dh) * heights # height + + # Following original RRPN implementation, + # angles of deltas are in radians while angles of boxes are in degrees. + pred_angle = da * 180.0 / math.pi + angles + pred_angle = (pred_angle + 180.0) % 360.0 - 180.0 # make it in [-180, 180) + + pred_boxes[:, 4] = pred_angle + + return pred_boxes diff --git a/detectron2/modeling/matcher.py b/detectron2/modeling/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..2911f8c1937749dec4dbe64aa3e8491a631e03f2 --- /dev/null +++ b/detectron2/modeling/matcher.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from typing import List +import torch + + +class Matcher(object): + """ + This class assigns to each predicted "element" (e.g., a box) a ground-truth + element. 
Each predicted element will have exactly zero or one matches; each + ground-truth element may be matched to zero or more predicted elements. + + The matching is determined by the MxN match_quality_matrix, that characterizes + how well each (ground-truth, prediction)-pair match each other. For example, + if the elements are boxes, this matrix may contain box intersection-over-union + overlap values. + + The matcher returns (a) a vector of length N containing the index of the + ground-truth element m in [0, M) that matches to prediction n in [0, N). + (b) a vector of length N containing the labels for each prediction. + """ + + def __init__( + self, thresholds: List[float], labels: List[int], allow_low_quality_matches: bool = False + ): + """ + Args: + thresholds (list): a list of thresholds used to stratify predictions + into levels. + labels (list): a list of values to label predictions belonging at + each level. A label can be one of {-1, 0, 1} signifying + {ignore, negative class, positive class}, respectively. + allow_low_quality_matches (bool): if True, produce additional matches + for predictions with maximum match quality lower than high_threshold. + See set_low_quality_matches_ for more details. + + For example, + thresholds = [0.3, 0.5] + labels = [0, -1, 1] + All predictions with iou < 0.3 will be marked with 0 and + thus will be considered as false positives while training. + All predictions with 0.3 <= iou < 0.5 will be marked with -1 and + thus will be ignored. + All predictions with 0.5 <= iou will be marked with 1 and + thus will be considered as true positives. + """ + # Add -inf and +inf to first and last position in thresholds + thresholds = thresholds[:] + assert thresholds[0] > 0 + thresholds.insert(0, -float("inf")) + thresholds.append(float("inf")) + assert all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])) + assert all(l in [-1, 0, 1] for l in labels) + assert len(labels) == len(thresholds) - 1 + self.thresholds = thresholds + self.labels = labels + self.allow_low_quality_matches = allow_low_quality_matches + + def __call__(self, match_quality_matrix): + """ + Args: + match_quality_matrix (Tensor[float]): an MxN tensor, containing the + pairwise quality between M ground-truth elements and N predicted + elements. All elements must be >= 0 (due to the us of `torch.nonzero` + for selecting indices in :meth:`set_low_quality_matches_`). 
+ + Returns: + matches (Tensor[int64]): a vector of length N, where matches[i] is a matched + ground-truth index in [0, M) + match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates + whether a prediction is a true or false positive or ignored + """ + assert match_quality_matrix.dim() == 2 + if match_quality_matrix.numel() == 0: + default_matches = match_quality_matrix.new_full( + (match_quality_matrix.size(1),), 0, dtype=torch.int64 + ) + # When no gt boxes exist, we define IOU = 0 and therefore set labels + # to `self.labels[0]`, which usually defaults to background class 0 + # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds + default_match_labels = match_quality_matrix.new_full( + (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8 + ) + return default_matches, default_match_labels + + assert torch.all(match_quality_matrix >= 0) + + # match_quality_matrix is M (gt) x N (predicted) + # Max over gt elements (dim 0) to find best gt candidate for each prediction + matched_vals, matches = match_quality_matrix.max(dim=0) + + match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8) + + for (l, low, high) in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]): + low_high = (matched_vals >= low) & (matched_vals < high) + match_labels[low_high] = l + + if self.allow_low_quality_matches: + self.set_low_quality_matches_(match_labels, match_quality_matrix) + + return matches, match_labels + + def set_low_quality_matches_(self, match_labels, match_quality_matrix): + """ + Produce additional matches for predictions that have only low-quality matches. + Specifically, for each ground-truth G find the set of predictions that have + maximum overlap with it (including ties); for each prediction in that set, if + it is unmatched, then match it to the ground-truth G. + + This function implements the RPN assignment case (i) in Sec. 3.1.2 of + :paper:`Faster R-CNN`. + """ + # For each gt, find the prediction with which it has highest quality + highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1) + # Find the highest quality match available, even if it is low, including ties. + # Note that the matches qualities must be positive due to the use of + # `torch.nonzero`. + _, pred_inds_with_highest_quality = torch.nonzero( + match_quality_matrix == highest_quality_foreach_gt[:, None], as_tuple=True + ) + # If an anchor was labeled positive only due to a low-quality match + # with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B. + # This follows the implementation in Detectron, and is found to have no significant impact. + match_labels[pred_inds_with_highest_quality] = 1 diff --git a/detectron2/modeling/meta_arch/__init__.py b/detectron2/modeling/meta_arch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..96ef9b582c2ed38525102ebb589a750cf6b9fa54 --- /dev/null +++ b/detectron2/modeling/meta_arch/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +from .build import META_ARCH_REGISTRY, build_model # isort:skip + +from .panoptic_fpn import PanopticFPN + +# import all the meta_arch, so they will be registered +from .rcnn import GeneralizedRCNN, ProposalNetwork +from .retinanet import RetinaNet +from .semantic_seg import SEM_SEG_HEADS_REGISTRY, SemanticSegmentor, build_sem_seg_head diff --git a/detectron2/modeling/meta_arch/build.py b/detectron2/modeling/meta_arch/build.py new file mode 100644 index 0000000000000000000000000000000000000000..630389dfca822f295447abd5e8424186d02e0465 --- /dev/null +++ b/detectron2/modeling/meta_arch/build.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import torch + +from detectron2.utils.registry import Registry + +META_ARCH_REGISTRY = Registry("META_ARCH") # noqa F401 isort:skip +META_ARCH_REGISTRY.__doc__ = """ +Registry for meta-architectures, i.e. the whole model. + +The registered object will be called with `obj(cfg)` +and expected to return a `nn.Module` object. +""" + + +def build_model(cfg): + """ + Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``. + Note that it does not load any weights from ``cfg``. + """ + meta_arch = cfg.MODEL.META_ARCHITECTURE + model = META_ARCH_REGISTRY.get(meta_arch)(cfg) + model.to(torch.device(cfg.MODEL.DEVICE)) + return model diff --git a/detectron2/modeling/meta_arch/panoptic_fpn.py b/detectron2/modeling/meta_arch/panoptic_fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..c5f92f701f2da3aff6602ad2388307874102fc5c --- /dev/null +++ b/detectron2/modeling/meta_arch/panoptic_fpn.py @@ -0,0 +1,218 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import torch +from torch import nn + +from detectron2.structures import ImageList + +from ..backbone import build_backbone +from ..postprocessing import detector_postprocess, sem_seg_postprocess +from ..proposal_generator import build_proposal_generator +from ..roi_heads import build_roi_heads +from .build import META_ARCH_REGISTRY +from .semantic_seg import build_sem_seg_head + +__all__ = ["PanopticFPN"] + + +@META_ARCH_REGISTRY.register() +class PanopticFPN(nn.Module): + """ + Implement the paper :paper:`PanopticFPN`. + """ + + def __init__(self, cfg): + super().__init__() + + self.instance_loss_weight = cfg.MODEL.PANOPTIC_FPN.INSTANCE_LOSS_WEIGHT + + # options when combining instance & semantic outputs + self.combine_on = cfg.MODEL.PANOPTIC_FPN.COMBINE.ENABLED + self.combine_overlap_threshold = cfg.MODEL.PANOPTIC_FPN.COMBINE.OVERLAP_THRESH + self.combine_stuff_area_limit = cfg.MODEL.PANOPTIC_FPN.COMBINE.STUFF_AREA_LIMIT + self.combine_instances_confidence_threshold = ( + cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH + ) + + self.backbone = build_backbone(cfg) + self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) + self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape()) + self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape()) + + self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) + self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) + + @property + def device(self): + return self.pixel_mean.device + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper`. + Each item in the list contains the inputs for one image. 
+ + For now, each item in the list is a dict that contains: + + * "image": Tensor, image in (C, H, W) format. + * "instances": Instances + * "sem_seg": semantic segmentation ground truth. + * Other information that's included in the original dicts, such as: + "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + + Returns: + list[dict]: + each dict is the results for one image. The dict contains the following keys: + + * "instances": see :meth:`GeneralizedRCNN.forward` for its format. + * "sem_seg": see :meth:`SemanticSegmentor.forward` for its format. + * "panoptic_seg": available when `PANOPTIC_FPN.COMBINE.ENABLED`. + See the return value of + :func:`combine_semantic_and_instance_outputs` for its format. + """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.backbone.size_divisibility) + features = self.backbone(images.tensor) + + if "proposals" in batched_inputs[0]: + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + proposal_losses = {} + + if "sem_seg" in batched_inputs[0]: + gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs] + gt_sem_seg = ImageList.from_tensors( + gt_sem_seg, self.backbone.size_divisibility, self.sem_seg_head.ignore_value + ).tensor + else: + gt_sem_seg = None + sem_seg_results, sem_seg_losses = self.sem_seg_head(features, gt_sem_seg) + + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + if self.proposal_generator: + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + detector_results, detector_losses = self.roi_heads( + images, features, proposals, gt_instances + ) + + if self.training: + losses = {} + losses.update(sem_seg_losses) + losses.update({k: v * self.instance_loss_weight for k, v in detector_losses.items()}) + losses.update(proposal_losses) + return losses + + processed_results = [] + for sem_seg_result, detector_result, input_per_image, image_size in zip( + sem_seg_results, detector_results, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + sem_seg_r = sem_seg_postprocess(sem_seg_result, image_size, height, width) + detector_r = detector_postprocess(detector_result, height, width) + + processed_results.append({"sem_seg": sem_seg_r, "instances": detector_r}) + + if self.combine_on: + panoptic_r = combine_semantic_and_instance_outputs( + detector_r, + sem_seg_r.argmax(dim=0), + self.combine_overlap_threshold, + self.combine_stuff_area_limit, + self.combine_instances_confidence_threshold, + ) + processed_results[-1]["panoptic_seg"] = panoptic_r + return processed_results + + +def combine_semantic_and_instance_outputs( + instance_results, + semantic_results, + overlap_threshold, + stuff_area_limit, + instances_confidence_threshold, +): + """ + Implement a simple combining logic following + "combine_semantic_and_instance_predictions.py" in panopticapi + to produce panoptic segmentation outputs. + + Args: + instance_results: output of :func:`detector_postprocess`. + semantic_results: an (H, W) tensor, each is the contiguous semantic + category id + + Returns: + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. 
+ segments_info (list[dict]): Describe each segment in `panoptic_seg`. + Each dict contains keys "id", "category_id", "isthing". + """ + panoptic_seg = torch.zeros_like(semantic_results, dtype=torch.int32) + + # sort instance outputs by scores + sorted_inds = torch.argsort(-instance_results.scores) + + current_segment_id = 0 + segments_info = [] + + instance_masks = instance_results.pred_masks.to(dtype=torch.bool, device=panoptic_seg.device) + + # Add instances one-by-one, check for overlaps with existing ones + for inst_id in sorted_inds: + score = instance_results.scores[inst_id].item() + if score < instances_confidence_threshold: + break + mask = instance_masks[inst_id] # H,W + mask_area = mask.sum().item() + + if mask_area == 0: + continue + + intersect = (mask > 0) & (panoptic_seg > 0) + intersect_area = intersect.sum().item() + + if intersect_area * 1.0 / mask_area > overlap_threshold: + continue + + if intersect_area > 0: + mask = mask & (panoptic_seg == 0) + + current_segment_id += 1 + panoptic_seg[mask] = current_segment_id + segments_info.append( + { + "id": current_segment_id, + "isthing": True, + "score": score, + "category_id": instance_results.pred_classes[inst_id].item(), + "instance_id": inst_id.item(), + } + ) + + # Add semantic results to remaining empty areas + semantic_labels = torch.unique(semantic_results).cpu().tolist() + for semantic_label in semantic_labels: + if semantic_label == 0: # 0 is a special "thing" class + continue + mask = (semantic_results == semantic_label) & (panoptic_seg == 0) + mask_area = mask.sum().item() + if mask_area < stuff_area_limit: + continue + + current_segment_id += 1 + panoptic_seg[mask] = current_segment_id + segments_info.append( + { + "id": current_segment_id, + "isthing": False, + "category_id": semantic_label, + "area": mask_area, + } + ) + + return panoptic_seg, segments_info diff --git a/detectron2/modeling/meta_arch/rcnn.py b/detectron2/modeling/meta_arch/rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..b15ea8a38e5ddfbb4049c89917f055295e396b4f --- /dev/null +++ b/detectron2/modeling/meta_arch/rcnn.py @@ -0,0 +1,263 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import numpy as np +import torch +from torch import nn + +from detectron2.structures import ImageList +from detectron2.utils.events import get_event_storage +from detectron2.utils.logger import log_first_n + +from ..backbone import build_backbone +from ..postprocessing import detector_postprocess +from ..proposal_generator import build_proposal_generator +from ..roi_heads import build_roi_heads +from .build import META_ARCH_REGISTRY + +__all__ = ["GeneralizedRCNN", "ProposalNetwork"] + + +@META_ARCH_REGISTRY.register() +class GeneralizedRCNN(nn.Module): + """ + Generalized R-CNN. Any models that contains the following three components: + 1. Per-image feature extraction (aka backbone) + 2. Region proposal generation + 3. 
Per-region feature extraction and prediction + """ + + def __init__(self, cfg): + super().__init__() + + self.backbone = build_backbone(cfg) + self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) + self.roi_heads = build_roi_heads(cfg, self.backbone.output_shape()) + self.vis_period = cfg.VIS_PERIOD + self.input_format = cfg.INPUT.FORMAT + + assert len(cfg.MODEL.PIXEL_MEAN) == len(cfg.MODEL.PIXEL_STD) + self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) + self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) + + @property + def device(self): + return self.pixel_mean.device + + def visualize_training(self, batched_inputs, proposals): + """ + A function used to visualize images and proposals. It shows ground truth + bounding boxes on the original image and up to 20 predicted object + proposals on the original image. Users can implement different + visualization functions for different models. + + Args: + batched_inputs (list): a list that contains input to the model. + proposals (list): a list that contains predicted proposals. Both + batched_inputs and proposals should have the same length. + """ + from detectron2.utils.visualizer import Visualizer + + storage = get_event_storage() + max_vis_prop = 20 + + for input, prop in zip(batched_inputs, proposals): + img = input["image"].cpu().numpy() + assert img.shape[0] == 3, "Images should have 3 channels." + if self.input_format == "BGR": + img = img[::-1, :, :] + img = img.transpose(1, 2, 0) + v_gt = Visualizer(img, None) + v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes) + anno_img = v_gt.get_image() + box_size = min(len(prop.proposal_boxes), max_vis_prop) + v_pred = Visualizer(img, None) + v_pred = v_pred.overlay_instances( + boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy() + ) + prop_img = v_pred.get_image() + vis_img = np.concatenate((anno_img, prop_img), axis=1) + vis_img = vis_img.transpose(2, 0, 1) + vis_name = "Left: GT bounding boxes; Right: Predicted proposals" + storage.put_image(vis_name, vis_img) + break # only visualize one image in a batch + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + + * image: Tensor, image in (C, H, W) format. + * instances (optional): groundtruth :class:`Instances` + * proposals (optional): :class:`Instances`, precomputed proposals. + + Other information that's included in the original dicts, such as: + + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + + Returns: + list[dict]: + Each dict is the output for one input image. + The dict contains one key "instances" whose value is a :class:`Instances`. 
+ The :class:`Instances` object has the following keys: + "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints" + """ + if not self.training: + return self.inference(batched_inputs) + + images = self.preprocess_image(batched_inputs) + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + elif "targets" in batched_inputs[0]: + log_first_n( + logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 + ) + gt_instances = [x["targets"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + + features = self.backbone(images.tensor) + + if self.proposal_generator: + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + else: + assert "proposals" in batched_inputs[0] + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + proposal_losses = {} + + _, detector_losses = self.roi_heads(images, features, proposals, gt_instances) + if self.vis_period > 0: + storage = get_event_storage() + if storage.iter % self.vis_period == 0: + self.visualize_training(batched_inputs, proposals) + + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + return losses + + def inference(self, batched_inputs, detected_instances=None, do_postprocess=True): + """ + Run inference on the given inputs. + + Args: + batched_inputs (list[dict]): same as in :meth:`forward` + detected_instances (None or list[Instances]): if not None, it + contains an `Instances` object per image. The `Instances` + object contains "pred_boxes" and "pred_classes" which are + known boxes in the image. + The inference will then skip the detection of bounding boxes, + and only predict other per-ROI outputs. + do_postprocess (bool): whether to apply post-processing on the outputs. + + Returns: + same as in :meth:`forward`. + """ + assert not self.training + + images = self.preprocess_image(batched_inputs) + features = self.backbone(images.tensor) + + if detected_instances is None: + if self.proposal_generator: + proposals, _ = self.proposal_generator(images, features, None) + else: + assert "proposals" in batched_inputs[0] + proposals = [x["proposals"].to(self.device) for x in batched_inputs] + + results, _ = self.roi_heads(images, features, proposals, None) + else: + detected_instances = [x.to(self.device) for x in detected_instances] + results = self.roi_heads.forward_with_given_boxes(features, detected_instances) + + if do_postprocess: + return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) + else: + return results + + def preprocess_image(self, batched_inputs): + """ + Normalize, pad and batch the input images. + """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.backbone.size_divisibility) + return images + + @staticmethod + def _postprocess(instances, batched_inputs, image_sizes): + """ + Rescale the output instances to the target size. 
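+        The target size is the ("height", "width") requested in each input dict,
+        falling back to the per-image input resolution recorded in ``image_sizes``.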
+ """ + # note: private function; subject to changes + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + instances, batched_inputs, image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + return processed_results + + +@META_ARCH_REGISTRY.register() +class ProposalNetwork(nn.Module): + """ + A meta architecture that only predicts object proposals. + """ + + def __init__(self, cfg): + super().__init__() + self.backbone = build_backbone(cfg) + self.proposal_generator = build_proposal_generator(cfg, self.backbone.output_shape()) + + self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) + self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) + + @property + def device(self): + return self.pixel_mean.device + + def forward(self, batched_inputs): + """ + Args: + Same as in :class:`GeneralizedRCNN.forward` + + Returns: + list[dict]: + Each dict is the output for one input image. + The dict contains one key "proposals" whose value is a + :class:`Instances` with keys "proposal_boxes" and "objectness_logits". + """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.backbone.size_divisibility) + features = self.backbone(images.tensor) + + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + elif "targets" in batched_inputs[0]: + log_first_n( + logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 + ) + gt_instances = [x["targets"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + # In training, the proposals are not useful at all but we generate them anyway. + # This makes RPN-only models about 5% slower. + if self.training: + return proposal_losses + + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + proposals, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"proposals": r}) + return processed_results diff --git a/detectron2/modeling/meta_arch/retinanet.py b/detectron2/modeling/meta_arch/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..35c42cc25e93bf2841c5e1fcff389f317ed0883a --- /dev/null +++ b/detectron2/modeling/meta_arch/retinanet.py @@ -0,0 +1,489 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import math +import numpy as np +from typing import List +import torch +from fvcore.nn import sigmoid_focal_loss_jit, smooth_l1_loss +from torch import nn + +from detectron2.layers import ShapeSpec, batched_nms, cat +from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou +from detectron2.utils.events import get_event_storage +from detectron2.utils.logger import log_first_n + +from ..anchor_generator import build_anchor_generator +from ..backbone import build_backbone +from ..box_regression import Box2BoxTransform +from ..matcher import Matcher +from ..postprocessing import detector_postprocess +from .build import META_ARCH_REGISTRY + +__all__ = ["RetinaNet"] + + +def permute_to_N_HWA_K(tensor, K): + """ + Transpose/reshape a tensor from (N, (A x K), H, W) to (N, (HxWxA), K) + """ + assert tensor.dim() == 4, tensor.shape + N, _, H, W = tensor.shape + tensor = tensor.view(N, -1, K, H, W) + tensor = tensor.permute(0, 3, 4, 1, 2) + tensor = tensor.reshape(N, -1, K) # Size=(N,HWA,K) + return tensor + + +def permute_all_cls_and_box_to_N_HWA_K_and_concat(box_cls, box_delta, num_classes=80): + """ + Rearrange the tensor layout from the network output, i.e.: + list[Tensor]: #lvl tensors of shape (N, A x K, Hi, Wi) + to per-image predictions, i.e.: + Tensor: of shape (N x sum(Hi x Wi x A), K) + """ + # for each feature level, permute the outputs to make them be in the + # same format as the labels. Note that the labels are computed for + # all feature levels concatenated, so we keep the same representation + # for the objectness and the box_delta + box_cls_flattened = [permute_to_N_HWA_K(x, num_classes) for x in box_cls] + box_delta_flattened = [permute_to_N_HWA_K(x, 4) for x in box_delta] + # concatenate on the first dimension (representing the feature levels), to + # take into account the way the labels were generated (with all feature maps + # being concatenated as well) + box_cls = cat(box_cls_flattened, dim=1).view(-1, num_classes) + box_delta = cat(box_delta_flattened, dim=1).view(-1, 4) + return box_cls, box_delta + + +@META_ARCH_REGISTRY.register() +class RetinaNet(nn.Module): + """ + Implement RetinaNet in :paper:`RetinaNet`. 
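+
+    Like the other meta architectures, it is normally built from a config rather
+    than instantiated directly. A minimal construction sketch (the config path is
+    illustrative; any standard RetinaNet config from the model zoo works)::
+
+        from detectron2.config import get_cfg
+        from detectron2.modeling import build_model
+
+        cfg = get_cfg()
+        cfg.merge_from_file("configs/COCO-Detection/retinanet_R_50_FPN_1x.yaml")
+        model = build_model(cfg)  # a RetinaNet module, placed on cfg.MODEL.DEVICE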
+ """ + + def __init__(self, cfg): + super().__init__() + + # fmt: off + self.num_classes = cfg.MODEL.RETINANET.NUM_CLASSES + self.in_features = cfg.MODEL.RETINANET.IN_FEATURES + # Loss parameters: + self.focal_loss_alpha = cfg.MODEL.RETINANET.FOCAL_LOSS_ALPHA + self.focal_loss_gamma = cfg.MODEL.RETINANET.FOCAL_LOSS_GAMMA + self.smooth_l1_loss_beta = cfg.MODEL.RETINANET.SMOOTH_L1_LOSS_BETA + # Inference parameters: + self.score_threshold = cfg.MODEL.RETINANET.SCORE_THRESH_TEST + self.topk_candidates = cfg.MODEL.RETINANET.TOPK_CANDIDATES_TEST + self.nms_threshold = cfg.MODEL.RETINANET.NMS_THRESH_TEST + self.max_detections_per_image = cfg.TEST.DETECTIONS_PER_IMAGE + # Vis parameters + self.vis_period = cfg.VIS_PERIOD + self.input_format = cfg.INPUT.FORMAT + # fmt: on + + self.backbone = build_backbone(cfg) + + backbone_shape = self.backbone.output_shape() + feature_shapes = [backbone_shape[f] for f in self.in_features] + self.head = RetinaNetHead(cfg, feature_shapes) + self.anchor_generator = build_anchor_generator(cfg, feature_shapes) + + # Matching and loss + self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS) + self.matcher = Matcher( + cfg.MODEL.RETINANET.IOU_THRESHOLDS, + cfg.MODEL.RETINANET.IOU_LABELS, + allow_low_quality_matches=True, + ) + + self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) + self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) + + """ + In Detectron1, loss is normalized by number of foreground samples in the batch. + When batch size is 1 per GPU, #foreground has a large variance and + using it lead to lower performance. Here we maintain an EMA of #foreground to + stabilize the normalizer. + """ + self.loss_normalizer = 100 # initialize with any reasonable #fg that's not too small + self.loss_normalizer_momentum = 0.9 + + @property + def device(self): + return self.pixel_mean.device + + def visualize_training(self, batched_inputs, results): + """ + A function used to visualize ground truth images and final network predictions. + It shows ground truth bounding boxes on the original image and up to 20 + predicted object bounding boxes on the original image. + + Args: + batched_inputs (list): a list that contains input to the model. + results (List[Instances]): a list of #images elements. + """ + from detectron2.utils.visualizer import Visualizer + + assert len(batched_inputs) == len( + results + ), "Cannot visualize inputs and results of different sizes" + storage = get_event_storage() + max_boxes = 20 + + image_index = 0 # only visualize a single image + img = batched_inputs[image_index]["image"].cpu().numpy() + assert img.shape[0] == 3, "Images should have 3 channels." 
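+        # Visualizer expects an RGB image in (H, W, C) layout, while the model input
+        # may be BGR in (C, H, W); flip the channels if needed, then transpose.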
+ if self.input_format == "BGR": + img = img[::-1, :, :] + img = img.transpose(1, 2, 0) + v_gt = Visualizer(img, None) + v_gt = v_gt.overlay_instances(boxes=batched_inputs[image_index]["instances"].gt_boxes) + anno_img = v_gt.get_image() + processed_results = detector_postprocess(results[image_index], img.shape[0], img.shape[1]) + predicted_boxes = processed_results.pred_boxes.tensor.detach().cpu().numpy() + + v_pred = Visualizer(img, None) + v_pred = v_pred.overlay_instances(boxes=predicted_boxes[0:max_boxes]) + prop_img = v_pred.get_image() + vis_img = np.vstack((anno_img, prop_img)) + vis_img = vis_img.transpose(2, 0, 1) + vis_name = f"Top: GT bounding boxes; Bottom: {max_boxes} Highest Scoring Results" + storage.put_image(vis_name, vis_img) + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + + * image: Tensor, image in (C, H, W) format. + * instances: Instances + + Other information that's included in the original dicts, such as: + + * "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + Returns: + dict[str: Tensor]: + mapping from a named loss to a tensor storing the loss. Used during training only. + """ + images = self.preprocess_image(batched_inputs) + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + elif "targets" in batched_inputs[0]: + log_first_n( + logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 + ) + gt_instances = [x["targets"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + + features = self.backbone(images.tensor) + features = [features[f] for f in self.in_features] + box_cls, box_delta = self.head(features) + anchors = self.anchor_generator(features) + + if self.training: + gt_classes, gt_anchors_reg_deltas = self.get_ground_truth(anchors, gt_instances) + losses = self.losses(gt_classes, gt_anchors_reg_deltas, box_cls, box_delta) + + if self.vis_period > 0: + storage = get_event_storage() + if storage.iter % self.vis_period == 0: + results = self.inference(box_cls, box_delta, anchors, images.image_sizes) + self.visualize_training(batched_inputs, results) + + return losses + else: + results = self.inference(box_cls, box_delta, anchors, images.image_sizes) + processed_results = [] + for results_per_image, input_per_image, image_size in zip( + results, batched_inputs, images.image_sizes + ): + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = detector_postprocess(results_per_image, height, width) + processed_results.append({"instances": r}) + return processed_results + + def losses(self, gt_classes, gt_anchors_deltas, pred_class_logits, pred_anchor_deltas): + """ + Args: + For `gt_classes` and `gt_anchors_deltas` parameters, see + :meth:`RetinaNet.get_ground_truth`. + Their shapes are (N, R) and (N, R, 4), respectively, where R is + the total number of anchors across levels, i.e. sum(Hi x Wi x A) + For `pred_class_logits` and `pred_anchor_deltas`, see + :meth:`RetinaNetHead.forward`. + + Returns: + dict[str, Tensor]: + mapping from a named loss to a scalar tensor + storing the loss. Used during training only. 
The dict keys are: + "loss_cls" and "loss_box_reg" + """ + pred_class_logits, pred_anchor_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat( + pred_class_logits, pred_anchor_deltas, self.num_classes + ) # Shapes: (N x R, K) and (N x R, 4), respectively. + + gt_classes = gt_classes.flatten() + gt_anchors_deltas = gt_anchors_deltas.view(-1, 4) + + valid_idxs = gt_classes >= 0 + foreground_idxs = (gt_classes >= 0) & (gt_classes != self.num_classes) + num_foreground = foreground_idxs.sum().item() + get_event_storage().put_scalar("num_foreground", num_foreground) + self.loss_normalizer = ( + self.loss_normalizer_momentum * self.loss_normalizer + + (1 - self.loss_normalizer_momentum) * num_foreground + ) + + gt_classes_target = torch.zeros_like(pred_class_logits) + gt_classes_target[foreground_idxs, gt_classes[foreground_idxs]] = 1 + + # logits loss + loss_cls = sigmoid_focal_loss_jit( + pred_class_logits[valid_idxs], + gt_classes_target[valid_idxs], + alpha=self.focal_loss_alpha, + gamma=self.focal_loss_gamma, + reduction="sum", + ) / max(1, self.loss_normalizer) + + # regression loss + loss_box_reg = smooth_l1_loss( + pred_anchor_deltas[foreground_idxs], + gt_anchors_deltas[foreground_idxs], + beta=self.smooth_l1_loss_beta, + reduction="sum", + ) / max(1, self.loss_normalizer) + + return {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg} + + @torch.no_grad() + def get_ground_truth(self, anchors, targets): + """ + Args: + anchors (list[Boxes]): A list of #feature level Boxes. + The Boxes contains anchors of this image on the specific feature level. + targets (list[Instances]): a list of N `Instances`s. The i-th + `Instances` contains the ground-truth per-instance annotations + for the i-th input image. Specify `targets` during training only. + + Returns: + gt_classes (Tensor): + An integer tensor of shape (N, R) storing ground-truth labels for each anchor. + R is the total number of anchors, i.e. the sum of Hi x Wi x A for all levels. + Anchors with an IoU with some target higher than the foreground threshold + are assigned their corresponding label in the [0, K-1] range. + Anchors whose IoU are below the background threshold are assigned + the label "K". Anchors whose IoU are between the foreground and background + thresholds are assigned a label "-1", i.e. ignore. + gt_anchors_deltas (Tensor): + Shape (N, R, 4). + The last dimension represents ground-truth box2box transform + targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box. + The values in the tensor are meaningful only when the corresponding + anchor is labeled as foreground. + """ + gt_classes = [] + gt_anchors_deltas = [] + anchors = Boxes.cat(anchors) # Rx4 + + for targets_per_image in targets: + match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, anchors) + gt_matched_idxs, anchor_labels = self.matcher(match_quality_matrix) + + has_gt = len(targets_per_image) > 0 + if has_gt: + # ground truth box regression + matched_gt_boxes = targets_per_image.gt_boxes[gt_matched_idxs] + gt_anchors_reg_deltas_i = self.box2box_transform.get_deltas( + anchors.tensor, matched_gt_boxes.tensor + ) + + gt_classes_i = targets_per_image.gt_classes[gt_matched_idxs] + # Anchors with label 0 are treated as background. + gt_classes_i[anchor_labels == 0] = self.num_classes + # Anchors with label -1 are ignored. 
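+                # (these -1 entries are later dropped by `valid_idxs` in `losses`,
+                # so they contribute to neither the classification nor the box loss)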
+ gt_classes_i[anchor_labels == -1] = -1 + else: + gt_classes_i = torch.zeros_like(gt_matched_idxs) + self.num_classes + gt_anchors_reg_deltas_i = torch.zeros_like(anchors.tensor) + + gt_classes.append(gt_classes_i) + gt_anchors_deltas.append(gt_anchors_reg_deltas_i) + + return torch.stack(gt_classes), torch.stack(gt_anchors_deltas) + + def inference(self, box_cls, box_delta, anchors, image_sizes): + """ + Arguments: + box_cls, box_delta: Same as the output of :meth:`RetinaNetHead.forward` + anchors (list[Boxes]): A list of #feature level Boxes. + The Boxes contain anchors of this image on the specific feature level. + image_sizes (List[torch.Size]): the input image sizes + + Returns: + results (List[Instances]): a list of #images elements. + """ + results = [] + + box_cls = [permute_to_N_HWA_K(x, self.num_classes) for x in box_cls] + box_delta = [permute_to_N_HWA_K(x, 4) for x in box_delta] + # list[Tensor], one per level, each has shape (N, Hi x Wi x A, K or 4) + + for img_idx, image_size in enumerate(image_sizes): + box_cls_per_image = [box_cls_per_level[img_idx] for box_cls_per_level in box_cls] + box_reg_per_image = [box_reg_per_level[img_idx] for box_reg_per_level in box_delta] + results_per_image = self.inference_single_image( + box_cls_per_image, box_reg_per_image, anchors, tuple(image_size) + ) + results.append(results_per_image) + return results + + def inference_single_image(self, box_cls, box_delta, anchors, image_size): + """ + Single-image inference. Return bounding-box detection results by thresholding + on scores and applying non-maximum suppression (NMS). + + Arguments: + box_cls (list[Tensor]): list of #feature levels. Each entry contains + tensor of size (H x W x A, K) + box_delta (list[Tensor]): Same shape as 'box_cls' except that K becomes 4. + anchors (list[Boxes]): list of #feature levels. Each entry contains + a Boxes object, which contains all the anchors for that + image in that feature level. + image_size (tuple(H, W)): a tuple of the image height and width. + + Returns: + Same as `inference`, but for only one image. + """ + boxes_all = [] + scores_all = [] + class_idxs_all = [] + + # Iterate over every feature level + for box_cls_i, box_reg_i, anchors_i in zip(box_cls, box_delta, anchors): + # (HxWxAxK,) + box_cls_i = box_cls_i.flatten().sigmoid_() + + # Keep top k top scoring indices only. 
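+            # box_cls_i was flattened over (anchor, class), so each index encodes an
+            # (anchor, class) pair; it is decoded below with // and % self.num_classes.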
+ num_topk = min(self.topk_candidates, box_reg_i.size(0)) + # torch.sort is actually faster than .topk (at least on GPUs) + predicted_prob, topk_idxs = box_cls_i.sort(descending=True) + predicted_prob = predicted_prob[:num_topk] + topk_idxs = topk_idxs[:num_topk] + + # filter out the proposals with low confidence score + keep_idxs = predicted_prob > self.score_threshold + predicted_prob = predicted_prob[keep_idxs] + topk_idxs = topk_idxs[keep_idxs] + + anchor_idxs = topk_idxs // self.num_classes + classes_idxs = topk_idxs % self.num_classes + + box_reg_i = box_reg_i[anchor_idxs] + anchors_i = anchors_i[anchor_idxs] + # predict boxes + predicted_boxes = self.box2box_transform.apply_deltas(box_reg_i, anchors_i.tensor) + + boxes_all.append(predicted_boxes) + scores_all.append(predicted_prob) + class_idxs_all.append(classes_idxs) + + boxes_all, scores_all, class_idxs_all = [ + cat(x) for x in [boxes_all, scores_all, class_idxs_all] + ] + keep = batched_nms(boxes_all, scores_all, class_idxs_all, self.nms_threshold) + keep = keep[: self.max_detections_per_image] + + result = Instances(image_size) + result.pred_boxes = Boxes(boxes_all[keep]) + result.scores = scores_all[keep] + result.pred_classes = class_idxs_all[keep] + return result + + def preprocess_image(self, batched_inputs): + """ + Normalize, pad and batch the input images. + """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.backbone.size_divisibility) + return images + + +class RetinaNetHead(nn.Module): + """ + The head used in RetinaNet for object classification and box regression. + It has two subnets for the two tasks, with a common structure but separate parameters. + """ + + def __init__(self, cfg, input_shape: List[ShapeSpec]): + super().__init__() + # fmt: off + in_channels = input_shape[0].channels + num_classes = cfg.MODEL.RETINANET.NUM_CLASSES + num_convs = cfg.MODEL.RETINANET.NUM_CONVS + prior_prob = cfg.MODEL.RETINANET.PRIOR_PROB + num_anchors = build_anchor_generator(cfg, input_shape).num_cell_anchors + # fmt: on + assert ( + len(set(num_anchors)) == 1 + ), "Using different number of anchors between levels is not currently supported!" + num_anchors = num_anchors[0] + + cls_subnet = [] + bbox_subnet = [] + for _ in range(num_convs): + cls_subnet.append( + nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + ) + cls_subnet.append(nn.ReLU()) + bbox_subnet.append( + nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + ) + bbox_subnet.append(nn.ReLU()) + + self.cls_subnet = nn.Sequential(*cls_subnet) + self.bbox_subnet = nn.Sequential(*bbox_subnet) + self.cls_score = nn.Conv2d( + in_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1 + ) + self.bbox_pred = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1) + + # Initialization + for modules in [self.cls_subnet, self.bbox_subnet, self.cls_score, self.bbox_pred]: + for layer in modules.modules(): + if isinstance(layer, nn.Conv2d): + torch.nn.init.normal_(layer.weight, mean=0, std=0.01) + torch.nn.init.constant_(layer.bias, 0) + + # Use prior in model initialization to improve stability + bias_value = -(math.log((1 - prior_prob) / prior_prob)) + torch.nn.init.constant_(self.cls_score.bias, bias_value) + + def forward(self, features): + """ + Arguments: + features (list[Tensor]): FPN feature map tensors in high to low resolution. 
+ Each tensor in the list correspond to different feature levels. + + Returns: + logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi). + The tensor predicts the classification probability + at each spatial position for each of the A anchors and K object + classes. + bbox_reg (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi). + The tensor predicts 4-vector (dx,dy,dw,dh) box + regression values for every anchor. These values are the + relative offset between the anchor and the ground truth box. + """ + logits = [] + bbox_reg = [] + for feature in features: + logits.append(self.cls_score(self.cls_subnet(feature))) + bbox_reg.append(self.bbox_pred(self.bbox_subnet(feature))) + return logits, bbox_reg diff --git a/detectron2/modeling/meta_arch/semantic_seg.py b/detectron2/modeling/meta_arch/semantic_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..2c41a7235cb9c578e2c6de5835854bdff7493616 --- /dev/null +++ b/detectron2/modeling/meta_arch/semantic_seg.py @@ -0,0 +1,186 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +from typing import Dict +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.layers import Conv2d, ShapeSpec +from detectron2.structures import ImageList +from detectron2.utils.registry import Registry + +from ..backbone import build_backbone +from ..postprocessing import sem_seg_postprocess +from .build import META_ARCH_REGISTRY + +__all__ = ["SemanticSegmentor", "SEM_SEG_HEADS_REGISTRY", "SemSegFPNHead", "build_sem_seg_head"] + + +SEM_SEG_HEADS_REGISTRY = Registry("SEM_SEG_HEADS") +SEM_SEG_HEADS_REGISTRY.__doc__ = """ +Registry for semantic segmentation heads, which make semantic segmentation predictions +from feature maps. +""" + + +@META_ARCH_REGISTRY.register() +class SemanticSegmentor(nn.Module): + """ + Main class for semantic segmentation architectures. + """ + + def __init__(self, cfg): + super().__init__() + self.backbone = build_backbone(cfg) + self.sem_seg_head = build_sem_seg_head(cfg, self.backbone.output_shape()) + self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) + self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) + + @property + def device(self): + return self.pixel_mean.device + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper`. + Each item in the list contains the inputs for one image. + + For now, each item in the list is a dict that contains: + + * "image": Tensor, image in (C, H, W) format. + * "sem_seg": semantic segmentation ground truth + * Other information that's included in the original dicts, such as: + "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + + Returns: + list[dict]: + Each dict is the output for one input image. + The dict contains one key "sem_seg" whose value is a + Tensor that represents the + per-pixel segmentation prediced by the head. + The prediction has shape KxHxW that represents the logits of + each class for each pixel. 
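+
+        A usage sketch (assuming inference mode, with ``outputs`` being the returned
+        list)::
+
+            pred = outputs[0]["sem_seg"].argmax(dim=0)  # (H, W) per-pixel labels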
+ """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.backbone.size_divisibility) + + features = self.backbone(images.tensor) + + if "sem_seg" in batched_inputs[0]: + targets = [x["sem_seg"].to(self.device) for x in batched_inputs] + targets = ImageList.from_tensors( + targets, self.backbone.size_divisibility, self.sem_seg_head.ignore_value + ).tensor + else: + targets = None + results, losses = self.sem_seg_head(features, targets) + + if self.training: + return losses + + processed_results = [] + for result, input_per_image, image_size in zip(results, batched_inputs, images.image_sizes): + height = input_per_image.get("height") + width = input_per_image.get("width") + r = sem_seg_postprocess(result, image_size, height, width) + processed_results.append({"sem_seg": r}) + return processed_results + + +def build_sem_seg_head(cfg, input_shape): + """ + Build a semantic segmentation head from `cfg.MODEL.SEM_SEG_HEAD.NAME`. + """ + name = cfg.MODEL.SEM_SEG_HEAD.NAME + return SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) + + +@SEM_SEG_HEADS_REGISTRY.register() +class SemSegFPNHead(nn.Module): + """ + A semantic segmentation head described in :paper:`PanopticFPN`. + It takes FPN features as input and merges information from all + levels of the FPN into single output. + """ + + def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): + super().__init__() + + # fmt: off + self.in_features = cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + feature_strides = {k: v.stride for k, v in input_shape.items()} + feature_channels = {k: v.channels for k, v in input_shape.items()} + self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE + num_classes = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES + conv_dims = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + self.common_stride = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE + norm = cfg.MODEL.SEM_SEG_HEAD.NORM + self.loss_weight = cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT + # fmt: on + + self.scale_heads = [] + for in_feature in self.in_features: + head_ops = [] + head_length = max( + 1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride)) + ) + for k in range(head_length): + norm_module = nn.GroupNorm(32, conv_dims) if norm == "GN" else None + conv = Conv2d( + feature_channels[in_feature] if k == 0 else conv_dims, + conv_dims, + kernel_size=3, + stride=1, + padding=1, + bias=not norm, + norm=norm_module, + activation=F.relu, + ) + weight_init.c2_msra_fill(conv) + head_ops.append(conv) + if feature_strides[in_feature] != self.common_stride: + head_ops.append( + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) + ) + self.scale_heads.append(nn.Sequential(*head_ops)) + self.add_module(in_feature, self.scale_heads[-1]) + self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0) + weight_init.c2_msra_fill(self.predictor) + + def forward(self, features, targets=None): + """ + Returns: + In training, returns (None, dict of losses) + In inference, returns (CxHxW logits, {}) + """ + x = self.layers(features) + if self.training: + return None, self.losses(x, targets) + else: + x = F.interpolate( + x, scale_factor=self.common_stride, mode="bilinear", align_corners=False + ) + return x, {} + + def layers(self, features): + for i, f in enumerate(self.in_features): + if i == 0: + x = self.scale_heads[i](features[f]) + else: + x = x + self.scale_heads[i](features[f]) + x = self.predictor(x) + return x + + def losses(self, 
predictions, targets): + predictions = F.interpolate( + predictions, scale_factor=self.common_stride, mode="bilinear", align_corners=False + ) + loss = F.cross_entropy( + predictions, targets, reduction="mean", ignore_index=self.ignore_value + ) + losses = {"loss_sem_seg": loss * self.loss_weight} + return losses diff --git a/detectron2/modeling/poolers.py b/detectron2/modeling/poolers.py new file mode 100644 index 0000000000000000000000000000000000000000..678f5afc5680e6bdc9931f0449e2ab334a3a5369 --- /dev/null +++ b/detectron2/modeling/poolers.py @@ -0,0 +1,231 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import math +import sys +import torch +from torch import nn +from torchvision.ops import RoIPool + +from detectron2.layers import ROIAlign, ROIAlignRotated, cat + +__all__ = ["ROIPooler"] + + +def assign_boxes_to_levels(box_lists, min_level, max_level, canonical_box_size, canonical_level): + """ + Map each box in `box_lists` to a feature map level index and return the assignment + vector. + + Args: + box_lists (list[Boxes] | list[RotatedBoxes]): A list of N Boxes or N RotatedBoxes, + where N is the number of images in the batch. + min_level (int): Smallest feature map level index. The input is considered index 0, + the output of stage 1 is index 1, and so. + max_level (int): Largest feature map level index. + canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). + canonical_level (int): The feature map level index on which a canonically-sized box + should be placed. + + Returns: + A tensor of length M, where M is the total number of boxes aggregated over all + N batch images. The memory layout corresponds to the concatenation of boxes + from all images. Each element is the feature map index, as an offset from + `self.min_level`, for the corresponding box (so value i means the box is at + `self.min_level + i`). + """ + eps = sys.float_info.epsilon + box_sizes = torch.sqrt(cat([boxes.area() for boxes in box_lists])) + # Eqn.(1) in FPN paper + level_assignments = torch.floor( + canonical_level + torch.log2(box_sizes / canonical_box_size + eps) + ) + # clamp level to (min, max), in case the box size is too large or too small + # for the available feature maps + level_assignments = torch.clamp(level_assignments, min=min_level, max=max_level) + return level_assignments.to(torch.int64) - min_level + + +def convert_boxes_to_pooler_format(box_lists): + """ + Convert all boxes in `box_lists` to the low-level format used by ROI pooling ops + (see description under Returns). + + Args: + box_lists (list[Boxes] | list[RotatedBoxes]): + A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. + + Returns: + When input is list[Boxes]: + A tensor of shape (M, 5), where M is the total number of boxes aggregated over all + N batch images. + The 5 columns are (batch index, x0, y0, x1, y1), where batch index + is the index in [0, N) identifying which batch image the box with corners at + (x0, y0, x1, y1) comes from. + When input is list[RotatedBoxes]: + A tensor of shape (M, 6), where M is the total number of boxes aggregated over all + N batch images. + The 6 columns are (batch index, x_ctr, y_ctr, width, height, angle_degrees), + where batch index is the index in [0, N) identifying which batch image the + rotated box (x_ctr, y_ctr, width, height, angle_degrees) comes from. 
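+
+    For example (illustrative only), two images with 2 and 1 boxes respectively yield
+    a (3, 5) tensor whose first column is ``[0., 0., 1.]``, followed by each box's
+    (x0, y0, x1, y1) coordinates.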
+ """ + + def fmt_box_list(box_tensor, batch_index): + repeated_index = torch.full( + (len(box_tensor), 1), batch_index, dtype=box_tensor.dtype, device=box_tensor.device + ) + return cat((repeated_index, box_tensor), dim=1) + + pooler_fmt_boxes = cat( + [fmt_box_list(box_list.tensor, i) for i, box_list in enumerate(box_lists)], dim=0 + ) + + return pooler_fmt_boxes + + +class ROIPooler(nn.Module): + """ + Region of interest feature map pooler that supports pooling from one or more + feature maps. + """ + + def __init__( + self, + output_size, + scales, + sampling_ratio, + pooler_type, + canonical_box_size=224, + canonical_level=4, + ): + """ + Args: + output_size (int, tuple[int] or list[int]): output size of the pooled region, + e.g., 14 x 14. If tuple or list is given, the length must be 2. + scales (list[float]): The scale for each low-level pooling op relative to + the input image. For a feature map with stride s relative to the input + image, scale is defined as a 1 / s. The stride must be power of 2. + When there are multiple scales, they must form a pyramid, i.e. they must be + a monotically decreasing geometric sequence with a factor of 1/2. + sampling_ratio (int): The `sampling_ratio` parameter for the ROIAlign op. + pooler_type (string): Name of the type of pooling operation that should be applied. + For instance, "ROIPool" or "ROIAlignV2". + canonical_box_size (int): A canonical box size in pixels (sqrt(box area)). The default + is heuristically defined as 224 pixels in the FPN paper (based on ImageNet + pre-training). + canonical_level (int): The feature map level index from which a canonically-sized box + should be placed. The default is defined as level 4 (stride=16) in the FPN paper, + i.e., a box of size 224x224 will be placed on the feature with stride=16. + The box placement for all boxes will be determined from their sizes w.r.t + canonical_box_size. For example, a box whose area is 4x that of a canonical box + should be used to pool features from feature level ``canonical_level+1``. + + Note that the actual input feature maps given to this module may not have + sufficiently many levels for the input boxes. If the boxes are too large or too + small for the input feature maps, the closest level will be used. + """ + super().__init__() + + if isinstance(output_size, int): + output_size = (output_size, output_size) + assert len(output_size) == 2 + assert isinstance(output_size[0], int) and isinstance(output_size[1], int) + self.output_size = output_size + + if pooler_type == "ROIAlign": + self.level_poolers = nn.ModuleList( + ROIAlign( + output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=False + ) + for scale in scales + ) + elif pooler_type == "ROIAlignV2": + self.level_poolers = nn.ModuleList( + ROIAlign( + output_size, spatial_scale=scale, sampling_ratio=sampling_ratio, aligned=True + ) + for scale in scales + ) + elif pooler_type == "ROIPool": + self.level_poolers = nn.ModuleList( + RoIPool(output_size, spatial_scale=scale) for scale in scales + ) + elif pooler_type == "ROIAlignRotated": + self.level_poolers = nn.ModuleList( + ROIAlignRotated(output_size, spatial_scale=scale, sampling_ratio=sampling_ratio) + for scale in scales + ) + else: + raise ValueError("Unknown pooler type: {}".format(pooler_type)) + + # Map scale (defined as 1 / stride) to its feature map level under the + # assumption that stride is a power of 2. 
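+        # e.g. scales (1/4, 1/8, 1/16, 1/32) correspond to strides 4-32, i.e.
+        # FPN levels p2-p5, giving min_level = 2 and max_level = 5.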
+ min_level = -(math.log2(scales[0])) + max_level = -(math.log2(scales[-1])) + assert math.isclose(min_level, int(min_level)) and math.isclose( + max_level, int(max_level) + ), "Featuremap stride is not power of 2!" + self.min_level = int(min_level) + self.max_level = int(max_level) + assert ( + len(scales) == self.max_level - self.min_level + 1 + ), "[ROIPooler] Sizes of input featuremaps do not form a pyramid!" + assert 0 < self.min_level and self.min_level <= self.max_level + self.canonical_level = canonical_level + assert canonical_box_size > 0 + self.canonical_box_size = canonical_box_size + + def forward(self, x, box_lists): + """ + Args: + x (list[Tensor]): A list of feature maps of NCHW shape, with scales matching those + used to construct this module. + box_lists (list[Boxes] | list[RotatedBoxes]): + A list of N Boxes or N RotatedBoxes, where N is the number of images in the batch. + The box coordinates are defined on the original image and + will be scaled by the `scales` argument of :class:`ROIPooler`. + + Returns: + Tensor: + A tensor of shape (M, C, output_size, output_size) where M is the total number of + boxes aggregated over all N batch images and C is the number of channels in `x`. + """ + num_level_assignments = len(self.level_poolers) + + assert isinstance(x, list) and isinstance( + box_lists, list + ), "Arguments to pooler must be lists" + assert ( + len(x) == num_level_assignments + ), "unequal value, num_level_assignments={}, but x is list of {} Tensors".format( + num_level_assignments, len(x) + ) + + assert len(box_lists) == x[0].size( + 0 + ), "unequal value, x[0] batch dim 0 is {}, but box_list has length {}".format( + x[0].size(0), len(box_lists) + ) + + pooler_fmt_boxes = convert_boxes_to_pooler_format(box_lists) + + if num_level_assignments == 1: + return self.level_poolers[0](x[0], pooler_fmt_boxes) + + level_assignments = assign_boxes_to_levels( + box_lists, self.min_level, self.max_level, self.canonical_box_size, self.canonical_level + ) + + num_boxes = len(pooler_fmt_boxes) + num_channels = x[0].shape[1] + output_size = self.output_size[0] + + dtype, device = x[0].dtype, x[0].device + output = torch.zeros( + (num_boxes, num_channels, output_size, output_size), dtype=dtype, device=device + ) + + for level, (x_level, pooler) in enumerate(zip(x, self.level_poolers)): + inds = torch.nonzero(level_assignments == level, as_tuple=True)[0] + pooler_fmt_boxes_level = pooler_fmt_boxes[inds] + output[inds] = pooler(x_level, pooler_fmt_boxes_level) + + return output diff --git a/detectron2/modeling/postprocessing.py b/detectron2/modeling/postprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..e85541ff2e25568cdb9c73702f6c9e68a23f6e4c --- /dev/null +++ b/detectron2/modeling/postprocessing.py @@ -0,0 +1,79 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from torch.nn import functional as F + +from detectron2.layers import paste_masks_in_image +from detectron2.structures import Instances +from detectron2.utils.memory import retry_if_cuda_oom + + +def detector_postprocess(results, output_height, output_width, mask_threshold=0.5): + """ + Resize the output instances. + The input images are often resized when entering an object detector. + As a result, we often need the outputs of the detector in a different + resolution from its inputs. + + This function will resize the raw outputs of an R-CNN detector + to produce outputs according to the desired output resolution. 
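+
+    For example, if the detector ran on an input resized to 800x1067 while the
+    original image is 480x640, calling this function with output_height=480 and
+    output_width=640 maps the predicted boxes (and keypoints/masks, if present)
+    back to the original resolution.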
+ + Args: + results (Instances): the raw outputs from the detector. + `results.image_size` contains the input image resolution the detector sees. + This object might be modified in-place. + output_height, output_width: the desired output resolution. + + Returns: + Instances: the resized output from the model, based on the output resolution + """ + scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0]) + results = Instances((output_height, output_width), **results.get_fields()) + + if results.has("pred_boxes"): + output_boxes = results.pred_boxes + elif results.has("proposal_boxes"): + output_boxes = results.proposal_boxes + + output_boxes.scale(scale_x, scale_y) + output_boxes.clip(results.image_size) + + results = results[output_boxes.nonempty()] + + if results.has("pred_masks"): + results.pred_masks = retry_if_cuda_oom(paste_masks_in_image)( + results.pred_masks[:, 0, :, :], # N, 1, M, M + results.pred_boxes, + results.image_size, + threshold=mask_threshold, + ) + + if results.has("pred_keypoints"): + results.pred_keypoints[:, :, 0] *= scale_x + results.pred_keypoints[:, :, 1] *= scale_y + + return results + + +def sem_seg_postprocess(result, img_size, output_height, output_width): + """ + Return semantic segmentation predictions in the original resolution. + + The input images are often resized when entering semantic segmentor. Moreover, in same + cases, they also padded inside segmentor to be divisible by maximum network stride. + As a result, we often need the predictions of the segmentor in a different + resolution from its inputs. + + Args: + result (Tensor): semantic segmentation prediction logits. A tensor of shape (C, H, W), + where C is the number of classes, and H, W are the height and width of the prediction. + img_size (tuple): image size that segmentor is taking as input. + output_height, output_width: the desired output resolution. + + Returns: + semantic segmentation prediction (Tensor): A tensor of the shape + (C, output_height, output_width) that contains per-pixel soft predictions. + """ + result = result[:, : img_size[0], : img_size[1]].expand(1, -1, -1, -1) + result = F.interpolate( + result, size=(output_height, output_width), mode="bilinear", align_corners=False + )[0] + return result diff --git a/detectron2/modeling/proposal_generator/__init__.py b/detectron2/modeling/proposal_generator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..64fb6d46359c05ed3d7aa1ec91fdd6e15b14c932 --- /dev/null +++ b/detectron2/modeling/proposal_generator/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .build import PROPOSAL_GENERATOR_REGISTRY, build_proposal_generator +from .rpn import RPN_HEAD_REGISTRY, build_rpn_head, RPN diff --git a/detectron2/modeling/proposal_generator/build.py b/detectron2/modeling/proposal_generator/build.py new file mode 100644 index 0000000000000000000000000000000000000000..7f252bcb982032cd09270c44741772a34ef32277 --- /dev/null +++ b/detectron2/modeling/proposal_generator/build.py @@ -0,0 +1,24 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from detectron2.utils.registry import Registry + +PROPOSAL_GENERATOR_REGISTRY = Registry("PROPOSAL_GENERATOR") +PROPOSAL_GENERATOR_REGISTRY.__doc__ = """ +Registry for proposal generator, which produces object proposals from feature maps. + +The registered object will be called with `obj(cfg, input_shape)`. +The call should return a `nn.Module` object. 
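+
+For example, a custom proposal generator can be added with the
+`@PROPOSAL_GENERATOR_REGISTRY.register()` decorator on an `nn.Module` subclass
+and selected through `cfg.MODEL.PROPOSAL_GENERATOR.NAME`.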
+""" + +from . import rpn, rrpn # noqa F401 isort:skip + + +def build_proposal_generator(cfg, input_shape): + """ + Build a proposal generator from `cfg.MODEL.PROPOSAL_GENERATOR.NAME`. + The name can be "PrecomputedProposals" to use no proposal generator. + """ + name = cfg.MODEL.PROPOSAL_GENERATOR.NAME + if name == "PrecomputedProposals": + return None + + return PROPOSAL_GENERATOR_REGISTRY.get(name)(cfg, input_shape) diff --git a/detectron2/modeling/proposal_generator/proposal_utils.py b/detectron2/modeling/proposal_generator/proposal_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d4af90525ba07eb8d313460ee2c3f468fe367cff --- /dev/null +++ b/detectron2/modeling/proposal_generator/proposal_utils.py @@ -0,0 +1,57 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +import torch + +from detectron2.structures import Instances + + +def add_ground_truth_to_proposals(gt_boxes, proposals): + """ + Call `add_ground_truth_to_proposals_single_image` for all images. + + Args: + gt_boxes(list[Boxes]): list of N elements. Element i is a Boxes + representing the gound-truth for image i. + proposals (list[Instances]): list of N elements. Element i is a Instances + representing the proposals for image i. + + Returns: + list[Instances]: list of N Instances. Each is the proposals for the image, + with field "proposal_boxes" and "objectness_logits". + """ + assert gt_boxes is not None + + assert len(proposals) == len(gt_boxes) + if len(proposals) == 0: + return proposals + + return [ + add_ground_truth_to_proposals_single_image(gt_boxes_i, proposals_i) + for gt_boxes_i, proposals_i in zip(gt_boxes, proposals) + ] + + +def add_ground_truth_to_proposals_single_image(gt_boxes, proposals): + """ + Augment `proposals` with ground-truth boxes from `gt_boxes`. + + Args: + Same as `add_ground_truth_to_proposals`, but with gt_boxes and proposals + per image. + + Returns: + Same as `add_ground_truth_to_proposals`, but for only one image. + """ + device = proposals.objectness_logits.device + # Concatenating gt_boxes with proposals requires them to have the same fields + # Assign all ground-truth boxes an objectness logit corresponding to P(object) \approx 1. + gt_logit_value = math.log((1.0 - 1e-10) / (1 - (1.0 - 1e-10))) + + gt_logits = gt_logit_value * torch.ones(len(gt_boxes), device=device) + gt_proposal = Instances(proposals.image_size) + + gt_proposal.proposal_boxes = gt_boxes + gt_proposal.objectness_logits = gt_logits + new_proposals = Instances.cat([proposals, gt_proposal]) + + return new_proposals diff --git a/detectron2/modeling/proposal_generator/rpn.py b/detectron2/modeling/proposal_generator/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..24f230c8f71c6a32ecdfc5fd2d4ed519e765eb9b --- /dev/null +++ b/detectron2/modeling/proposal_generator/rpn.py @@ -0,0 +1,285 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from typing import Dict, List +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.config import configurable +from detectron2.layers import ShapeSpec +from detectron2.structures import Boxes, Instances, pairwise_iou +from detectron2.utils.memory import retry_if_cuda_oom +from detectron2.utils.registry import Registry + +from ..anchor_generator import build_anchor_generator +from ..box_regression import Box2BoxTransform +from ..matcher import Matcher +from ..sampling import subsample_labels +from .build import PROPOSAL_GENERATOR_REGISTRY +from .rpn_outputs import RPNOutputs, find_top_rpn_proposals + +RPN_HEAD_REGISTRY = Registry("RPN_HEAD") +RPN_HEAD_REGISTRY.__doc__ = """ +Registry for RPN heads, which take feature maps and perform +objectness classification and bounding box regression for anchors. + +The registered object will be called with `obj(cfg, input_shape)`. +The call should return a `nn.Module` object. +""" + + +def build_rpn_head(cfg, input_shape): + """ + Build an RPN head defined by `cfg.MODEL.RPN.HEAD_NAME`. + """ + name = cfg.MODEL.RPN.HEAD_NAME + return RPN_HEAD_REGISTRY.get(name)(cfg, input_shape) + + +@RPN_HEAD_REGISTRY.register() +class StandardRPNHead(nn.Module): + """ + Standard RPN classification and regression heads described in :paper:`Faster R-CNN`. + Uses a 3x3 conv to produce a shared hidden state from which one 1x1 conv predicts + objectness logits for each anchor and a second 1x1 conv predicts bounding-box deltas + specifying how to deform each anchor into an object proposal. + """ + + @configurable + def __init__(self, *, in_channels: int, num_anchors: int, box_dim: int = 4): + """ + NOTE: this interface is experimental. + + Args: + in_channels (int): number of input feature channels. When using multiple + input features, they must have the same number of channels. + num_anchors (int): number of anchors to predict for *each spatial position* + on the feature map. The total number of anchors for each + feature map will be `num_anchors * H * W`. + box_dim (int): dimension of a box, which is also the number of box regression + predictions to make for each anchor. An axis aligned box has + box_dim=4, while a rotated box has box_dim=5. + """ + super().__init__() + # 3x3 conv for the hidden representation + self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) + # 1x1 conv for predicting objectness logits + self.objectness_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1) + # 1x1 conv for predicting box2box transform deltas + self.anchor_deltas = nn.Conv2d(in_channels, num_anchors * box_dim, kernel_size=1, stride=1) + + for l in [self.conv, self.objectness_logits, self.anchor_deltas]: + nn.init.normal_(l.weight, std=0.01) + nn.init.constant_(l.bias, 0) + + @classmethod + def from_config(cls, cfg, input_shape): + # Standard RPN is shared across levels: + in_channels = [s.channels for s in input_shape] + assert len(set(in_channels)) == 1, "Each level must have the same channel!" + in_channels = in_channels[0] + + # RPNHead should take the same input as anchor generator + # NOTE: it assumes that creating an anchor generator does not have unwanted side effect. 
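+        # The generator built here is only used to read `num_anchors` and `box_dim`;
+        # the RPN module builds its own anchor generator separately.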
+ anchor_generator = build_anchor_generator(cfg, input_shape) + num_anchors = anchor_generator.num_anchors + box_dim = anchor_generator.box_dim + assert ( + len(set(num_anchors)) == 1 + ), "Each level must have the same number of anchors per spatial position" + return {"in_channels": in_channels, "num_anchors": num_anchors[0], "box_dim": box_dim} + + def forward(self, features): + """ + Args: + features (list[Tensor]): list of feature maps + + Returns: + list[Tensor]: A list of L elements. + Element i is a tensor of shape (N, A, Hi, Wi) representing + the predicted objectness logits for all anchors. A is the number of cell anchors. + list[Tensor]: A list of L elements. Element i is a tensor of shape + (N, A*box_dim, Hi, Wi) representing the predicted "deltas" used to transform anchors + to proposals. + """ + pred_objectness_logits = [] + pred_anchor_deltas = [] + for x in features: + t = F.relu(self.conv(x)) + pred_objectness_logits.append(self.objectness_logits(t)) + pred_anchor_deltas.append(self.anchor_deltas(t)) + return pred_objectness_logits, pred_anchor_deltas + + +@PROPOSAL_GENERATOR_REGISTRY.register() +class RPN(nn.Module): + """ + Region Proposal Network, introduced by :paper:`Faster R-CNN`. + """ + + def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): + super().__init__() + + # fmt: off + self.min_box_side_len = cfg.MODEL.PROPOSAL_GENERATOR.MIN_SIZE + self.in_features = cfg.MODEL.RPN.IN_FEATURES + self.nms_thresh = cfg.MODEL.RPN.NMS_THRESH + self.batch_size_per_image = cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE + self.positive_fraction = cfg.MODEL.RPN.POSITIVE_FRACTION + self.smooth_l1_beta = cfg.MODEL.RPN.SMOOTH_L1_BETA + self.loss_weight = cfg.MODEL.RPN.LOSS_WEIGHT + # fmt: on + + # Map from self.training state to train/test settings + self.pre_nms_topk = { + True: cfg.MODEL.RPN.PRE_NMS_TOPK_TRAIN, + False: cfg.MODEL.RPN.PRE_NMS_TOPK_TEST, + } + self.post_nms_topk = { + True: cfg.MODEL.RPN.POST_NMS_TOPK_TRAIN, + False: cfg.MODEL.RPN.POST_NMS_TOPK_TEST, + } + self.boundary_threshold = cfg.MODEL.RPN.BOUNDARY_THRESH + + self.anchor_generator = build_anchor_generator( + cfg, [input_shape[f] for f in self.in_features] + ) + self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS) + self.anchor_matcher = Matcher( + cfg.MODEL.RPN.IOU_THRESHOLDS, cfg.MODEL.RPN.IOU_LABELS, allow_low_quality_matches=True + ) + self.rpn_head = build_rpn_head(cfg, [input_shape[f] for f in self.in_features]) + + def _subsample_labels(self, label): + """ + Randomly sample a subset of positive and negative examples, and overwrite + the label vector to the ignore value (-1) for all elements that are not + included in the sample. + + Args: + labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned. + """ + pos_idx, neg_idx = subsample_labels( + label, self.batch_size_per_image, self.positive_fraction, 0 + ) + # Fill with the ignore label (-1), then set positive and negative labels + label.fill_(-1) + label.scatter_(0, pos_idx, 1) + label.scatter_(0, neg_idx, 0) + return label + + @torch.no_grad() + def label_and_sample_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]): + """ + Args: + anchors (list[Boxes]): anchors for each feature map. + gt_instances: the ground-truth instances for each image. + + Returns: + list[Tensor]: + List of #img tensors. i-th element is a vector of labels whose length is + the total number of anchors across feature maps. Label values are in {-1, 0, 1}, + with meanings: -1 = ignore; 0 = negative class; 1 = positive class. 
+ list[Tensor]: + i-th element is a Nx4 tensor, where N is the total number of anchors across + feature maps. The values are the matched gt boxes for each anchor. + Values are undefined for those anchors not labeled as 1. + """ + anchors = Boxes.cat(anchors) + + gt_boxes = [x.gt_boxes for x in gt_instances] + image_sizes = [x.image_size for x in gt_instances] + del gt_instances + + gt_labels = [] + matched_gt_boxes = [] + for image_size_i, gt_boxes_i in zip(image_sizes, gt_boxes): + """ + image_size_i: (h, w) for the i-th image + gt_boxes_i: ground-truth boxes for i-th image + """ + + match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors) + matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix) + # Matching is memory-expensive and may result in CPU tensors. But the result is small + gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device) + del match_quality_matrix + + if self.boundary_threshold >= 0: + # Discard anchors that go out of the boundaries of the image + # NOTE: This is legacy functionality that is turned off by default in Detectron2 + anchors_inside_image = anchors.inside_box(image_size_i, self.boundary_threshold) + gt_labels_i[~anchors_inside_image] = -1 + + # A vector of labels (-1, 0, 1) for each anchor + gt_labels_i = self._subsample_labels(gt_labels_i) + + if len(gt_boxes_i) == 0: + # These values won't be used anyway since the anchor is labeled as background + matched_gt_boxes_i = torch.zeros_like(anchors.tensor) + else: + # TODO wasted indexing computation for ignored boxes + matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor + + gt_labels.append(gt_labels_i) # N,AHW + matched_gt_boxes.append(matched_gt_boxes_i) + return gt_labels, matched_gt_boxes + + def forward(self, images, features, gt_instances=None): + """ + Args: + images (ImageList): input images of length `N` + features (dict[str: Tensor]): input data as a mapping from feature + map name to tensor. Axis 0 represents the number of images `N` in + the input data; axes 1-3 are channels, height, and width, which may + vary between feature maps (e.g., if a feature pyramid is used). + gt_instances (list[Instances], optional): a length `N` list of `Instances`s. + Each `Instances` stores ground-truth instances for the corresponding image. + + Returns: + proposals: list[Instances]: contains fields "proposal_boxes", "objectness_logits" + loss: dict[Tensor] or None + """ + features = [features[f] for f in self.in_features] + pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features) + anchors = self.anchor_generator(features) + + if self.training: + gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances) + else: + gt_labels, gt_boxes = None, None + + outputs = RPNOutputs( + self.box2box_transform, + self.batch_size_per_image, + images, + pred_objectness_logits, + pred_anchor_deltas, + anchors, + gt_labels, + gt_boxes, + self.smooth_l1_beta, + ) + + if self.training: + losses = {k: v * self.loss_weight for k, v in outputs.losses().items()} + else: + losses = {} + + with torch.no_grad(): + # Find the top proposals by applying NMS and removing boxes that + # are too small. The proposals are treated as fixed for approximate + # joint training with roi heads. This approach ignores the derivative + # w.r.t. the proposal boxes’ coordinates that are also network + # responses, so is approximate. 
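+            # `pre_nms_topk` / `post_nms_topk` are dicts keyed by `self.training`,
+            # so the train-time and test-time proposal budgets are selected here
+            # automatically.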
+ proposals = find_top_rpn_proposals( + outputs.predict_proposals(), + outputs.predict_objectness_logits(), + images, + self.nms_thresh, + self.pre_nms_topk[self.training], + self.post_nms_topk[self.training], + self.min_box_side_len, + self.training, + ) + + return proposals, losses diff --git a/detectron2/modeling/proposal_generator/rpn_outputs.py b/detectron2/modeling/proposal_generator/rpn_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..44f846f18b30d846d1d87faf7f2aa3b10c2333b8 --- /dev/null +++ b/detectron2/modeling/proposal_generator/rpn_outputs.py @@ -0,0 +1,323 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import itertools +import logging +import torch +import torch.nn.functional as F +from fvcore.nn import smooth_l1_loss + +from detectron2.layers import batched_nms, cat +from detectron2.structures import Boxes, Instances +from detectron2.utils.events import get_event_storage + +logger = logging.getLogger(__name__) + +# TODO: comments for future refactoring of this module +# +# From @rbg: +# This code involves a significant amount of tensor reshaping and permuting. Look for +# ways to simplify this. + +""" +Shape shorthand in this module: + + N: number of images in the minibatch + L: number of feature maps per image on which RPN is run + A: number of cell anchors (must be the same for all feature maps) + Hi, Wi: height and width of the i-th feature map + 4: size of the box parameterization + +Naming convention: + + objectness: refers to the binary classification of an anchor as object vs. not + object. + + deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box + transform (see :class:`box_regression.Box2BoxTransform`). + + pred_objectness_logits: predicted objectness scores in [-inf, +inf]; use + sigmoid(pred_objectness_logits) to estimate P(object). + + gt_labels: ground-truth binary classification labels for objectness + + pred_anchor_deltas: predicted box2box transform deltas + + gt_anchor_deltas: ground-truth box2box transform deltas +""" + + +def find_top_rpn_proposals( + proposals, + pred_objectness_logits, + images, + nms_thresh, + pre_nms_topk, + post_nms_topk, + min_box_side_len, + training, +): + """ + For each feature map, select the `pre_nms_topk` highest scoring proposals, + apply NMS, clip proposals, and remove small boxes. Return the `post_nms_topk` + highest scoring proposals among all the feature maps if `training` is True, + otherwise, returns the highest `post_nms_topk` scoring proposals for each + feature map. + + Args: + proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 4). + All proposal predictions on the feature maps. + pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). + images (ImageList): Input images as an :class:`ImageList`. + nms_thresh (float): IoU threshold to use for NMS + pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. + When RPN is run on multiple feature maps (as in FPN) this number is per + feature map. + post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. + When RPN is run on multiple feature maps (as in FPN) this number is total, + over all feature maps. + min_box_side_len (float): minimum proposal box side length in pixels (absolute units + wrt input images). + training (bool): True if proposals are to be used in training, otherwise False. + This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." 
+ comment. + + Returns: + proposals (list[Instances]): list of N Instances. The i-th Instances + stores post_nms_topk object proposals for image i, sorted by their + objectness score in descending order. + """ + image_sizes = images.image_sizes # in (h, w) order + num_images = len(image_sizes) + device = proposals[0].device + + # 1. Select top-k anchor for every level and every image + topk_scores = [] # #lvl Tensor, each of shape N x topk + topk_proposals = [] + level_ids = [] # #lvl Tensor, each of shape (topk,) + batch_idx = torch.arange(num_images, device=device) + for level_id, proposals_i, logits_i in zip( + itertools.count(), proposals, pred_objectness_logits + ): + Hi_Wi_A = logits_i.shape[1] + num_proposals_i = min(pre_nms_topk, Hi_Wi_A) + + # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812) + # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) + logits_i, idx = logits_i.sort(descending=True, dim=1) + topk_scores_i = logits_i[batch_idx, :num_proposals_i] + topk_idx = idx[batch_idx, :num_proposals_i] + + # each is N x topk + topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 4 + + topk_proposals.append(topk_proposals_i) + topk_scores.append(topk_scores_i) + level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)) + + # 2. Concat all levels together + topk_scores = cat(topk_scores, dim=1) + topk_proposals = cat(topk_proposals, dim=1) + level_ids = cat(level_ids, dim=0) + + # 3. For each image, run a per-level NMS, and choose topk results. + results = [] + for n, image_size in enumerate(image_sizes): + boxes = Boxes(topk_proposals[n]) + scores_per_img = topk_scores[n] + lvl = level_ids + + valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img) + if not valid_mask.all(): + if training: + raise FloatingPointError( + "Predicted boxes or scores contain Inf/NaN. Training has diverged." + ) + boxes = boxes[valid_mask] + scores_per_img = scores_per_img[valid_mask] + lvl = lvl[valid_mask] + boxes.clip(image_size) + + # filter empty boxes + keep = boxes.nonempty(threshold=min_box_side_len) + if keep.sum().item() != len(boxes): + boxes, scores_per_img, lvl = boxes[keep], scores_per_img[keep], lvl[keep] + + keep = batched_nms(boxes.tensor, scores_per_img, lvl, nms_thresh) + # In Detectron1, there was different behavior during training vs. testing. + # (https://github.com/facebookresearch/Detectron/issues/459) + # During training, topk is over the proposals from *all* images in the training batch. + # During testing, it is over the proposals for each image separately. + # As a result, the training behavior becomes batch-dependent, + # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size. + # This bug is addressed in Detectron2 to make the behavior independent of batch size. + keep = keep[:post_nms_topk] # keep is already sorted + + res = Instances(image_size) + res.proposal_boxes = boxes[keep] + res.objectness_logits = scores_per_img[keep] + results.append(res) + return results + + +def rpn_losses( + gt_labels, gt_anchor_deltas, pred_objectness_logits, pred_anchor_deltas, smooth_l1_beta +): + """ + Args: + gt_labels (Tensor): shape (N,), each element in {-1, 0, 1} representing + ground-truth objectness labels with: -1 = ignore; 0 = not object; 1 = object. 
+ gt_anchor_deltas (Tensor): shape (N, box_dim), row i represents ground-truth + box2box transform targets (dx, dy, dw, dh) or (dx, dy, dw, dh, da) that map anchor i to + its matched ground-truth box. + pred_objectness_logits (Tensor): shape (N,), each element is a predicted objectness + logit. + pred_anchor_deltas (Tensor): shape (N, box_dim), each row is a predicted box2box + transform (dx, dy, dw, dh) or (dx, dy, dw, dh, da) + smooth_l1_beta (float): The transition point between L1 and L2 loss in + the smooth L1 loss function. When set to 0, the loss becomes L1. When + set to +inf, the loss becomes constant 0. + + Returns: + objectness_loss, localization_loss, both unnormalized (summed over samples). + """ + pos_masks = gt_labels == 1 + localization_loss = smooth_l1_loss( + pred_anchor_deltas[pos_masks], gt_anchor_deltas[pos_masks], smooth_l1_beta, reduction="sum" + ) + + valid_masks = gt_labels >= 0 + objectness_loss = F.binary_cross_entropy_with_logits( + pred_objectness_logits[valid_masks], + gt_labels[valid_masks].to(torch.float32), + reduction="sum", + ) + return objectness_loss, localization_loss + + +class RPNOutputs(object): + def __init__( + self, + box2box_transform, + batch_size_per_image, + images, + pred_objectness_logits, + pred_anchor_deltas, + anchors, + gt_labels=None, + gt_boxes=None, + smooth_l1_beta=0.0, + ): + """ + Args: + box2box_transform (Box2BoxTransform): :class:`Box2BoxTransform` instance for + anchor-proposal transformations. + images (ImageList): :class:`ImageList` instance representing N input images + batch_size_per_image (int): number of proposals to sample when training + pred_objectness_logits (list[Tensor]): A list of L elements. + Element i is a tensor of shape (N, A, Hi, Wi) representing + the predicted objectness logits for anchors. + pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape + (N, A*4 or 5, Hi, Wi) representing the predicted "deltas" used to transform anchors + to proposals. + anchors (list[Boxes or RotatedBoxes]): A list of Boxes/RotatedBoxes storing the all + the anchors for each feature map. See :meth:`AnchorGenerator.forward`. + gt_labels (list[Tensor]): Available on in training. + See :meth:`RPN.label_and_sample_anchors`. + gt_boxes (list[Boxes or RotatedBoxes]): Available on in training. + See :meth:`RPN.label_and_sample_anchors`. + smooth_l1_beta (float): The transition point between L1 and L2 loss in + the smooth L1 loss function. When set to 0, the loss becomes L1. When + set to +inf, the loss becomes constant 0. + """ + self.box2box_transform = box2box_transform + self.batch_size_per_image = batch_size_per_image + + B = anchors[0].tensor.size(1) # box dimension (4 or 5) + self.pred_objectness_logits = [ + # Reshape: (N, A, Hi, Wi) -> (N, Hi, Wi, A) -> (N, Hi*Wi*A) + score.permute(0, 2, 3, 1).flatten(1) + for score in pred_objectness_logits + ] + + self.pred_anchor_deltas = [ + # Reshape: (N, A*B, Hi, Wi) -> (N, A, B, Hi, Wi) -> (N, Hi, Wi, A, B) + # -> (N, Hi*Wi*A, B) + x.view(x.shape[0], -1, B, x.shape[-2], x.shape[-1]) + .permute(0, 3, 4, 1, 2) + .flatten(1, -2) + for x in pred_anchor_deltas + ] + + self.anchors = anchors + + self.gt_boxes = gt_boxes + self.gt_labels = gt_labels + + self.num_images = len(images) + self.smooth_l1_beta = smooth_l1_beta + + def losses(self): + """ + Return the losses from a set of RPN predictions and their associated ground-truth. + + Returns: + dict[loss name -> loss value]: A dict mapping from loss name to loss value. 
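+                The losses are summed over the sampled anchors and normalized by
+                `batch_size_per_image * num_images`.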
+ Loss names are: `loss_rpn_cls` for objectness classification and + `loss_rpn_loc` for proposal localization. + """ + gt_labels = torch.stack(self.gt_labels) + anchors = self.anchors[0].cat(self.anchors).tensor # Ax(4 or 5) + gt_anchor_deltas = [self.box2box_transform.get_deltas(anchors, k) for k in self.gt_boxes] + gt_anchor_deltas = torch.stack(gt_anchor_deltas) + + # Log the number of positive/negative anchors per-image that's used in training + num_pos_anchors = (gt_labels == 1).sum().item() + num_neg_anchors = (gt_labels == 0).sum().item() + storage = get_event_storage() + storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / self.num_images) + storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / self.num_images) + + objectness_loss, localization_loss = rpn_losses( + gt_labels, + gt_anchor_deltas, + # concat on the Hi*Wi*A dimension + cat(self.pred_objectness_logits, dim=1), + cat(self.pred_anchor_deltas, dim=1), + self.smooth_l1_beta, + ) + normalizer = self.batch_size_per_image * self.num_images + return { + "loss_rpn_cls": objectness_loss / normalizer, + "loss_rpn_loc": localization_loss / normalizer, + } + + def predict_proposals(self): + """ + Transform anchors into proposals by applying the predicted anchor deltas. + + Returns: + proposals (list[Tensor]): A list of L tensors. Tensor i has shape + (N, Hi*Wi*A, B), where B is box dimension (4 or 5). + """ + proposals = [] + # For each feature map + for anchors_i, pred_anchor_deltas_i in zip(self.anchors, self.pred_anchor_deltas): + B = anchors_i.tensor.size(1) + N = self.num_images + pred_anchor_deltas_i = pred_anchor_deltas_i.reshape(-1, B) + # Expand anchors to shape (N*Hi*Wi*A, B) + anchors_i = anchors_i.tensor.unsqueeze(0).expand(N, -1, -1).reshape(-1, B) + proposals_i = self.box2box_transform.apply_deltas(pred_anchor_deltas_i, anchors_i) + # Append feature map proposals with shape (N, Hi*Wi*A, B) + proposals.append(proposals_i.view(N, -1, B)) + return proposals + + def predict_objectness_logits(self): + """ + Return objectness logits in the same format as the proposals returned by + :meth:`predict_proposals`. + + Returns: + pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape + (N, Hi*Wi*A). + """ + return self.pred_objectness_logits diff --git a/detectron2/modeling/proposal_generator/rrpn.py b/detectron2/modeling/proposal_generator/rrpn.py new file mode 100644 index 0000000000000000000000000000000000000000..b0f5f336a6b80834b8e0c6e3a16721bcaa33acbd --- /dev/null +++ b/detectron2/modeling/proposal_generator/rrpn.py @@ -0,0 +1,233 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import itertools +import logging +from typing import Dict, List +import torch + +from detectron2.layers import ShapeSpec, batched_nms_rotated, cat +from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated +from detectron2.utils.memory import retry_if_cuda_oom + +from ..box_regression import Box2BoxTransformRotated +from .build import PROPOSAL_GENERATOR_REGISTRY +from .rpn import RPN +from .rpn_outputs import RPNOutputs + +logger = logging.getLogger(__name__) + + +def find_top_rrpn_proposals( + proposals, + pred_objectness_logits, + images, + nms_thresh, + pre_nms_topk, + post_nms_topk, + min_box_side_len, + training, +): + """ + For each feature map, select the `pre_nms_topk` highest scoring proposals, + apply NMS, clip proposals, and remove small boxes. 
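+    NMS uses rotated-box IoU (`batched_nms_rotated`) and is applied per feature
+    level, so proposals from different levels do not suppress each other.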
Return the `post_nms_topk` + highest scoring proposals among all the feature maps if `training` is True, + otherwise, returns the highest `post_nms_topk` scoring proposals for each + feature map. + + Args: + proposals (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A, 5). + All proposal predictions on the feature maps. + pred_objectness_logits (list[Tensor]): A list of L tensors. Tensor i has shape (N, Hi*Wi*A). + images (ImageList): Input images as an :class:`ImageList`. + nms_thresh (float): IoU threshold to use for NMS + pre_nms_topk (int): number of top k scoring proposals to keep before applying NMS. + When RRPN is run on multiple feature maps (as in FPN) this number is per + feature map. + post_nms_topk (int): number of top k scoring proposals to keep after applying NMS. + When RRPN is run on multiple feature maps (as in FPN) this number is total, + over all feature maps. + min_box_side_len (float): minimum proposal box side length in pixels (absolute units + wrt input images). + training (bool): True if proposals are to be used in training, otherwise False. + This arg exists only to support a legacy bug; look for the "NB: Legacy bug ..." + comment. + + Returns: + proposals (list[Instances]): list of N Instances. The i-th Instances + stores post_nms_topk object proposals for image i. + """ + image_sizes = images.image_sizes # in (h, w) order + num_images = len(image_sizes) + device = proposals[0].device + + # 1. Select top-k anchor for every level and every image + topk_scores = [] # #lvl Tensor, each of shape N x topk + topk_proposals = [] + level_ids = [] # #lvl Tensor, each of shape (topk,) + batch_idx = torch.arange(num_images, device=device) + for level_id, proposals_i, logits_i in zip( + itertools.count(), proposals, pred_objectness_logits + ): + Hi_Wi_A = logits_i.shape[1] + num_proposals_i = min(pre_nms_topk, Hi_Wi_A) + + # sort is faster than topk (https://github.com/pytorch/pytorch/issues/22812) + # topk_scores_i, topk_idx = logits_i.topk(num_proposals_i, dim=1) + logits_i, idx = logits_i.sort(descending=True, dim=1) + topk_scores_i = logits_i[batch_idx, :num_proposals_i] + topk_idx = idx[batch_idx, :num_proposals_i] + + # each is N x topk + topk_proposals_i = proposals_i[batch_idx[:, None], topk_idx] # N x topk x 5 + + topk_proposals.append(topk_proposals_i) + topk_scores.append(topk_scores_i) + level_ids.append(torch.full((num_proposals_i,), level_id, dtype=torch.int64, device=device)) + + # 2. Concat all levels together + topk_scores = cat(topk_scores, dim=1) + topk_proposals = cat(topk_proposals, dim=1) + level_ids = cat(level_ids, dim=0) + + # 3. For each image, run a per-level NMS, and choose topk results. + results = [] + for n, image_size in enumerate(image_sizes): + boxes = RotatedBoxes(topk_proposals[n]) + scores_per_img = topk_scores[n] + valid_mask = torch.isfinite(boxes.tensor).all(dim=1) & torch.isfinite(scores_per_img) + if not valid_mask.all(): + boxes = boxes[valid_mask] + scores_per_img = scores_per_img[valid_mask] + boxes.clip(image_size) + + # filter empty boxes + keep = boxes.nonempty(threshold=min_box_side_len) + lvl = level_ids + if keep.sum().item() != len(boxes): + boxes, scores_per_img, lvl = (boxes[keep], scores_per_img[keep], level_ids[keep]) + + keep = batched_nms_rotated(boxes.tensor, scores_per_img, lvl, nms_thresh) + # In Detectron1, there was different behavior during training vs. testing. 
+ # (https://github.com/facebookresearch/Detectron/issues/459) + # During training, topk is over the proposals from *all* images in the training batch. + # During testing, it is over the proposals for each image separately. + # As a result, the training behavior becomes batch-dependent, + # and the configuration "POST_NMS_TOPK_TRAIN" end up relying on the batch size. + # This bug is addressed in Detectron2 to make the behavior independent of batch size. + keep = keep[:post_nms_topk] + + res = Instances(image_size) + res.proposal_boxes = boxes[keep] + res.objectness_logits = scores_per_img[keep] + results.append(res) + return results + + +@PROPOSAL_GENERATOR_REGISTRY.register() +class RRPN(RPN): + """ + Rotated Region Proposal Network described in :paper:`RRPN`. + """ + + def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): + super().__init__(cfg, input_shape) + self.box2box_transform = Box2BoxTransformRotated(weights=cfg.MODEL.RPN.BBOX_REG_WEIGHTS) + if self.boundary_threshold >= 0: + raise NotImplementedError( + "boundary_threshold is a legacy option not implemented for RRPN." + ) + + @torch.no_grad() + def label_and_sample_anchors(self, anchors: List[RotatedBoxes], gt_instances: List[Instances]): + """ + Args: + anchors (list[RotatedBoxes]): anchors for each feature map. + gt_instances: the ground-truth instances for each image. + + Returns: + list[Tensor]: + List of #img tensors. i-th element is a vector of labels whose length is + the total number of anchors across feature maps. Label values are in {-1, 0, 1}, + with meanings: -1 = ignore; 0 = negative class; 1 = positive class. + list[Tensor]: + i-th element is a Nx5 tensor, where N is the total number of anchors across + feature maps. The values are the matched gt boxes for each anchor. + Values are undefined for those anchors not labeled as 1. + """ + anchors = RotatedBoxes.cat(anchors) + + gt_boxes = [x.gt_boxes for x in gt_instances] + del gt_instances + + gt_labels = [] + matched_gt_boxes = [] + for gt_boxes_i in gt_boxes: + """ + gt_boxes_i: ground-truth boxes for i-th image + """ + match_quality_matrix = retry_if_cuda_oom(pairwise_iou_rotated)(gt_boxes_i, anchors) + matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix) + # Matching is memory-expensive and may result in CPU tensors. 
But the result is small + gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device) + + # A vector of labels (-1, 0, 1) for each anchor + gt_labels_i = self._subsample_labels(gt_labels_i) + + if len(gt_boxes_i) == 0: + # These values won't be used anyway since the anchor is labeled as background + matched_gt_boxes_i = torch.zeros_like(anchors.tensor) + else: + # TODO wasted indexing computation for ignored boxes + matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor + + gt_labels.append(gt_labels_i) # N,AHW + matched_gt_boxes.append(matched_gt_boxes_i) + return gt_labels, matched_gt_boxes + + def forward(self, images, features, gt_instances=None): + # same signature as RPN.forward + features = [features[f] for f in self.in_features] + pred_objectness_logits, pred_anchor_deltas = self.rpn_head(features) + anchors = self.anchor_generator(features) + + if self.training: + gt_labels, gt_boxes = self.label_and_sample_anchors(anchors, gt_instances) + else: + gt_labels, gt_boxes = None, None + + outputs = RPNOutputs( + self.box2box_transform, + self.batch_size_per_image, + images, + pred_objectness_logits, + pred_anchor_deltas, + anchors, + gt_labels, + gt_boxes, + self.smooth_l1_beta, + ) + + if self.training: + losses = {k: v * self.loss_weight for k, v in outputs.losses().items()} + else: + losses = {} + + with torch.no_grad(): + # Find the top proposals by applying NMS and removing boxes that + # are too small. The proposals are treated as fixed for approximate + # joint training with roi heads. This approach ignores the derivative + # w.r.t. the proposal boxes’ coordinates that are also network + # responses, so is approximate. + + # Note: this line is the only difference v.s. RPN.forward + proposals = find_top_rrpn_proposals( + outputs.predict_proposals(), + outputs.predict_objectness_logits(), + images, + self.nms_thresh, + self.pre_nms_topk[self.training], + self.post_nms_topk[self.training], + self.min_box_side_len, + self.training, + ) + + return proposals, losses diff --git a/detectron2/modeling/roi_heads/__init__.py b/detectron2/modeling/roi_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a49099aa5cfa58b55c66fe8fa85092eb26d15535 --- /dev/null +++ b/detectron2/modeling/roi_heads/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .box_head import ROI_BOX_HEAD_REGISTRY, build_box_head +from .keypoint_head import ROI_KEYPOINT_HEAD_REGISTRY, build_keypoint_head, BaseKeypointRCNNHead +from .mask_head import ROI_MASK_HEAD_REGISTRY, build_mask_head, BaseMaskRCNNHead +from .roi_heads import ( + ROI_HEADS_REGISTRY, + ROIHeads, + Res5ROIHeads, + StandardROIHeads, + build_roi_heads, + select_foreground_proposals, +) +from .rotated_fast_rcnn import RROIHeads +from .fast_rcnn import FastRCNNOutputLayers + +from . import cascade_rcnn # isort:skip diff --git a/detectron2/modeling/roi_heads/box_head.py b/detectron2/modeling/roi_heads/box_head.py new file mode 100644 index 0000000000000000000000000000000000000000..de62d47acfd0ac634daf7db228b43f035cc721f3 --- /dev/null +++ b/detectron2/modeling/roi_heads/box_head.py @@ -0,0 +1,115 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import numpy as np +from typing import List +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, Linear, ShapeSpec, get_norm +from detectron2.utils.registry import Registry + +ROI_BOX_HEAD_REGISTRY = Registry("ROI_BOX_HEAD") +ROI_BOX_HEAD_REGISTRY.__doc__ = """ +Registry for box heads, which make box predictions from per-region features. + +The registered object will be called with `obj(cfg, input_shape)`. +""" + + +@ROI_BOX_HEAD_REGISTRY.register() +class FastRCNNConvFCHead(nn.Module): + """ + A head with several 3x3 conv layers (each followed by norm & relu) and then + several fc layers (each followed by relu). + """ + + @configurable + def __init__( + self, input_shape: ShapeSpec, *, conv_dims: List[int], fc_dims: List[int], conv_norm="" + ): + """ + NOTE: this interface is experimental. + + Args: + input_shape (ShapeSpec): shape of the input feature. + conv_dims (list[int]): the output dimensions of the conv layers + fc_dims (list[int]): the output dimensions of the fc layers + conv_norm (str or callable): normalization for the conv layers. + See :func:`detectron2.layers.get_norm` for supported types. + """ + super().__init__() + assert len(conv_dims) + len(fc_dims) > 0 + + self._output_size = (input_shape.channels, input_shape.height, input_shape.width) + + self.conv_norm_relus = [] + for k, conv_dim in enumerate(conv_dims): + conv = Conv2d( + self._output_size[0], + conv_dim, + kernel_size=3, + padding=1, + bias=not conv_norm, + norm=get_norm(conv_norm, conv_dim), + activation=F.relu, + ) + self.add_module("conv{}".format(k + 1), conv) + self.conv_norm_relus.append(conv) + self._output_size = (conv_dim, self._output_size[1], self._output_size[2]) + + self.fcs = [] + for k, fc_dim in enumerate(fc_dims): + fc = Linear(np.prod(self._output_size), fc_dim) + self.add_module("fc{}".format(k + 1), fc) + self.fcs.append(fc) + self._output_size = fc_dim + + for layer in self.conv_norm_relus: + weight_init.c2_msra_fill(layer) + for layer in self.fcs: + weight_init.c2_xavier_fill(layer) + + @classmethod + def from_config(cls, cfg, input_shape): + num_conv = cfg.MODEL.ROI_BOX_HEAD.NUM_CONV + conv_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_DIM + num_fc = cfg.MODEL.ROI_BOX_HEAD.NUM_FC + fc_dim = cfg.MODEL.ROI_BOX_HEAD.FC_DIM + return { + "input_shape": input_shape, + "conv_dims": [conv_dim] * num_conv, + "fc_dims": [fc_dim] * num_fc, + "conv_norm": cfg.MODEL.ROI_BOX_HEAD.NORM, + } + + def forward(self, x): + for layer in self.conv_norm_relus: + x = layer(x) + if len(self.fcs): + if x.dim() > 2: + x = torch.flatten(x, start_dim=1) + for layer in self.fcs: + x = F.relu(layer(x)) + return x + + @property + def output_shape(self): + """ + Returns: + ShapeSpec: the output feature shape + """ + o = self._output_size + if isinstance(o, int): + return ShapeSpec(channels=o) + else: + return ShapeSpec(channels=o[0], height=o[1], width=o[2]) + + +def build_box_head(cfg, input_shape): + """ + Build a box head defined by `cfg.MODEL.ROI_BOX_HEAD.NAME`. 
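+    For example, the `FastRCNNConvFCHead` registered above is selected when the
+    name is "FastRCNNConvFCHead".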
+ """ + name = cfg.MODEL.ROI_BOX_HEAD.NAME + return ROI_BOX_HEAD_REGISTRY.get(name)(cfg, input_shape) diff --git a/detectron2/modeling/roi_heads/cascade_rcnn.py b/detectron2/modeling/roi_heads/cascade_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..b3efdcf70c3b71b935676e103be288484c66f4e2 --- /dev/null +++ b/detectron2/modeling/roi_heads/cascade_rcnn.py @@ -0,0 +1,298 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from typing import List +import torch +from torch import nn +from torch.autograd.function import Function + +from detectron2.config import configurable +from detectron2.layers import ShapeSpec +from detectron2.structures import Boxes, Instances, pairwise_iou +from detectron2.utils.events import get_event_storage + +from ..box_regression import Box2BoxTransform +from ..matcher import Matcher +from ..poolers import ROIPooler +from .box_head import build_box_head +from .fast_rcnn import FastRCNNOutputLayers, fast_rcnn_inference +from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads + + +class _ScaleGradient(Function): + @staticmethod + def forward(ctx, input, scale): + ctx.scale = scale + return input + + @staticmethod + def backward(ctx, grad_output): + return grad_output * ctx.scale, None + + +@ROI_HEADS_REGISTRY.register() +class CascadeROIHeads(StandardROIHeads): + """ + Implement :paper:`Cascade R-CNN`. + """ + + @configurable + def __init__( + self, + *, + box_in_features: List[str], + box_pooler: ROIPooler, + box_heads: List[nn.Module], + box_predictors: List[nn.Module], + proposal_matchers: List[Matcher], + **kwargs, + ): + """ + NOTE: this interface is experimental. + + Args: + box_pooler (ROIPooler): pooler that extracts region features from given boxes + box_heads (list[nn.Module]): box head for each cascade stage + box_predictors (list[nn.Module]): box predictor for each cascade stage + proposal_matchers (list[Matcher]): matcher with different IoU thresholds to + match boxes with ground truth for each stage. The first matcher matches + RPN proposals with ground truth, the other matchers use boxes predicted + by the previous stage as proposals and match them with ground truth. + """ + assert "proposal_matcher" not in kwargs, ( + "CascadeROIHeads takes 'proposal_matchers=' for each stage instead " + "of one 'proposal_matcher='." + ) + # The first matcher matches RPN proposals with ground truth, done in the base class + kwargs["proposal_matcher"] = proposal_matchers[0] + num_stages = self.num_cascade_stages = len(box_heads) + box_heads = nn.ModuleList(box_heads) + box_predictors = nn.ModuleList(box_predictors) + assert len(box_predictors) == num_stages, f"{len(box_predictors)} != {num_stages}!" + assert len(proposal_matchers) == num_stages, f"{len(proposal_matchers)} != {num_stages}!" 
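+        # All per-stage heads and predictors are passed to StandardROIHeads as
+        # `nn.ModuleList`s; they are indexed by stage in `_forward_box` and `_run_stage`.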
+ super().__init__( + box_in_features=box_in_features, + box_pooler=box_pooler, + box_head=box_heads, + box_predictor=box_predictors, + **kwargs, + ) + self.proposal_matchers = proposal_matchers + + @classmethod + def from_config(cls, cfg, input_shape): + ret = super().from_config(cfg, input_shape) + ret.pop("proposal_matcher") + return ret + + @classmethod + def _init_box_head(cls, cfg, input_shape): + # fmt: off + in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE + cascade_bbox_reg_weights = cfg.MODEL.ROI_BOX_CASCADE_HEAD.BBOX_REG_WEIGHTS + cascade_ious = cfg.MODEL.ROI_BOX_CASCADE_HEAD.IOUS + assert len(cascade_bbox_reg_weights) == len(cascade_ious) + assert cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, \ + "CascadeROIHeads only support class-agnostic regression now!" + assert cascade_ious[0] == cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS[0] + # fmt: on + + in_channels = [input_shape[f].channels for f in in_features] + # Check all channel counts are equal + assert len(set(in_channels)) == 1, in_channels + in_channels = in_channels[0] + + box_pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type=pooler_type, + ) + pooled_shape = ShapeSpec( + channels=in_channels, width=pooler_resolution, height=pooler_resolution + ) + + box_heads, box_predictors, proposal_matchers = [], [], [] + for match_iou, bbox_reg_weights in zip(cascade_ious, cascade_bbox_reg_weights): + box_head = build_box_head(cfg, pooled_shape) + box_heads.append(box_head) + box_predictors.append( + FastRCNNOutputLayers( + cfg, + box_head.output_shape, + box2box_transform=Box2BoxTransform(weights=bbox_reg_weights), + ) + ) + proposal_matchers.append(Matcher([match_iou], [0, 1], allow_low_quality_matches=False)) + return { + "box_in_features": in_features, + "box_pooler": box_pooler, + "box_heads": box_heads, + "box_predictors": box_predictors, + "proposal_matchers": proposal_matchers, + } + + def forward(self, images, features, proposals, targets=None): + del images + if self.training: + proposals = self.label_and_sample_proposals(proposals, targets) + + if self.training: + # Need targets to box head + losses = self._forward_box(features, proposals, targets) + losses.update(self._forward_mask(features, proposals)) + losses.update(self._forward_keypoint(features, proposals)) + return proposals, losses + else: + pred_instances = self._forward_box(features, proposals) + pred_instances = self.forward_with_given_boxes(features, pred_instances) + return pred_instances, {} + + def _forward_box(self, features, proposals, targets=None): + """ + Args: + features, targets: the same as in + Same as in :meth:`ROIHeads.forward`. + proposals (list[Instances]): the per-image object proposals with + their matching ground truth. + Each has fields "proposal_boxes", and "objectness_logits", + "gt_classes", "gt_boxes". + """ + features = [features[f] for f in self.box_in_features] + head_outputs = [] # (predictor, predictions, proposals) + prev_pred_boxes = None + image_sizes = [x.image_size for x in proposals] + for k in range(self.num_cascade_stages): + if k > 0: + # The output boxes of the previous stage are used to create the input + # proposals of the next stage. 
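+                # During training, these refined boxes are re-matched with the ground
+                # truth by the stage-specific matcher in `proposal_matchers`, which
+                # typically uses a higher IoU threshold than the previous stage.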
+ proposals = self._create_proposals_from_boxes(prev_pred_boxes, image_sizes) + if self.training: + proposals = self._match_and_label_boxes(proposals, k, targets) + predictions = self._run_stage(features, proposals, k) + prev_pred_boxes = self.box_predictor[k].predict_boxes(predictions, proposals) + head_outputs.append((self.box_predictor[k], predictions, proposals)) + + if self.training: + losses = {} + storage = get_event_storage() + for stage, (predictor, predictions, proposals) in enumerate(head_outputs): + with storage.name_scope("stage{}".format(stage)): + stage_losses = predictor.losses(predictions, proposals) + losses.update({k + "_stage{}".format(stage): v for k, v in stage_losses.items()}) + return losses + else: + # Each is a list[Tensor] of length #image. Each tensor is Ri x (K+1) + scores_per_stage = [h[0].predict_probs(h[1], h[2]) for h in head_outputs] + + # Average the scores across heads + scores = [ + sum(list(scores_per_image)) * (1.0 / self.num_cascade_stages) + for scores_per_image in zip(*scores_per_stage) + ] + # Use the boxes of the last head + predictor, predictions, proposals = head_outputs[-1] + boxes = predictor.predict_boxes(predictions, proposals) + pred_instances, _ = fast_rcnn_inference( + boxes, + scores, + image_sizes, + predictor.test_score_thresh, + predictor.test_nms_thresh, + predictor.test_topk_per_image, + ) + return pred_instances + + @torch.no_grad() + def _match_and_label_boxes(self, proposals, stage, targets): + """ + Match proposals with groundtruth using the matcher at the given stage. + Label the proposals as foreground or background based on the match. + + Args: + proposals (list[Instances]): One Instances for each image, with + the field "proposal_boxes". + stage (int): the current stage + targets (list[Instances]): the ground truth instances + + Returns: + list[Instances]: the same proposals, but with fields "gt_classes" and "gt_boxes" + """ + num_fg_samples, num_bg_samples = [], [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + match_quality_matrix = pairwise_iou( + targets_per_image.gt_boxes, proposals_per_image.proposal_boxes + ) + # proposal_labels are 0 or 1 + matched_idxs, proposal_labels = self.proposal_matchers[stage](match_quality_matrix) + if len(targets_per_image) > 0: + gt_classes = targets_per_image.gt_classes[matched_idxs] + # Label unmatched proposals (0 label from matcher) as background (label=num_classes) + gt_classes[proposal_labels == 0] = self.num_classes + gt_boxes = targets_per_image.gt_boxes[matched_idxs] + else: + gt_classes = torch.zeros_like(matched_idxs) + self.num_classes + gt_boxes = Boxes( + targets_per_image.gt_boxes.tensor.new_zeros((len(proposals_per_image), 4)) + ) + proposals_per_image.gt_classes = gt_classes + proposals_per_image.gt_boxes = gt_boxes + + num_fg_samples.append((proposal_labels == 1).sum().item()) + num_bg_samples.append(proposal_labels.numel() - num_fg_samples[-1]) + + # Log the number of fg/bg samples in each stage + storage = get_event_storage() + storage.put_scalar( + "stage{}/roi_head/num_fg_samples".format(stage), + sum(num_fg_samples) / len(num_fg_samples), + ) + storage.put_scalar( + "stage{}/roi_head/num_bg_samples".format(stage), + sum(num_bg_samples) / len(num_bg_samples), + ) + return proposals + + def _run_stage(self, features, proposals, stage): + """ + Args: + features (list[Tensor]): #lvl input features to ROIHeads + proposals (list[Instances]): #image Instances, with the field "proposal_boxes" + stage (int): the current stage + + Returns: + Same 
output as `FastRCNNOutputLayers.forward()`. + """ + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + # The original implementation averages the losses among heads, + # but scale up the parameter gradients of the heads. + # This is equivalent to adding the losses among heads, + # but scale down the gradients on features. + box_features = _ScaleGradient.apply(box_features, 1.0 / self.num_cascade_stages) + box_features = self.box_head[stage](box_features) + return self.box_predictor[stage](box_features) + + def _create_proposals_from_boxes(self, boxes, image_sizes): + """ + Args: + boxes (list[Tensor]): per-image predicted boxes, each of shape Ri x 4 + image_sizes (list[tuple]): list of image shapes in (h, w) + + Returns: + list[Instances]: per-image proposals with the given boxes. + """ + # Just like RPN, the proposals should not have gradients + boxes = [Boxes(b.detach()) for b in boxes] + proposals = [] + for boxes_per_image, image_size in zip(boxes, image_sizes): + boxes_per_image.clip(image_size) + if self.training: + # do not filter empty boxes at inference time, + # because the scores from each stage need to be aligned and added later + boxes_per_image = boxes_per_image[boxes_per_image.nonempty()] + prop = Instances(image_size) + prop.proposal_boxes = boxes_per_image + proposals.append(prop) + return proposals diff --git a/detectron2/modeling/roi_heads/fast_rcnn.py b/detectron2/modeling/roi_heads/fast_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..ca796ace55509efb8a898f580203076bada387f2 --- /dev/null +++ b/detectron2/modeling/roi_heads/fast_rcnn.py @@ -0,0 +1,510 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import torch +from fvcore.nn import smooth_l1_loss +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Linear, ShapeSpec, batched_nms, cat +from detectron2.modeling.box_regression import Box2BoxTransform, apply_deltas_broadcast +from detectron2.structures import Boxes, Instances +from detectron2.utils.events import get_event_storage + +__all__ = ["fast_rcnn_inference", "FastRCNNOutputLayers"] + + +logger = logging.getLogger(__name__) + +""" +Shape shorthand in this module: + + N: number of images in the minibatch + R: number of ROIs, combined over all images, in the minibatch + Ri: number of ROIs in image i + K: number of foreground classes. E.g.,there are 80 foreground classes in COCO. + +Naming convention: + + deltas: refers to the 4-d (dx, dy, dw, dh) deltas that parameterize the box2box + transform (see :class:`box_regression.Box2BoxTransform`). + + pred_class_logits: predicted class scores in [-inf, +inf]; use + softmax(pred_class_logits) to estimate P(class). + + gt_classes: ground-truth classification labels in [0, K], where [0, K) represent + foreground object classes and K represents the background class. + + pred_proposal_deltas: predicted box2box transform deltas for transforming proposals + to detection box predictions. + + gt_proposal_deltas: ground-truth box2box transform deltas +""" + + +def fast_rcnn_inference(boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image): + """ + Call `fast_rcnn_inference_single_image` for all images. + + Args: + boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic + boxes for each image. 
Element i has shape (Ri, K * 4) if doing + class-specific regression, or (Ri, 4) if doing class-agnostic + regression, where Ri is the number of predicted objects for image i. + This is compatible with the output of :meth:`FastRCNNOutputLayers.predict_boxes`. + scores (list[Tensor]): A list of Tensors of predicted class scores for each image. + Element i has shape (Ri, K + 1), where Ri is the number of predicted objects + for image i. Compatible with the output of :meth:`FastRCNNOutputLayers.predict_probs`. + image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. + score_thresh (float): Only return detections with a confidence score exceeding this + threshold. + nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. + topk_per_image (int): The number of top scoring detections to return. Set < 0 to return + all detections. + + Returns: + instances: (list[Instances]): A list of N instances, one for each image in the batch, + that stores the topk most confidence detections. + kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates + the corresponding boxes/scores index in [0, Ri) from the input, for image i. + """ + result_per_image = [ + fast_rcnn_inference_single_image( + boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image + ) + for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes) + ] + return [x[0] for x in result_per_image], [x[1] for x in result_per_image] + + +def fast_rcnn_inference_single_image( + boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image +): + """ + Single-image inference. Return bounding-box detection results by thresholding + on scores and applying non-maximum suppression (NMS). + + Args: + Same as `fast_rcnn_inference`, but with boxes, scores, and image shapes + per image. + + Returns: + Same as `fast_rcnn_inference`, but for only one image. + """ + valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) + if not valid_mask.all(): + boxes = boxes[valid_mask] + scores = scores[valid_mask] + + scores = scores[:, :-1] + num_bbox_reg_classes = boxes.shape[1] // 4 + # Convert to Boxes to use the `clip` function ... + boxes = Boxes(boxes.reshape(-1, 4)) + boxes.clip(image_shape) + boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4) # R x C x 4 + + # Filter results based on detection scores + filter_mask = scores > score_thresh # R x K + # R' x 2. First column contains indices of the R predictions; + # Second column contains indices of classes. + filter_inds = filter_mask.nonzero() + if num_bbox_reg_classes == 1: + boxes = boxes[filter_inds[:, 0], 0] + else: + boxes = boxes[filter_mask] + scores = scores[filter_mask] + + # Apply per-class NMS + keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh) + if topk_per_image >= 0: + keep = keep[:topk_per_image] + boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] + + result = Instances(image_shape) + result.pred_boxes = Boxes(boxes) + result.scores = scores + result.pred_classes = filter_inds[:, 1] + return result, filter_inds[:, 0] + + +class FastRCNNOutputs(object): + """ + A class that stores information about outputs of a Fast R-CNN head. + It provides methods that are used to decode the outputs of a Fast R-CNN head. 
+ """ + + def __init__( + self, + box2box_transform, + pred_class_logits, + pred_proposal_deltas, + proposals, + smooth_l1_beta=0, + ): + """ + Args: + box2box_transform (Box2BoxTransform/Box2BoxTransformRotated): + box2box transform instance for proposal-to-detection transformations. + pred_class_logits (Tensor): A tensor of shape (R, K + 1) storing the predicted class + logits for all R predicted object instances. + Each row corresponds to a predicted object instance. + pred_proposal_deltas (Tensor): A tensor of shape (R, K * B) or (R, B) for + class-specific or class-agnostic regression. It stores the predicted deltas that + transform proposals into final box detections. + B is the box dimension (4 or 5). + When B is 4, each row is [dx, dy, dw, dh (, ....)]. + When B is 5, each row is [dx, dy, dw, dh, da (, ....)]. + proposals (list[Instances]): A list of N Instances, where Instances i stores the + proposals for image i, in the field "proposal_boxes". + When training, each Instances must have ground-truth labels + stored in the field "gt_classes" and "gt_boxes". + The total number of all instances must be equal to R. + smooth_l1_beta (float): The transition point between L1 and L2 loss in + the smooth L1 loss function. When set to 0, the loss becomes L1. When + set to +inf, the loss becomes constant 0. + """ + self.box2box_transform = box2box_transform + self.num_preds_per_image = [len(p) for p in proposals] + self.pred_class_logits = pred_class_logits + self.pred_proposal_deltas = pred_proposal_deltas + self.smooth_l1_beta = smooth_l1_beta + self.image_shapes = [x.image_size for x in proposals] + + if len(proposals): + box_type = type(proposals[0].proposal_boxes) + # cat(..., dim=0) concatenates over all images in the batch + self.proposals = box_type.cat([p.proposal_boxes for p in proposals]) + assert ( + not self.proposals.tensor.requires_grad + ), "Proposals should not require gradients!" + + # The following fields should exist only when training. + if proposals[0].has("gt_boxes"): + self.gt_boxes = box_type.cat([p.gt_boxes for p in proposals]) + assert proposals[0].has("gt_classes") + self.gt_classes = cat([p.gt_classes for p in proposals], dim=0) + else: + self.proposals = Boxes(torch.zeros(0, 4, device=self.pred_proposal_deltas.device)) + self._no_instances = len(proposals) == 0 # no instances found + + def _log_accuracy(self): + """ + Log the accuracy metrics to EventStorage. + """ + num_instances = self.gt_classes.numel() + pred_classes = self.pred_class_logits.argmax(dim=1) + bg_class_ind = self.pred_class_logits.shape[1] - 1 + + fg_inds = (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind) + num_fg = fg_inds.nonzero().numel() + fg_gt_classes = self.gt_classes[fg_inds] + fg_pred_classes = pred_classes[fg_inds] + + num_false_negative = (fg_pred_classes == bg_class_ind).nonzero().numel() + num_accurate = (pred_classes == self.gt_classes).nonzero().numel() + fg_num_accurate = (fg_pred_classes == fg_gt_classes).nonzero().numel() + + storage = get_event_storage() + if num_instances > 0: + storage.put_scalar("fast_rcnn/cls_accuracy", num_accurate / num_instances) + if num_fg > 0: + storage.put_scalar("fast_rcnn/fg_cls_accuracy", fg_num_accurate / num_fg) + storage.put_scalar("fast_rcnn/false_negative", num_false_negative / num_fg) + + def softmax_cross_entropy_loss(self): + """ + Compute the softmax cross entropy loss for box classification. 
+ + Returns: + scalar Tensor + """ + if self._no_instances: + return 0.0 * self.pred_class_logits.sum() + else: + self._log_accuracy() + return F.cross_entropy(self.pred_class_logits, self.gt_classes, reduction="mean") + + def smooth_l1_loss(self): + """ + Compute the smooth L1 loss for box regression. + + Returns: + scalar Tensor + """ + if self._no_instances: + return 0.0 * self.pred_proposal_deltas.sum() + gt_proposal_deltas = self.box2box_transform.get_deltas( + self.proposals.tensor, self.gt_boxes.tensor + ) + box_dim = gt_proposal_deltas.size(1) # 4 or 5 + cls_agnostic_bbox_reg = self.pred_proposal_deltas.size(1) == box_dim + device = self.pred_proposal_deltas.device + + bg_class_ind = self.pred_class_logits.shape[1] - 1 + + # Box delta loss is only computed between the prediction for the gt class k + # (if 0 <= k < bg_class_ind) and the target; there is no loss defined on predictions + # for non-gt classes and background. + # Empty fg_inds produces a valid loss of zero as long as the size_average + # arg to smooth_l1_loss is False (otherwise it uses torch.mean internally + # and would produce a nan loss). + fg_inds = torch.nonzero( + (self.gt_classes >= 0) & (self.gt_classes < bg_class_ind), as_tuple=True + )[0] + if cls_agnostic_bbox_reg: + # pred_proposal_deltas only corresponds to foreground class for agnostic + gt_class_cols = torch.arange(box_dim, device=device) + else: + fg_gt_classes = self.gt_classes[fg_inds] + # pred_proposal_deltas for class k are located in columns [b * k : b * k + b], + # where b is the dimension of box representation (4 or 5) + # Note that compared to Detectron1, + # we do not perform bounding box regression for background classes. + gt_class_cols = box_dim * fg_gt_classes[:, None] + torch.arange(box_dim, device=device) + + loss_box_reg = smooth_l1_loss( + self.pred_proposal_deltas[fg_inds[:, None], gt_class_cols], + gt_proposal_deltas[fg_inds], + self.smooth_l1_beta, + reduction="sum", + ) + # The loss is normalized using the total number of regions (R), not the number + # of foreground regions even though the box regression loss is only defined on + # foreground regions. Why? Because doing so gives equal training influence to + # each foreground example. To see how, consider two different minibatches: + # (1) Contains a single foreground region + # (2) Contains 100 foreground regions + # If we normalize by the number of foreground regions, the single example in + # minibatch (1) will be given 100 times as much influence as each foreground + # example in minibatch (2). Normalizing by the total number of regions, R, + # means that the single example in minibatch (1) and each of the 100 examples + # in minibatch (2) are given equal influence. + loss_box_reg = loss_box_reg / self.gt_classes.numel() + return loss_box_reg + + def _predict_boxes(self): + """ + Returns: + Tensor: A Tensors of predicted class-specific or class-agnostic boxes + for all images in a batch. Element i has shape (Ri, K * B) or (Ri, B), where Ri is + the number of predicted objects for image i and B is the box dimension (4 or 5) + """ + return apply_deltas_broadcast( + self.box2box_transform, self.pred_proposal_deltas, self.proposals.tensor + ) + + """ + A subclass is expected to have the following methods because + they are used to query information about the head predictions. + """ + + def losses(self): + """ + Compute the default losses for box head in Fast(er) R-CNN, + with softmax cross entropy loss and smooth L1 loss. 
+ + Returns: + A dict of losses (scalar tensors) containing keys "loss_cls" and "loss_box_reg". + """ + return { + "loss_cls": self.softmax_cross_entropy_loss(), + "loss_box_reg": self.smooth_l1_loss(), + } + + def predict_boxes(self): + """ + Deprecated + """ + return self._predict_boxes().split(self.num_preds_per_image, dim=0) + + def predict_probs(self): + """ + Deprecated + """ + probs = F.softmax(self.pred_class_logits, dim=-1) + return probs.split(self.num_preds_per_image, dim=0) + + def inference(self, score_thresh, nms_thresh, topk_per_image): + """ + Deprecated + """ + boxes = self.predict_boxes() + scores = self.predict_probs() + image_shapes = self.image_shapes + return fast_rcnn_inference( + boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image + ) + + +class FastRCNNOutputLayers(nn.Module): + """ + Two linear layers for predicting Fast R-CNN outputs: + (1) proposal-to-detection box regression deltas + (2) classification scores + """ + + @configurable + def __init__( + self, + input_shape, + *, + box2box_transform, + num_classes, + cls_agnostic_bbox_reg=False, + smooth_l1_beta=0.0, + test_score_thresh=0.0, + test_nms_thresh=0.5, + test_topk_per_image=100, + ): + """ + NOTE: this interface is experimental. + + Args: + input_shape (ShapeSpec): shape of the input feature to this module + box2box_transform (Box2BoxTransform or Box2BoxTransformRotated): + num_classes (int): number of foreground classes + cls_agnostic_bbox_reg (bool): whether to use class agnostic for bbox regression + smooth_l1_beta (float): transition point from L1 to L2 loss. + test_score_thresh (float): threshold to filter predictions results. + test_nms_thresh (float): NMS threshold for prediction results. + test_topk_per_image (int): number of top predictions to produce per image. + """ + super().__init__() + if isinstance(input_shape, int): # some backward compatibility + input_shape = ShapeSpec(channels=input_shape) + input_size = input_shape.channels * (input_shape.width or 1) * (input_shape.height or 1) + # The prediction layer for num_classes foreground classes and one background class + # (hence + 1) + self.cls_score = Linear(input_size, num_classes + 1) + num_bbox_reg_classes = 1 if cls_agnostic_bbox_reg else num_classes + box_dim = len(box2box_transform.weights) + self.bbox_pred = Linear(input_size, num_bbox_reg_classes * box_dim) + + nn.init.normal_(self.cls_score.weight, std=0.01) + nn.init.normal_(self.bbox_pred.weight, std=0.001) + for l in [self.cls_score, self.bbox_pred]: + nn.init.constant_(l.bias, 0) + + self.box2box_transform = box2box_transform + self.smooth_l1_beta = smooth_l1_beta + self.test_score_thresh = test_score_thresh + self.test_nms_thresh = test_nms_thresh + self.test_topk_per_image = test_topk_per_image + + @classmethod + def from_config(cls, cfg, input_shape): + return { + "input_shape": input_shape, + "box2box_transform": Box2BoxTransform(weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS), + # fmt: off + "num_classes" : cfg.MODEL.ROI_HEADS.NUM_CLASSES, + "cls_agnostic_bbox_reg" : cfg.MODEL.ROI_BOX_HEAD.CLS_AGNOSTIC_BBOX_REG, + "smooth_l1_beta" : cfg.MODEL.ROI_BOX_HEAD.SMOOTH_L1_BETA, + "test_score_thresh" : cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST, + "test_nms_thresh" : cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST, + "test_topk_per_image" : cfg.TEST.DETECTIONS_PER_IMAGE + # fmt: on + } + + def forward(self, x): + """ + Returns: + Tensor: Nx(K+1) scores for each box + Tensor: Nx4 or Nx(Kx4) bounding box regression deltas. 
+ """ + if x.dim() > 2: + x = torch.flatten(x, start_dim=1) + scores = self.cls_score(x) + proposal_deltas = self.bbox_pred(x) + return scores, proposal_deltas + + # TODO: move the implementation to this class. + def losses(self, predictions, proposals): + """ + Args: + predictions: return values of :meth:`forward()`. + proposals (list[Instances]): proposals that match the features + that were used to compute predictions. + """ + scores, proposal_deltas = predictions + return FastRCNNOutputs( + self.box2box_transform, scores, proposal_deltas, proposals, self.smooth_l1_beta + ).losses() + + def inference(self, predictions, proposals): + """ + Returns: + list[Instances]: same as `fast_rcnn_inference`. + list[Tensor]: same as `fast_rcnn_inference`. + """ + boxes = self.predict_boxes(predictions, proposals) + scores = self.predict_probs(predictions, proposals) + image_shapes = [x.image_size for x in proposals] + return fast_rcnn_inference( + boxes, + scores, + image_shapes, + self.test_score_thresh, + self.test_nms_thresh, + self.test_topk_per_image, + ) + + def predict_boxes_for_gt_classes(self, predictions, proposals): + """ + Returns: + list[Tensor]: A list of Tensors of predicted boxes for GT classes in case of + class-specific box head. Element i of the list has shape (Ri, B), where Ri is + the number of predicted objects for image i and B is the box dimension (4 or 5) + """ + if not len(proposals): + return [] + scores, proposal_deltas = predictions + proposal_boxes = [p.proposal_boxes for p in proposals] + proposal_boxes = proposal_boxes[0].cat(proposal_boxes).tensor + N, B = proposal_boxes.shape + predict_boxes = apply_deltas_broadcast( + self.box2box_transform, proposal_deltas, proposal_boxes + ) # Nx(KxB) + + K = predict_boxes.shape[1] // B + if K > 1: + gt_classes = torch.cat([p.gt_classes for p in proposals], dim=0) + # Some proposals are ignored or have a background class. Their gt_classes + # cannot be used as index. + gt_classes = gt_classes.clamp_(0, K - 1) + + predict_boxes = predict_boxes.view(N, K, B)[ + torch.arange(N, dtype=torch.long, device=predict_boxes.device), gt_classes + ] + num_prop_per_image = [len(p) for p in proposals] + return predict_boxes.split(num_prop_per_image) + + def predict_boxes(self, predictions, proposals): + """ + Returns: + list[Tensor]: A list of Tensors of predicted class-specific or class-agnostic boxes + for each image. Element i has shape (Ri, K * B) or (Ri, B), where Ri is + the number of predicted objects for image i and B is the box dimension (4 or 5) + """ + if not len(proposals): + return [] + _, proposal_deltas = predictions + num_prop_per_image = [len(p) for p in proposals] + proposal_boxes = [p.proposal_boxes for p in proposals] + proposal_boxes = proposal_boxes[0].cat(proposal_boxes).tensor + predict_boxes = apply_deltas_broadcast( + self.box2box_transform, proposal_deltas, proposal_boxes + ) # Nx(KxB) + return predict_boxes.split(num_prop_per_image) + + def predict_probs(self, predictions, proposals): + """ + Returns: + list[Tensor]: A list of Tensors of predicted class probabilities for each image. + Element i has shape (Ri, K + 1), where Ri is the number of predicted objects + for image i. 
+ """ + scores, _ = predictions + num_inst_per_image = [len(p) for p in proposals] + probs = F.softmax(scores, dim=-1) + return probs.split(num_inst_per_image, dim=0) diff --git a/detectron2/modeling/roi_heads/keypoint_head.py b/detectron2/modeling/roi_heads/keypoint_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c7990c8fd90c70c98d6b2e3f94935f571b957a79 --- /dev/null +++ b/detectron2/modeling/roi_heads/keypoint_head.py @@ -0,0 +1,253 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from typing import List +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ConvTranspose2d, cat, interpolate +from detectron2.structures import Instances, heatmaps_to_keypoints +from detectron2.utils.events import get_event_storage +from detectron2.utils.registry import Registry + +_TOTAL_SKIPPED = 0 + +ROI_KEYPOINT_HEAD_REGISTRY = Registry("ROI_KEYPOINT_HEAD") +ROI_KEYPOINT_HEAD_REGISTRY.__doc__ = """ +Registry for keypoint heads, which make keypoint predictions from per-region features. + +The registered object will be called with `obj(cfg, input_shape)`. +""" + + +def build_keypoint_head(cfg, input_shape): + """ + Build a keypoint head from `cfg.MODEL.ROI_KEYPOINT_HEAD.NAME`. + """ + name = cfg.MODEL.ROI_KEYPOINT_HEAD.NAME + return ROI_KEYPOINT_HEAD_REGISTRY.get(name)(cfg, input_shape) + + +def keypoint_rcnn_loss(pred_keypoint_logits, instances, normalizer): + """ + Arguments: + pred_keypoint_logits (Tensor): A tensor of shape (N, K, S, S) where N is the total number + of instances in the batch, K is the number of keypoints, and S is the side length + of the keypoint heatmap. The values are spatial logits. + instances (list[Instances]): A list of M Instances, where M is the batch size. + These instances are predictions from the model + that are in 1:1 correspondence with pred_keypoint_logits. + Each Instances should contain a `gt_keypoints` field containing a `structures.Keypoint` + instance. + normalizer (float): Normalize the loss by this amount. + If not specified, we normalize by the number of visible keypoints in the minibatch. + + Returns a scalar tensor containing the loss. 
+ """ + heatmaps = [] + valid = [] + + keypoint_side_len = pred_keypoint_logits.shape[2] + for instances_per_image in instances: + if len(instances_per_image) == 0: + continue + keypoints = instances_per_image.gt_keypoints + heatmaps_per_image, valid_per_image = keypoints.to_heatmap( + instances_per_image.proposal_boxes.tensor, keypoint_side_len + ) + heatmaps.append(heatmaps_per_image.view(-1)) + valid.append(valid_per_image.view(-1)) + + if len(heatmaps): + keypoint_targets = cat(heatmaps, dim=0) + valid = cat(valid, dim=0).to(dtype=torch.uint8) + valid = torch.nonzero(valid).squeeze(1) + + # torch.mean (in binary_cross_entropy_with_logits) doesn't + # accept empty tensors, so handle it separately + if len(heatmaps) == 0 or valid.numel() == 0: + global _TOTAL_SKIPPED + _TOTAL_SKIPPED += 1 + storage = get_event_storage() + storage.put_scalar("kpts_num_skipped_batches", _TOTAL_SKIPPED, smoothing_hint=False) + return pred_keypoint_logits.sum() * 0 + + N, K, H, W = pred_keypoint_logits.shape + pred_keypoint_logits = pred_keypoint_logits.view(N * K, H * W) + + keypoint_loss = F.cross_entropy( + pred_keypoint_logits[valid], keypoint_targets[valid], reduction="sum" + ) + + # If a normalizer isn't specified, normalize by the number of visible keypoints in the minibatch + if normalizer is None: + normalizer = valid.numel() + keypoint_loss /= normalizer + + return keypoint_loss + + +def keypoint_rcnn_inference(pred_keypoint_logits, pred_instances): + """ + Post process each predicted keypoint heatmap in `pred_keypoint_logits` into (x, y, score) + and add it to the `pred_instances` as a `pred_keypoints` field. + + Args: + pred_keypoint_logits (Tensor): A tensor of shape (R, K, S, S) where R is the total number + of instances in the batch, K is the number of keypoints, and S is the side length of + the keypoint heatmap. The values are spatial logits. + pred_instances (list[Instances]): A list of N Instances, where N is the number of images. + + Returns: + None. Each element in pred_instances will contain an extra "pred_keypoints" field. + The field is a tensor of shape (#instance, K, 3) where the last + dimension corresponds to (x, y, score). + The scores are larger than 0. + """ + # flatten all bboxes from all images together (list[Boxes] -> Rx4 tensor) + bboxes_flat = cat([b.pred_boxes.tensor for b in pred_instances], dim=0) + + keypoint_results = heatmaps_to_keypoints(pred_keypoint_logits.detach(), bboxes_flat.detach()) + num_instances_per_image = [len(i) for i in pred_instances] + keypoint_results = keypoint_results[:, :, [0, 1, 3]].split(num_instances_per_image, dim=0) + + for keypoint_results_per_image, instances_per_image in zip(keypoint_results, pred_instances): + # keypoint_results_per_image is (num instances)x(num keypoints)x(x, y, score) + instances_per_image.pred_keypoints = keypoint_results_per_image + + +class BaseKeypointRCNNHead(nn.Module): + """ + Implement the basic Keypoint R-CNN losses and inference logic described in :paper:`Mask R-CNN`. + """ + + @configurable + def __init__(self, *, num_keypoints, loss_weight=1.0, loss_normalizer=1.0): + """ + NOTE: this interface is experimental. + + Args: + num_keypoints (int): number of keypoints to predict + loss_weight (float): weight to multiple on the keypoint loss + loss_normalizer (float or str): + If float, divide the loss by `loss_normalizer * #images`. + If 'visible', the loss is normalized by the total number of + visible keypoints across images. 
+ """ + super().__init__() + self.num_keypoints = num_keypoints + self.loss_weight = loss_weight + assert loss_normalizer == "visible" or isinstance(loss_normalizer, float), loss_normalizer + self.loss_normalizer = loss_normalizer + + @classmethod + def from_config(cls, cfg, input_shape): + ret = { + "loss_weight": cfg.MODEL.ROI_KEYPOINT_HEAD.LOSS_WEIGHT, + "num_keypoints": cfg.MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS, + } + normalize_by_visible = ( + cfg.MODEL.ROI_KEYPOINT_HEAD.NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS + ) # noqa + if not normalize_by_visible: + batch_size_per_image = cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE + positive_sample_fraction = cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION + ret["loss_normalizer"] = ( + ret["num_keypoints"] * batch_size_per_image * positive_sample_fraction + ) + else: + ret["loss_normalizer"] = "visible" + return ret + + def forward(self, x, instances: List[Instances]): + """ + Args: + x: input region feature(s) provided by :class:`ROIHeads`. + instances (list[Instances]): contains the boxes & labels corresponding + to the input features. + Exact format is up to its caller to decide. + Typically, this is the foreground instances in training, with + "proposal_boxes" field and other gt annotations. + In inference, it contains boxes that are already predicted. + + Returns: + A dict of losses if in training. The predicted "instances" if in inference. + """ + x = self.layers(x) + if self.training: + num_images = len(instances) + normalizer = ( + None if self.loss_normalizer == "visible" else num_images * self.loss_normalizer + ) + return { + "loss_keypoint": keypoint_rcnn_loss(x, instances, normalizer=normalizer) + * self.loss_weight + } + else: + keypoint_rcnn_inference(x, instances) + return instances + + def layers(self, x): + """ + Neural network layers that makes predictions from regional input features. + """ + raise NotImplementedError + + +@ROI_KEYPOINT_HEAD_REGISTRY.register() +class KRCNNConvDeconvUpsampleHead(BaseKeypointRCNNHead): + """ + A standard keypoint head containing a series of 3x3 convs, followed by + a transpose convolution and bilinear interpolation for upsampling. + """ + + @configurable + def __init__(self, input_shape, *, num_keypoints, conv_dims, **kwargs): + """ + NOTE: this interface is experimental. + + Args: + input_shape (ShapeSpec): shape of the input feature + conv_dims: an iterable of output channel counts for each conv in the head + e.g. (512, 512, 512) for three convs outputting 512 channels. 
+ """ + super().__init__(num_keypoints=num_keypoints, **kwargs) + + # default up_scale to 2 (this can be made an option) + up_scale = 2 + in_channels = input_shape.channels + + self.blocks = [] + for idx, layer_channels in enumerate(conv_dims, 1): + module = Conv2d(in_channels, layer_channels, 3, stride=1, padding=1) + self.add_module("conv_fcn{}".format(idx), module) + self.blocks.append(module) + in_channels = layer_channels + + deconv_kernel = 4 + self.score_lowres = ConvTranspose2d( + in_channels, num_keypoints, deconv_kernel, stride=2, padding=deconv_kernel // 2 - 1 + ) + self.up_scale = up_scale + + for name, param in self.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0) + elif "weight" in name: + # Caffe2 implementation uses MSRAFill, which in fact + # corresponds to kaiming_normal_ in PyTorch + nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") + + @classmethod + def from_config(cls, cfg, input_shape): + ret = super().from_config(cfg, input_shape) + ret["input_shape"] = input_shape + ret["conv_dims"] = cfg.MODEL.ROI_KEYPOINT_HEAD.CONV_DIMS + return ret + + def layers(self, x): + for layer in self.blocks: + x = F.relu(layer(x)) + x = self.score_lowres(x) + x = interpolate(x, scale_factor=self.up_scale, mode="bilinear", align_corners=False) + return x diff --git a/detectron2/modeling/roi_heads/mask_head.py b/detectron2/modeling/roi_heads/mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5209722fb96b5e430bb5f30b3fce2b94b91f2b2e --- /dev/null +++ b/detectron2/modeling/roi_heads/mask_head.py @@ -0,0 +1,277 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from typing import List +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ConvTranspose2d, ShapeSpec, cat, get_norm +from detectron2.structures import Instances +from detectron2.utils.events import get_event_storage +from detectron2.utils.registry import Registry + +ROI_MASK_HEAD_REGISTRY = Registry("ROI_MASK_HEAD") +ROI_MASK_HEAD_REGISTRY.__doc__ = """ +Registry for mask heads, which predicts instance masks given +per-region features. + +The registered object will be called with `obj(cfg, input_shape)`. +""" + + +def mask_rcnn_loss(pred_mask_logits, instances, vis_period=0): + """ + Compute the mask prediction loss defined in the Mask R-CNN paper. + + Args: + pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask) + for class-specific or class-agnostic, where B is the total number of predicted masks + in all images, C is the number of foreground classes, and Hmask, Wmask are the height + and width of the mask predictions. The values are logits. + instances (list[Instances]): A list of N Instances, where N is the number of images + in the batch. These instances are in 1:1 + correspondence with the pred_mask_logits. The ground-truth labels (class, box, mask, + ...) associated with each instance are stored in fields. + vis_period (int): the period (in steps) to dump visualization. + + Returns: + mask_loss (Tensor): A scalar tensor containing the loss. + """ + cls_agnostic_mask = pred_mask_logits.size(1) == 1 + total_num_masks = pred_mask_logits.size(0) + mask_side_len = pred_mask_logits.size(2) + assert pred_mask_logits.size(2) == pred_mask_logits.size(3), "Mask prediction must be square!" 
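+    # pred_mask_logits is (B, C, Hmask, Wmask) for class-specific prediction or
+    # (B, 1, Hmask, Wmask) for class-agnostic prediction; the mask prediction must be square.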
+
+    gt_classes = []
+    gt_masks = []
+    for instances_per_image in instances:
+        if len(instances_per_image) == 0:
+            continue
+        if not cls_agnostic_mask:
+            gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64)
+            gt_classes.append(gt_classes_per_image)
+
+        gt_masks_per_image = instances_per_image.gt_masks.crop_and_resize(
+            instances_per_image.proposal_boxes.tensor, mask_side_len
+        ).to(device=pred_mask_logits.device)
+        # A tensor of shape (N, M, M), N=#instances in the image; M=mask_side_len
+        gt_masks.append(gt_masks_per_image)
+
+    if len(gt_masks) == 0:
+        return pred_mask_logits.sum() * 0
+
+    gt_masks = cat(gt_masks, dim=0)
+
+    if cls_agnostic_mask:
+        pred_mask_logits = pred_mask_logits[:, 0]
+    else:
+        indices = torch.arange(total_num_masks)
+        gt_classes = cat(gt_classes, dim=0)
+        pred_mask_logits = pred_mask_logits[indices, gt_classes]
+
+    if gt_masks.dtype == torch.bool:
+        gt_masks_bool = gt_masks
+    else:
+        # Here we allow gt_masks to be float as well (depend on the implementation of rasterize())
+        gt_masks_bool = gt_masks > 0.5
+    gt_masks = gt_masks.to(dtype=torch.float32)
+
+    # Log the training accuracy (using gt classes and 0.5 threshold)
+    mask_incorrect = (pred_mask_logits > 0.0) != gt_masks_bool
+    mask_accuracy = 1 - (mask_incorrect.sum().item() / max(mask_incorrect.numel(), 1.0))
+    num_positive = gt_masks_bool.sum().item()
+    false_positive = (mask_incorrect & ~gt_masks_bool).sum().item() / max(
+        gt_masks_bool.numel() - num_positive, 1.0
+    )
+    false_negative = (mask_incorrect & gt_masks_bool).sum().item() / max(num_positive, 1.0)
+
+    storage = get_event_storage()
+    storage.put_scalar("mask_rcnn/accuracy", mask_accuracy)
+    storage.put_scalar("mask_rcnn/false_positive", false_positive)
+    storage.put_scalar("mask_rcnn/false_negative", false_negative)
+    if vis_period > 0 and storage.iter % vis_period == 0:
+        pred_masks = pred_mask_logits.sigmoid()
+        vis_masks = torch.cat([pred_masks, gt_masks], axis=2)
+        name = "Left: mask prediction; Right: mask GT"
+        for idx, vis_mask in enumerate(vis_masks):
+            vis_mask = torch.stack([vis_mask] * 3, axis=0)
+            storage.put_image(name + f" ({idx})", vis_mask)
+
+    mask_loss = F.binary_cross_entropy_with_logits(pred_mask_logits, gt_masks, reduction="mean")
+    return mask_loss
+
+
+def mask_rcnn_inference(pred_mask_logits, pred_instances):
+    """
+    Convert pred_mask_logits to estimated foreground probability masks while also
+    extracting only the masks for the predicted classes in pred_instances. For each
+    predicted box, the mask of the same class is attached to the instance by adding a
+    new "pred_masks" field to pred_instances.
+
+    Args:
+        pred_mask_logits (Tensor): A tensor of shape (B, C, Hmask, Wmask) or (B, 1, Hmask, Wmask)
+            for class-specific or class-agnostic, where B is the total number of predicted masks
+            in all images, C is the number of foreground classes, and Hmask, Wmask are the height
+            and width of the mask predictions. The values are logits.
+        pred_instances (list[Instances]): A list of N Instances, where N is the number of images
+            in the batch. Each Instances must have field "pred_classes".
+
+    Returns:
+        None. pred_instances will contain an extra "pred_masks" field storing a mask of size (Hmask,
+            Wmask) for the predicted class. Note that the masks are returned as soft (non-quantized)
+            masks at the resolution predicted by the network; post-processing steps, such as resizing
+            the predicted masks to the original image resolution and/or binarizing them, are left
+            to the caller.
+ """ + cls_agnostic_mask = pred_mask_logits.size(1) == 1 + + if cls_agnostic_mask: + mask_probs_pred = pred_mask_logits.sigmoid() + else: + # Select masks corresponding to the predicted classes + num_masks = pred_mask_logits.shape[0] + class_pred = cat([i.pred_classes for i in pred_instances]) + indices = torch.arange(num_masks, device=class_pred.device) + mask_probs_pred = pred_mask_logits[indices, class_pred][:, None].sigmoid() + # mask_probs_pred.shape: (B, 1, Hmask, Wmask) + + num_boxes_per_image = [len(i) for i in pred_instances] + mask_probs_pred = mask_probs_pred.split(num_boxes_per_image, dim=0) + + for prob, instances in zip(mask_probs_pred, pred_instances): + instances.pred_masks = prob # (1, Hmask, Wmask) + + +class BaseMaskRCNNHead(nn.Module): + """ + Implement the basic Mask R-CNN losses and inference logic described in :paper:`Mask R-CNN` + """ + + @configurable + def __init__(self, *, vis_period=0): + """ + NOTE: this interface is experimental. + + Args: + vis_period (int): visualization period + """ + super().__init__() + self.vis_period = vis_period + + @classmethod + def from_config(cls, cfg, input_shape): + return {"vis_period": cfg.VIS_PERIOD} + + def forward(self, x, instances: List[Instances]): + """ + Args: + x: input region feature(s) provided by :class:`ROIHeads`. + instances (list[Instances]): contains the boxes & labels corresponding + to the input features. + Exact format is up to its caller to decide. + Typically, this is the foreground instances in training, with + "proposal_boxes" field and other gt annotations. + In inference, it contains boxes that are already predicted. + + Returns: + A dict of losses in training. The predicted "instances" in inference. + """ + x = self.layers(x) + if self.training: + return {"loss_mask": mask_rcnn_loss(x, instances, self.vis_period)} + else: + mask_rcnn_inference(x, instances) + return instances + + def layers(self, x): + """ + Neural network layers that makes predictions from input features. + """ + raise NotImplementedError + + +@ROI_MASK_HEAD_REGISTRY.register() +class MaskRCNNConvUpsampleHead(BaseMaskRCNNHead): + """ + A mask head with several conv layers, plus an upsample layer (with `ConvTranspose2d`). + Predictions are made with a final 1x1 conv layer. + """ + + @configurable + def __init__(self, input_shape: ShapeSpec, *, num_classes, conv_dims, conv_norm="", **kwargs): + """ + NOTE: this interface is experimental. + + Args: + input_shape (ShapeSpec): shape of the input feature + num_classes (int): the number of classes. 1 if using class agnostic prediction. + conv_dims (list[int]): a list of N>0 integers representing the output dimensions + of N-1 conv layers and the last upsample layer. + conv_norm (str or callable): normalization for the conv layers. + See :func:`detectron2.layers.get_norm` for supported types. + """ + super().__init__(**kwargs) + assert len(conv_dims) >= 1, "conv_dims have to be non-empty!" 
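+        # The head built below consists of len(conv_dims) - 1 3x3 convs (with optional
+        # norm and ReLU), a stride-2 ConvTranspose2d for 2x upsampling, and a final
+        # 1x1 conv that predicts one mask logit map per class.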
+ + self.conv_norm_relus = [] + + cur_channels = input_shape.channels + for k, conv_dim in enumerate(conv_dims[:-1]): + conv = Conv2d( + cur_channels, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=not conv_norm, + norm=get_norm(conv_norm, conv_dim), + activation=F.relu, + ) + self.add_module("mask_fcn{}".format(k + 1), conv) + self.conv_norm_relus.append(conv) + cur_channels = conv_dim + + self.deconv = ConvTranspose2d( + cur_channels, conv_dims[-1], kernel_size=2, stride=2, padding=0 + ) + cur_channels = conv_dims[-1] + + self.predictor = Conv2d(cur_channels, num_classes, kernel_size=1, stride=1, padding=0) + + for layer in self.conv_norm_relus + [self.deconv]: + weight_init.c2_msra_fill(layer) + # use normal distribution initialization for mask prediction layer + nn.init.normal_(self.predictor.weight, std=0.001) + if self.predictor.bias is not None: + nn.init.constant_(self.predictor.bias, 0) + + @classmethod + def from_config(cls, cfg, input_shape): + ret = super().from_config(cfg, input_shape) + conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM + num_conv = cfg.MODEL.ROI_MASK_HEAD.NUM_CONV + ret.update( + conv_dims=[conv_dim] * (num_conv + 1), # +1 for ConvTranspose + conv_norm=cfg.MODEL.ROI_MASK_HEAD.NORM, + input_shape=input_shape, + ) + if cfg.MODEL.ROI_MASK_HEAD.CLS_AGNOSTIC_MASK: + ret["num_classes"] = 1 + else: + ret["num_classes"] = cfg.MODEL.ROI_HEADS.NUM_CLASSES + return ret + + def layers(self, x): + for layer in self.conv_norm_relus: + x = layer(x) + x = F.relu(self.deconv(x)) + return self.predictor(x) + + +def build_mask_head(cfg, input_shape): + """ + Build a mask head defined by `cfg.MODEL.ROI_MASK_HEAD.NAME`. + """ + name = cfg.MODEL.ROI_MASK_HEAD.NAME + return ROI_MASK_HEAD_REGISTRY.get(name)(cfg, input_shape) diff --git a/detectron2/modeling/roi_heads/roi_heads.py b/detectron2/modeling/roi_heads/roi_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..f35588e474a1c3d938e5a3b2b8a8ae5e88006215 --- /dev/null +++ b/detectron2/modeling/roi_heads/roi_heads.py @@ -0,0 +1,812 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import inspect +import logging +import numpy as np +from typing import Dict, List, Optional, Tuple, Union +import torch +from torch import nn + +from detectron2.config import configurable +from detectron2.layers import ShapeSpec +from detectron2.structures import Boxes, ImageList, Instances, pairwise_iou +from detectron2.utils.events import get_event_storage +from detectron2.utils.registry import Registry + +from ..backbone.resnet import BottleneckBlock, make_stage +from ..matcher import Matcher +from ..poolers import ROIPooler +from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals +from ..sampling import subsample_labels +from .box_head import build_box_head +from .fast_rcnn import FastRCNNOutputLayers +from .keypoint_head import build_keypoint_head +from .mask_head import build_mask_head + +ROI_HEADS_REGISTRY = Registry("ROI_HEADS") +ROI_HEADS_REGISTRY.__doc__ = """ +Registry for ROI heads in a generalized R-CNN model. +ROIHeads take feature maps and region proposals, and +perform per-region computation. + +The registered object will be called with `obj(cfg, input_shape)`. +The call is expected to return an :class:`ROIHeads`. +""" + +logger = logging.getLogger(__name__) + + +def build_roi_heads(cfg, input_shape): + """ + Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`. 
+ """ + name = cfg.MODEL.ROI_HEADS.NAME + return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape) + + +def select_foreground_proposals( + proposals: List[Instances], bg_label: int +) -> Tuple[List[Instances], List[torch.Tensor]]: + """ + Given a list of N Instances (for N images), each containing a `gt_classes` field, + return a list of Instances that contain only instances with `gt_classes != -1 && + gt_classes != bg_label`. + + Args: + proposals (list[Instances]): A list of N Instances, where N is the number of + images in the batch. + bg_label: label index of background class. + + Returns: + list[Instances]: N Instances, each contains only the selected foreground instances. + list[Tensor]: N boolean vector, correspond to the selection mask of + each Instances object. True for selected instances. + """ + assert isinstance(proposals, (list, tuple)) + assert isinstance(proposals[0], Instances) + assert proposals[0].has("gt_classes") + fg_proposals = [] + fg_selection_masks = [] + for proposals_per_image in proposals: + gt_classes = proposals_per_image.gt_classes + fg_selection_mask = (gt_classes != -1) & (gt_classes != bg_label) + fg_idxs = fg_selection_mask.nonzero().squeeze(1) + fg_proposals.append(proposals_per_image[fg_idxs]) + fg_selection_masks.append(fg_selection_mask) + return fg_proposals, fg_selection_masks + + +def select_proposals_with_visible_keypoints(proposals: List[Instances]) -> List[Instances]: + """ + Args: + proposals (list[Instances]): a list of N Instances, where N is the + number of images. + + Returns: + proposals: only contains proposals with at least one visible keypoint. + + Note that this is still slightly different from Detectron. + In Detectron, proposals for training keypoint head are re-sampled from + all the proposals with IOU>threshold & >=1 visible keypoint. + + Here, the proposals are first sampled from all proposals with + IOU>threshold, then proposals with no visible keypoint are filtered out. + This strategy seems to make no difference on Detectron and is easier to implement. + """ + ret = [] + all_num_fg = [] + for proposals_per_image in proposals: + # If empty/unannotated image (hard negatives), skip filtering for train + if len(proposals_per_image) == 0: + ret.append(proposals_per_image) + continue + gt_keypoints = proposals_per_image.gt_keypoints.tensor + # #fg x K x 3 + vis_mask = gt_keypoints[:, :, 2] >= 1 + xs, ys = gt_keypoints[:, :, 0], gt_keypoints[:, :, 1] + proposal_boxes = proposals_per_image.proposal_boxes.tensor.unsqueeze(dim=1) # #fg x 1 x 4 + kp_in_box = ( + (xs >= proposal_boxes[:, :, 0]) + & (xs <= proposal_boxes[:, :, 2]) + & (ys >= proposal_boxes[:, :, 1]) + & (ys <= proposal_boxes[:, :, 3]) + ) + selection = (kp_in_box & vis_mask).any(dim=1) + selection_idxs = torch.nonzero(selection, as_tuple=True)[0] + all_num_fg.append(selection_idxs.numel()) + ret.append(proposals_per_image[selection_idxs]) + + storage = get_event_storage() + storage.put_scalar("keypoint_head/num_fg_samples", np.mean(all_num_fg)) + return ret + + +class ROIHeads(torch.nn.Module): + """ + ROIHeads perform all per-region computation in an R-CNN. + + It typically contains logic to + 1. (in training only) match proposals with ground truth and sample them + 2. crop the regions and extract per-region features using proposals + 3. make per-region predictions with different heads + + It can have many variants, implemented as subclasses of this class. + This base class contains the logic to match/sample proposals. 
+ But it is not necessary to inherit this class if the sampling logic is not needed. + """ + + @configurable + def __init__( + self, + *, + num_classes, + batch_size_per_image, + positive_sample_fraction, + proposal_matcher, + proposal_append_gt=True + ): + """ + NOTE: this interface is experimental. + + Args: + num_classes (int): number of classes. Used to label background proposals. + batch_size_per_image (int): number of proposals to use for training + positive_sample_fraction (float): fraction of positive (foreground) proposals + to use for training. + proposal_matcher (Matcher): matcher that matches proposals and ground truth + proposal_append_gt (bool): whether to include ground truth as proposals as well + """ + super().__init__() + self.batch_size_per_image = batch_size_per_image + self.positive_sample_fraction = positive_sample_fraction + self.num_classes = num_classes + self.proposal_matcher = proposal_matcher + self.proposal_append_gt = proposal_append_gt + + @classmethod + def from_config(cls, cfg): + return { + "batch_size_per_image": cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, + "positive_sample_fraction": cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION, + "num_classes": cfg.MODEL.ROI_HEADS.NUM_CLASSES, + "proposal_append_gt": cfg.MODEL.ROI_HEADS.PROPOSAL_APPEND_GT, + # Matcher to assign box proposals to gt boxes + "proposal_matcher": Matcher( + cfg.MODEL.ROI_HEADS.IOU_THRESHOLDS, + cfg.MODEL.ROI_HEADS.IOU_LABELS, + allow_low_quality_matches=False, + ), + } + + def _sample_proposals( + self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Based on the matching between N proposals and M groundtruth, + sample the proposals and set their classification labels. + + Args: + matched_idxs (Tensor): a vector of length N, each is the best-matched + gt index in [0, M) for each proposal. + matched_labels (Tensor): a vector of length N, the matcher's label + (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. + gt_classes (Tensor): a vector of length M. + + Returns: + Tensor: a vector of indices of sampled proposals. Each is in [0, N). + Tensor: a vector of the same length, the classification label for + each sampled proposal. Each sample is labeled as either a category in + [0, num_classes) or the background (num_classes). + """ + has_gt = gt_classes.numel() > 0 + # Get the corresponding GT for each proposal + if has_gt: + gt_classes = gt_classes[matched_idxs] + # Label unmatched proposals (0 label from matcher) as background (label=num_classes) + gt_classes[matched_labels == 0] = self.num_classes + # Label ignore proposals (-1 label) + gt_classes[matched_labels == -1] = -1 + else: + gt_classes = torch.zeros_like(matched_idxs) + self.num_classes + + sampled_fg_idxs, sampled_bg_idxs = subsample_labels( + gt_classes, self.batch_size_per_image, self.positive_sample_fraction, self.num_classes + ) + + sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) + return sampled_idxs, gt_classes[sampled_idxs] + + @torch.no_grad() + def label_and_sample_proposals( + self, proposals: List[Instances], targets: List[Instances] + ) -> List[Instances]: + """ + Prepare some proposals to be used to train the ROI heads. + It performs box matching between `proposals` and `targets`, and assigns + training labels to the proposals. + It returns ``self.batch_size_per_image`` random samples from proposals and groundtruth + boxes, with a fraction of positives that is no larger than + ``self.positive_sample_fraction``. 
+
+        Args:
+            See :meth:`ROIHeads.forward`
+
+        Returns:
+            list[Instances]:
+                length `N` list of `Instances`s containing the proposals
+                sampled for training. Each `Instances` has the following fields:
+
+                - proposal_boxes: the proposal boxes
+                - gt_boxes: the ground-truth box that the proposal is assigned to
+                  (this is only meaningful if the proposal has a label > 0; if label = 0
+                  then the ground-truth box is random)
+
+                Other fields such as "gt_classes" and "gt_masks" that are included in `targets`.
+        """
+        gt_boxes = [x.gt_boxes for x in targets]
+        # Augment proposals with ground-truth boxes.
+        # In the case of learned proposals (e.g., RPN), when training starts
+        # the proposals will be low quality due to random initialization.
+        # It's possible that none of these initial
+        # proposals have high enough overlap with the gt objects to be used
+        # as positive examples for the second stage components (box head,
+        # cls head, mask head). Adding the gt boxes to the set of proposals
+        # ensures that the second stage components will have some positive
+        # examples from the start of training. For RPN, this augmentation improves
+        # convergence and empirically improves box AP on COCO by about 0.5
+        # points (under one tested configuration).
+        if self.proposal_append_gt:
+            proposals = add_ground_truth_to_proposals(gt_boxes, proposals)
+
+        proposals_with_gt = []
+
+        num_fg_samples = []
+        num_bg_samples = []
+        for proposals_per_image, targets_per_image in zip(proposals, targets):
+            has_gt = len(targets_per_image) > 0
+            match_quality_matrix = pairwise_iou(
+                targets_per_image.gt_boxes, proposals_per_image.proposal_boxes
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
+            sampled_idxs, gt_classes = self._sample_proposals(
+                matched_idxs, matched_labels, targets_per_image.gt_classes
+            )
+
+            # Set target attributes of the sampled proposals:
+            proposals_per_image = proposals_per_image[sampled_idxs]
+            proposals_per_image.gt_classes = gt_classes
+
+            # We index all the attributes of targets that start with "gt_"
+            # and have not been added to proposals yet (="gt_classes").
+            if has_gt:
+                sampled_targets = matched_idxs[sampled_idxs]
+                # NOTE: here the indexing wastes some compute, because heads
+                # like masks, keypoints, etc, will filter the proposals again
+                # (by foreground/background, or number of keypoints in the image, etc),
+                # so we essentially index the data twice.
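+                # Copy all remaining "gt_*" annotations (e.g. gt_boxes, gt_masks,
+                # gt_keypoints) of the matched ground-truth onto the sampled proposals.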
+ for (trg_name, trg_value) in targets_per_image.get_fields().items(): + if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name): + proposals_per_image.set(trg_name, trg_value[sampled_targets]) + else: + gt_boxes = Boxes( + targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 4)) + ) + proposals_per_image.gt_boxes = gt_boxes + + num_bg_samples.append((gt_classes == self.num_classes).sum().item()) + num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) + proposals_with_gt.append(proposals_per_image) + + # Log the number of fg/bg samples that are selected for training ROI heads + storage = get_event_storage() + storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) + storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) + + return proposals_with_gt + + def forward( + self, + images: ImageList, + features: Dict[str, torch.Tensor], + proposals: List[Instances], + targets: Optional[List[Instances]] = None, + ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]: + """ + Args: + images (ImageList): + features (dict[str,Tensor]): input data as a mapping from feature + map name to tensor. Axis 0 represents the number of images `N` in + the input data; axes 1-3 are channels, height, and width, which may + vary between feature maps (e.g., if a feature pyramid is used). + proposals (list[Instances]): length `N` list of `Instances`. The i-th + `Instances` contains object proposals for the i-th input image, + with fields "proposal_boxes" and "objectness_logits". + targets (list[Instances], optional): length `N` list of `Instances`. The i-th + `Instances` contains the ground-truth per-instance annotations + for the i-th input image. Specify `targets` during training only. + It may have the following fields: + + - gt_boxes: the bounding box of each instance. + - gt_classes: the label for each instance with a category ranging in [0, #class]. + - gt_masks: PolygonMasks or BitMasks, the ground-truth masks of each instance. + - gt_keypoints: NxKx3, the groud-truth keypoints for each instance. + + Returns: + list[Instances]: length `N` list of `Instances` containing the + detected instances. Returned during inference only; may be [] during training. + + dict[str->Tensor]: + mapping from a named loss to a tensor storing the loss. Used during training only. + """ + raise NotImplementedError() + + +@ROI_HEADS_REGISTRY.register() +class Res5ROIHeads(ROIHeads): + """ + The ROIHeads in a typical "C4" R-CNN model, where + the box and mask head share the cropping and + the per-region feature computation by a Res5 block. 
+ """ + + def __init__(self, cfg, input_shape): + super().__init__(cfg) + + # fmt: off + self.in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE + pooler_scales = (1.0 / input_shape[self.in_features[0]].stride, ) + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + self.mask_on = cfg.MODEL.MASK_ON + # fmt: on + assert not cfg.MODEL.KEYPOINT_ON + assert len(self.in_features) == 1 + + self.pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type=pooler_type, + ) + + self.res5, out_channels = self._build_res5_block(cfg) + self.box_predictor = FastRCNNOutputLayers( + cfg, ShapeSpec(channels=out_channels, height=1, width=1) + ) + + if self.mask_on: + self.mask_head = build_mask_head( + cfg, + ShapeSpec(channels=out_channels, width=pooler_resolution, height=pooler_resolution), + ) + + def _build_res5_block(self, cfg): + # fmt: off + stage_channel_factor = 2 ** 3 # res5 is 8x res2 + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + bottleneck_channels = num_groups * width_per_group * stage_channel_factor + out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS * stage_channel_factor + stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 + norm = cfg.MODEL.RESNETS.NORM + assert not cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE[-1], \ + "Deformable conv is not yet supported in res5 head." + # fmt: on + + blocks = make_stage( + BottleneckBlock, + 3, + first_stride=2, + in_channels=out_channels // 2, + bottleneck_channels=bottleneck_channels, + out_channels=out_channels, + num_groups=num_groups, + norm=norm, + stride_in_1x1=stride_in_1x1, + ) + return nn.Sequential(*blocks), out_channels + + def _shared_roi_transform(self, features, boxes): + x = self.pooler(features, boxes) + return self.res5(x) + + def forward(self, images, features, proposals, targets=None): + """ + See :meth:`ROIHeads.forward`. + """ + del images + + if self.training: + assert targets + proposals = self.label_and_sample_proposals(proposals, targets) + del targets + + proposal_boxes = [x.proposal_boxes for x in proposals] + box_features = self._shared_roi_transform( + [features[f] for f in self.in_features], proposal_boxes + ) + predictions = self.box_predictor(box_features.mean(dim=[2, 3])) + + if self.training: + del features + losses = self.box_predictor.losses(predictions, proposals) + if self.mask_on: + proposals, fg_selection_masks = select_foreground_proposals( + proposals, self.num_classes + ) + # Since the ROI feature transform is shared between boxes and masks, + # we don't need to recompute features. The mask loss is only defined + # on foreground proposals, so we need to select out the foreground + # features. + mask_features = box_features[torch.cat(fg_selection_masks, dim=0)] + del box_features + losses.update(self.mask_head(mask_features, proposals)) + return [], losses + else: + pred_instances, _ = self.box_predictor.inference(predictions, proposals) + pred_instances = self.forward_with_given_boxes(features, pred_instances) + return pred_instances, {} + + def forward_with_given_boxes(self, features, instances): + """ + Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. + + Args: + features: same as in `forward()` + instances (list[Instances]): instances to predict other outputs. Expect the keys + "pred_boxes" and "pred_classes" to exist. 
+
+        Returns:
+            instances (Instances):
+                the same `Instances` object, with extra
+                fields such as `pred_masks` or `pred_keypoints`.
+        """
+        assert not self.training
+        assert instances[0].has("pred_boxes") and instances[0].has("pred_classes")
+
+        if self.mask_on:
+            features = [features[f] for f in self.in_features]
+            x = self._shared_roi_transform(features, [x.pred_boxes for x in instances])
+            return self.mask_head(x, instances)
+        else:
+            return instances
+
+
+@ROI_HEADS_REGISTRY.register()
+class StandardROIHeads(ROIHeads):
+    """
+    It's "standard" in the sense that there is no ROI transform sharing
+    or feature sharing between tasks.
+    Each head independently processes the input features with its own
+    pooler and head.
+
+    This class is used by most models, such as FPN and C5.
+    To implement more models, you can subclass it and implement a different
+    :meth:`forward()` or a head.
+    """
+
+    @configurable
+    def __init__(
+        self,
+        *,
+        box_in_features: List[str],
+        box_pooler: ROIPooler,
+        box_head: nn.Module,
+        box_predictor: nn.Module,
+        mask_in_features: Optional[List[str]] = None,
+        mask_pooler: Optional[ROIPooler] = None,
+        mask_head: Optional[nn.Module] = None,
+        keypoint_in_features: Optional[List[str]] = None,
+        keypoint_pooler: Optional[ROIPooler] = None,
+        keypoint_head: Optional[nn.Module] = None,
+        train_on_pred_boxes: bool = False,
+        **kwargs
+    ):
+        """
+        NOTE: this interface is experimental.
+
+        Args:
+            box_in_features (list[str]): list of feature names to use for the box head.
+            box_pooler (ROIPooler): pooler to extract region features for the box head
+            box_head (nn.Module): transform features to make box predictions
+            box_predictor (nn.Module): make box predictions from the feature.
+                Should have the same interface as :class:`FastRCNNOutputLayers`.
+            mask_in_features (list[str]): list of feature names to use for the mask head.
+                None if not using mask head.
+            mask_pooler (ROIPooler): pooler to extract region features for the mask head
+            mask_head (nn.Module): transform features to make mask predictions
+            keypoint_in_features, keypoint_pooler, keypoint_head: similar to ``mask*``.
+            train_on_pred_boxes (bool): whether to use proposal boxes or
+                predicted boxes from the box head to train other heads.
+        """
+        super().__init__(**kwargs)
+        # keep self.in_features for backward compatibility
+        self.in_features = self.box_in_features = box_in_features
+        self.box_pooler = box_pooler
+        self.box_head = box_head
+        self.box_predictor = box_predictor
+
+        self.mask_on = mask_in_features is not None
+        if self.mask_on:
+            self.mask_in_features = mask_in_features
+            self.mask_pooler = mask_pooler
+            self.mask_head = mask_head
+        self.keypoint_on = keypoint_in_features is not None
+        if self.keypoint_on:
+            self.keypoint_in_features = keypoint_in_features
+            self.keypoint_pooler = keypoint_pooler
+            self.keypoint_head = keypoint_head
+
+        self.train_on_pred_boxes = train_on_pred_boxes
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        ret = super().from_config(cfg)
+        ret["train_on_pred_boxes"] = cfg.MODEL.ROI_BOX_HEAD.TRAIN_ON_PRED_BOXES
+        # Subclasses that have not been updated to use from_config style construction
+        # may have overridden _init_*_head methods. In this case, those overridden methods
+        # will not be classmethods and we need to avoid trying to call them here.
+        # We test for this with ismethod which only returns True for bound methods of cls.
+        # Such subclasses will need to handle calling their overridden _init_*_head methods.
+ if inspect.ismethod(cls._init_box_head): + ret.update(cls._init_box_head(cfg, input_shape)) + if inspect.ismethod(cls._init_mask_head): + ret.update(cls._init_mask_head(cfg, input_shape)) + if inspect.ismethod(cls._init_keypoint_head): + ret.update(cls._init_keypoint_head(cfg, input_shape)) + return ret + + @classmethod + def _init_box_head(cls, cfg, input_shape): + # fmt: off + in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE + # fmt: on + + # If StandardROIHeads is applied on multiple feature maps (as in FPN), + # then we share the same predictors and therefore the channel counts must be the same + in_channels = [input_shape[f].channels for f in in_features] + # Check all channel counts are equal + assert len(set(in_channels)) == 1, in_channels + in_channels = in_channels[0] + + box_pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type=pooler_type, + ) + # Here we split "box head" and "box predictor", which is mainly due to historical reasons. + # They are used together so the "box predictor" layers should be part of the "box head". + # New subclasses of ROIHeads do not need "box predictor"s. + box_head = build_box_head( + cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) + ) + box_predictor = FastRCNNOutputLayers(cfg, box_head.output_shape) + return { + "box_in_features": in_features, + "box_pooler": box_pooler, + "box_head": box_head, + "box_predictor": box_predictor, + } + + @classmethod + def _init_mask_head(cls, cfg, input_shape): + if not cfg.MODEL.MASK_ON: + return {} + # fmt: off + in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + pooler_resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION + pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) + sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO + pooler_type = cfg.MODEL.ROI_MASK_HEAD.POOLER_TYPE + # fmt: on + + in_channels = [input_shape[f].channels for f in in_features][0] + + ret = {"mask_in_features": in_features} + ret["mask_pooler"] = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type=pooler_type, + ) + ret["mask_head"] = build_mask_head( + cfg, ShapeSpec(channels=in_channels, width=pooler_resolution, height=pooler_resolution) + ) + return ret + + @classmethod + def _init_keypoint_head(cls, cfg, input_shape): + if not cfg.MODEL.KEYPOINT_ON: + return {} + # fmt: off + in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + pooler_resolution = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_RESOLUTION + pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) # noqa + sampling_ratio = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_SAMPLING_RATIO + pooler_type = cfg.MODEL.ROI_KEYPOINT_HEAD.POOLER_TYPE + # fmt: on + + in_channels = [input_shape[f].channels for f in in_features][0] + + ret = {"keypoint_in_features": in_features} + ret["keypoint_pooler"] = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type=pooler_type, + ) + ret["keypoint_head"] = build_keypoint_head( + cfg, ShapeSpec(channels=in_channels, width=pooler_resolution, height=pooler_resolution) + ) + return ret + + def forward( + self, + images: ImageList, + features: Dict[str, 
torch.Tensor], + proposals: List[Instances], + targets: Optional[List[Instances]] = None, + ) -> Tuple[List[Instances], Dict[str, torch.Tensor]]: + """ + See :class:`ROIHeads.forward`. + """ + del images + if self.training: + assert targets + proposals = self.label_and_sample_proposals(proposals, targets) + del targets + + if self.training: + losses = self._forward_box(features, proposals) + # Usually the original proposals used by the box head are used by the mask, keypoint + # heads. But when `self.train_on_pred_boxes is True`, proposals will contain boxes + # predicted by the box head. + losses.update(self._forward_mask(features, proposals)) + losses.update(self._forward_keypoint(features, proposals)) + return proposals, losses + else: + pred_instances = self._forward_box(features, proposals) + # During inference cascaded prediction is used: the mask and keypoints heads are only + # applied to the top scoring box detections. + pred_instances = self.forward_with_given_boxes(features, pred_instances) + return pred_instances, {} + + def forward_with_given_boxes( + self, features: Dict[str, torch.Tensor], instances: List[Instances] + ) -> List[Instances]: + """ + Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. + + This is useful for downstream tasks where a box is known, but need to obtain + other attributes (outputs of other heads). + Test-time augmentation also uses this. + + Args: + features: same as in `forward()` + instances (list[Instances]): instances to predict other outputs. Expect the keys + "pred_boxes" and "pred_classes" to exist. + + Returns: + instances (list[Instances]): + the same `Instances` objects, with extra + fields such as `pred_masks` or `pred_keypoints`. + """ + assert not self.training + assert instances[0].has("pred_boxes") and instances[0].has("pred_classes") + + instances = self._forward_mask(features, instances) + instances = self._forward_keypoint(features, instances) + return instances + + def _forward_box( + self, features: Dict[str, torch.Tensor], proposals: List[Instances] + ) -> Union[Dict[str, torch.Tensor], List[Instances]]: + """ + Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`, + the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument. + + Args: + features (dict[str, Tensor]): mapping from feature map names to tensor. + Same as in :meth:`ROIHeads.forward`. + proposals (list[Instances]): the per-image object proposals with + their matching ground truth. + Each has fields "proposal_boxes", and "objectness_logits", + "gt_classes", "gt_boxes". + + Returns: + In training, a dict of losses. + In inference, a list of `Instances`, the predicted instances. + """ + features = [features[f] for f in self.box_in_features] + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + box_features = self.box_head(box_features) + predictions = self.box_predictor(box_features) + del box_features + + if self.training: + losses = self.box_predictor.losses(predictions, proposals) + # proposals is modified in-place below, so losses must be computed first. 
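+            # When `self.train_on_pred_boxes` is True, the mask/keypoint heads are
+            # trained on boxes predicted by the box head rather than on the raw RPN
+            # proposals, which better matches the boxes they see at inference time.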
+ if self.train_on_pred_boxes: + with torch.no_grad(): + pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( + predictions, proposals + ) + for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): + proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image) + return losses + else: + pred_instances, _ = self.box_predictor.inference(predictions, proposals) + return pred_instances + + def _forward_mask( + self, features: Dict[str, torch.Tensor], instances: List[Instances] + ) -> Union[Dict[str, torch.Tensor], List[Instances]]: + """ + Forward logic of the mask prediction branch. + + Args: + features (dict[str, Tensor]): mapping from feature map names to tensor. + Same as in :meth:`ROIHeads.forward`. + instances (list[Instances]): the per-image instances to train/predict masks. + In training, they can be the proposals. + In inference, they can be the predicted boxes. + + Returns: + In training, a dict of losses. + In inference, update `instances` with new fields "pred_masks" and return it. + """ + if not self.mask_on: + return {} if self.training else instances + + features = [features[f] for f in self.mask_in_features] + + if self.training: + # The loss is only defined on positive proposals. + proposals, _ = select_foreground_proposals(instances, self.num_classes) + proposal_boxes = [x.proposal_boxes for x in proposals] + mask_features = self.mask_pooler(features, proposal_boxes) + return self.mask_head(mask_features, proposals) + else: + pred_boxes = [x.pred_boxes for x in instances] + mask_features = self.mask_pooler(features, pred_boxes) + return self.mask_head(mask_features, instances) + + def _forward_keypoint( + self, features: Dict[str, torch.Tensor], instances: List[Instances] + ) -> Union[Dict[str, torch.Tensor], List[Instances]]: + """ + Forward logic of the keypoint prediction branch. + + Args: + features (dict[str, Tensor]): mapping from feature map names to tensor. + Same as in :meth:`ROIHeads.forward`. + instances (list[Instances]): the per-image instances to train/predict keypoints. + In training, they can be the proposals. + In inference, they can be the predicted boxes. + + Returns: + In training, a dict of losses. + In inference, update `instances` with new fields "pred_keypoints" and return it. + """ + if not self.keypoint_on: + return {} if self.training else instances + + features = [features[f] for f in self.keypoint_in_features] + + if self.training: + # The loss is defined on positive proposals with >=1 visible keypoints. + proposals, _ = select_foreground_proposals(instances, self.num_classes) + proposals = select_proposals_with_visible_keypoints(proposals) + proposal_boxes = [x.proposal_boxes for x in proposals] + + keypoint_features = self.keypoint_pooler(features, proposal_boxes) + return self.keypoint_head(keypoint_features, proposals) + else: + pred_boxes = [x.pred_boxes for x in instances] + keypoint_features = self.keypoint_pooler(features, pred_boxes) + return self.keypoint_head(keypoint_features, instances) diff --git a/detectron2/modeling/roi_heads/rotated_fast_rcnn.py b/detectron2/modeling/roi_heads/rotated_fast_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..3d7362d93f9be8d3838c477406540603e81ee0be --- /dev/null +++ b/detectron2/modeling/roi_heads/rotated_fast_rcnn.py @@ -0,0 +1,276 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import numpy as np +import torch + +from detectron2.config import configurable +from detectron2.layers import ShapeSpec, batched_nms_rotated +from detectron2.structures import Instances, RotatedBoxes, pairwise_iou_rotated +from detectron2.utils.events import get_event_storage + +from ..box_regression import Box2BoxTransformRotated +from ..poolers import ROIPooler +from ..proposal_generator.proposal_utils import add_ground_truth_to_proposals +from .box_head import build_box_head +from .fast_rcnn import FastRCNNOutputLayers +from .roi_heads import ROI_HEADS_REGISTRY, StandardROIHeads + +logger = logging.getLogger(__name__) + +""" +Shape shorthand in this module: + + N: number of images in the minibatch + R: number of ROIs, combined over all images, in the minibatch + Ri: number of ROIs in image i + K: number of foreground classes. E.g.,there are 80 foreground classes in COCO. + +Naming convention: + + deltas: refers to the 5-d (dx, dy, dw, dh, da) deltas that parameterize the box2box + transform (see :class:`box_regression.Box2BoxTransformRotated`). + + pred_class_logits: predicted class scores in [-inf, +inf]; use + softmax(pred_class_logits) to estimate P(class). + + gt_classes: ground-truth classification labels in [0, K], where [0, K) represent + foreground object classes and K represents the background class. + + pred_proposal_deltas: predicted rotated box2box transform deltas for transforming proposals + to detection box predictions. + + gt_proposal_deltas: ground-truth rotated box2box transform deltas +""" + + +def fast_rcnn_inference_rotated( + boxes, scores, image_shapes, score_thresh, nms_thresh, topk_per_image +): + """ + Call `fast_rcnn_inference_single_image_rotated` for all images. + + Args: + boxes (list[Tensor]): A list of Tensors of predicted class-specific or class-agnostic + boxes for each image. Element i has shape (Ri, K * 5) if doing + class-specific regression, or (Ri, 5) if doing class-agnostic + regression, where Ri is the number of predicted objects for image i. + This is compatible with the output of :meth:`FastRCNNOutputs.predict_boxes`. + scores (list[Tensor]): A list of Tensors of predicted class scores for each image. + Element i has shape (Ri, K + 1), where Ri is the number of predicted objects + for image i. Compatible with the output of :meth:`FastRCNNOutputs.predict_probs`. + image_shapes (list[tuple]): A list of (width, height) tuples for each image in the batch. + score_thresh (float): Only return detections with a confidence score exceeding this + threshold. + nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. + topk_per_image (int): The number of top scoring detections to return. Set < 0 to return + all detections. + + Returns: + instances: (list[Instances]): A list of N instances, one for each image in the batch, + that stores the topk most confidence detections. + kept_indices: (list[Tensor]): A list of 1D tensor of length of N, each element indicates + the corresponding boxes/scores index in [0, Ri) from the input, for image i. 
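+
+    Example:
+        A minimal sketch with random inputs (the shapes and thresholds below are
+        illustrative only)::
+
+            boxes = [torch.rand(10, 3 * 5)]                   # 1 image, K=3 classes, 5-d boxes
+            scores = [torch.rand(10, 3 + 1).softmax(dim=-1)]  # K+1 columns incl. background
+            instances, kept = fast_rcnn_inference_rotated(
+                boxes, scores, [(480, 640)],
+                score_thresh=0.05, nms_thresh=0.5, topk_per_image=100,
+            )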
+ """ + result_per_image = [ + fast_rcnn_inference_single_image_rotated( + boxes_per_image, scores_per_image, image_shape, score_thresh, nms_thresh, topk_per_image + ) + for scores_per_image, boxes_per_image, image_shape in zip(scores, boxes, image_shapes) + ] + return [x[0] for x in result_per_image], [x[1] for x in result_per_image] + + +def fast_rcnn_inference_single_image_rotated( + boxes, scores, image_shape, score_thresh, nms_thresh, topk_per_image +): + """ + Single-image inference. Return rotated bounding-box detection results by thresholding + on scores and applying rotated non-maximum suppression (Rotated NMS). + + Args: + Same as `fast_rcnn_inference_rotated`, but with rotated boxes, scores, and image shapes + per image. + + Returns: + Same as `fast_rcnn_inference_rotated`, but for only one image. + """ + valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1) + if not valid_mask.all(): + boxes = boxes[valid_mask] + scores = scores[valid_mask] + + B = 5 # box dimension + scores = scores[:, :-1] + num_bbox_reg_classes = boxes.shape[1] // B + # Convert to Boxes to use the `clip` function ... + boxes = RotatedBoxes(boxes.reshape(-1, B)) + boxes.clip(image_shape) + boxes = boxes.tensor.view(-1, num_bbox_reg_classes, B) # R x C x B + # Filter results based on detection scores + filter_mask = scores > score_thresh # R x K + # R' x 2. First column contains indices of the R predictions; + # Second column contains indices of classes. + filter_inds = filter_mask.nonzero() + if num_bbox_reg_classes == 1: + boxes = boxes[filter_inds[:, 0], 0] + else: + boxes = boxes[filter_mask] + scores = scores[filter_mask] + + # Apply per-class Rotated NMS + keep = batched_nms_rotated(boxes, scores, filter_inds[:, 1], nms_thresh) + if topk_per_image >= 0: + keep = keep[:topk_per_image] + boxes, scores, filter_inds = boxes[keep], scores[keep], filter_inds[keep] + + result = Instances(image_shape) + result.pred_boxes = RotatedBoxes(boxes) + result.scores = scores + result.pred_classes = filter_inds[:, 1] + + return result, filter_inds[:, 0] + + +class RotatedFastRCNNOutputLayers(FastRCNNOutputLayers): + """ + Two linear layers for predicting Rotated Fast R-CNN outputs. + """ + + @classmethod + def from_config(cls, cfg, input_shape): + args = super().from_config(cfg, input_shape) + args["box2box_transform"] = Box2BoxTransformRotated( + weights=cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS + ) + return args + + def inference(self, predictions, proposals): + """ + Returns: + list[Instances]: same as `fast_rcnn_inference_rotated`. + list[Tensor]: same as `fast_rcnn_inference_rotated`. + """ + boxes = self.predict_boxes(predictions, proposals) + scores = self.predict_probs(predictions, proposals) + image_shapes = [x.image_size for x in proposals] + + return fast_rcnn_inference_rotated( + boxes, + scores, + image_shapes, + self.test_score_thresh, + self.test_nms_thresh, + self.test_topk_per_image, + ) + + +@ROI_HEADS_REGISTRY.register() +class RROIHeads(StandardROIHeads): + """ + This class is used by Rotated Fast R-CNN to detect rotated boxes. + For now, it only supports box predictions but not mask or keypoints. + """ + + @configurable + def __init__(self, **kwargs): + """ + NOTE: this interface is experimental. + """ + super().__init__(**kwargs) + assert ( + not self.mask_on and not self.keypoint_on + ), "Mask/Keypoints not supported in Rotated ROIHeads." + assert not self.train_on_pred_boxes, "train_on_pred_boxes not implemented for RROIHeads!" 
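+
+    # A minimal config sketch for selecting this head (values are illustrative;
+    # a full rotated setup also needs a rotated proposal generator / anchors):
+    #
+    #     cfg.MODEL.ROI_HEADS.NAME = "RROIHeads"
+    #     cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignRotated"
+    #     cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10.0, 10.0, 5.0, 5.0, 1.0)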
+ + @classmethod + def _init_box_head(cls, cfg, input_shape): + # fmt: off + in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + pooler_resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION + pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) + sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO + pooler_type = cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE + # fmt: on + assert pooler_type in ["ROIAlignRotated"], pooler_type + # assume all channel counts are equal + in_channels = [input_shape[f].channels for f in in_features][0] + + box_pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type=pooler_type, + ) + box_head = build_box_head( + cfg, ShapeSpec(channels=in_channels, height=pooler_resolution, width=pooler_resolution) + ) + # This line is the only difference v.s. StandardROIHeads + box_predictor = RotatedFastRCNNOutputLayers(cfg, box_head.output_shape) + return { + "box_in_features": in_features, + "box_pooler": box_pooler, + "box_head": box_head, + "box_predictor": box_predictor, + } + + @torch.no_grad() + def label_and_sample_proposals(self, proposals, targets): + """ + Prepare some proposals to be used to train the RROI heads. + It performs box matching between `proposals` and `targets`, and assigns + training labels to the proposals. + It returns `self.batch_size_per_image` random samples from proposals and groundtruth boxes, + with a fraction of positives that is no larger than `self.positive_sample_fraction. + + Args: + See :meth:`StandardROIHeads.forward` + + Returns: + list[Instances]: length `N` list of `Instances`s containing the proposals + sampled for training. Each `Instances` has the following fields: + - proposal_boxes: the rotated proposal boxes + - gt_boxes: the ground-truth rotated boxes that the proposal is assigned to + (this is only meaningful if the proposal has a label > 0; if label = 0 + then the ground-truth box is random) + - gt_classes: the ground-truth classification lable for each proposal + """ + gt_boxes = [x.gt_boxes for x in targets] + if self.proposal_append_gt: + proposals = add_ground_truth_to_proposals(gt_boxes, proposals) + + proposals_with_gt = [] + + num_fg_samples = [] + num_bg_samples = [] + for proposals_per_image, targets_per_image in zip(proposals, targets): + has_gt = len(targets_per_image) > 0 + match_quality_matrix = pairwise_iou_rotated( + targets_per_image.gt_boxes, proposals_per_image.proposal_boxes + ) + matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix) + sampled_idxs, gt_classes = self._sample_proposals( + matched_idxs, matched_labels, targets_per_image.gt_classes + ) + + proposals_per_image = proposals_per_image[sampled_idxs] + proposals_per_image.gt_classes = gt_classes + + if has_gt: + sampled_targets = matched_idxs[sampled_idxs] + proposals_per_image.gt_boxes = targets_per_image.gt_boxes[sampled_targets] + else: + gt_boxes = RotatedBoxes( + targets_per_image.gt_boxes.tensor.new_zeros((len(sampled_idxs), 5)) + ) + proposals_per_image.gt_boxes = gt_boxes + + num_bg_samples.append((gt_classes == self.num_classes).sum().item()) + num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) + proposals_with_gt.append(proposals_per_image) + + # Log the number of fg/bg samples that are selected for training ROI heads + storage = get_event_storage() + storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) + storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) + + return proposals_with_gt diff 
--git a/detectron2/modeling/sampling.py b/detectron2/modeling/sampling.py new file mode 100644 index 0000000000000000000000000000000000000000..ecf251a2fa301d9e31eee7d3ba5dc6eaab1732f8 --- /dev/null +++ b/detectron2/modeling/sampling.py @@ -0,0 +1,50 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import torch + +__all__ = ["subsample_labels"] + + +def subsample_labels(labels, num_samples, positive_fraction, bg_label): + """ + Return `num_samples` (or fewer, if not enough found) + random samples from `labels` which is a mixture of positives & negatives. + It will try to return as many positives as possible without + exceeding `positive_fraction * num_samples`, and then try to + fill the remaining slots with negatives. + + Args: + labels (Tensor): (N, ) label vector with values: + * -1: ignore + * bg_label: background ("negative") class + * otherwise: one or more foreground ("positive") classes + num_samples (int): The total number of labels with value >= 0 to return. + Values that are not sampled will be filled with -1 (ignore). + positive_fraction (float): The number of subsampled labels with values > 0 + is `min(num_positives, int(positive_fraction * num_samples))`. The number + of negatives sampled is `min(num_negatives, num_samples - num_positives_sampled)`. + In order words, if there are not enough positives, the sample is filled with + negatives. If there are also not enough negatives, then as many elements are + sampled as is possible. + bg_label (int): label index of background ("negative") class. + + Returns: + pos_idx, neg_idx (Tensor): + 1D vector of indices. The total length of both is `num_samples` or fewer. + """ + positive = torch.nonzero((labels != -1) & (labels != bg_label), as_tuple=True)[0] + negative = torch.nonzero(labels == bg_label, as_tuple=True)[0] + + num_pos = int(num_samples * positive_fraction) + # protect against not enough positive examples + num_pos = min(positive.numel(), num_pos) + num_neg = num_samples - num_pos + # protect against not enough negative examples + num_neg = min(negative.numel(), num_neg) + + # randomly select positive and negative examples + perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos] + perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg] + + pos_idx = positive[perm1] + neg_idx = negative[perm2] + return pos_idx, neg_idx diff --git a/detectron2/modeling/test_time_augmentation.py b/detectron2/modeling/test_time_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..1e5bcf02f655956f76eb78fb7de36d691de6a53c --- /dev/null +++ b/detectron2/modeling/test_time_augmentation.py @@ -0,0 +1,285 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import numpy as np +from contextlib import contextmanager +from itertools import count +import torch +from torch import nn +from torch.nn.parallel import DistributedDataParallel + +from detectron2.data.detection_utils import read_image +from detectron2.data.transforms import ResizeShortestEdge +from detectron2.structures import Instances + +from .meta_arch import GeneralizedRCNN +from .postprocessing import detector_postprocess +from .roi_heads.fast_rcnn import fast_rcnn_inference_single_image + +__all__ = ["DatasetMapperTTA", "GeneralizedRCNNWithTTA"] + + +class DatasetMapperTTA: + """ + Implement test-time augmentation for detection data. 
+ It is a callable which takes a dataset dict from a detection dataset, + and returns a list of dataset dicts where the images + are augmented from the input image by the transformations defined in the config. + This is used for test-time augmentation. + """ + + def __init__(self, cfg): + self.min_sizes = cfg.TEST.AUG.MIN_SIZES + self.max_size = cfg.TEST.AUG.MAX_SIZE + self.flip = cfg.TEST.AUG.FLIP + self.image_format = cfg.INPUT.FORMAT + + def __call__(self, dataset_dict): + """ + Args: + dict: a detection dataset dict + + Returns: + list[dict]: + a list of dataset dicts, which contain augmented version of the input image. + The total number of dicts is ``len(min_sizes) * (2 if flip else 1)``. + """ + ret = [] + if "image" not in dataset_dict: + numpy_image = read_image(dataset_dict["file_name"], self.image_format) + else: + numpy_image = dataset_dict["image"].permute(1, 2, 0).numpy().astype("uint8") + for min_size in self.min_sizes: + image = np.copy(numpy_image) + tfm = ResizeShortestEdge(min_size, self.max_size).get_transform(image) + resized = tfm.apply_image(image) + resized = torch.as_tensor(resized.transpose(2, 0, 1).astype("float32")) + + dic = copy.deepcopy(dataset_dict) + dic["horiz_flip"] = False + dic["image"] = resized + ret.append(dic) + + if self.flip: + dic = copy.deepcopy(dataset_dict) + dic["horiz_flip"] = True + dic["image"] = torch.flip(resized, dims=[2]) + ret.append(dic) + return ret + + +class GeneralizedRCNNWithTTA(nn.Module): + """ + A GeneralizedRCNN with test-time augmentation enabled. + Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`. + """ + + def __init__(self, cfg, model, tta_mapper=None, batch_size=3): + """ + Args: + cfg (CfgNode): + model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on. + tta_mapper (callable): takes a dataset dict and returns a list of + augmented versions of the dataset dict. Defaults to + `DatasetMapperTTA(cfg)`. + batch_size (int): batch the augmented images into this batch size for inference. + """ + super().__init__() + if isinstance(model, DistributedDataParallel): + model = model.module + assert isinstance( + model, GeneralizedRCNN + ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model)) + self.cfg = cfg.clone() + assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet" + assert ( + not self.cfg.MODEL.LOAD_PROPOSALS + ), "TTA for pre-computed proposals is not supported yet" + + self.model = model + + if tta_mapper is None: + tta_mapper = DatasetMapperTTA(cfg) + self.tta_mapper = tta_mapper + self.batch_size = batch_size + + @contextmanager + def _turn_off_roi_heads(self, attrs): + """ + Open a context where some heads in `model.roi_heads` are temporarily turned off. + Args: + attr (list[str]): the attribute in `model.roi_heads` which can be used + to turn off a specific head, e.g., "mask_on", "keypoint_on". + """ + roi_heads = self.model.roi_heads + old = {} + for attr in attrs: + try: + old[attr] = getattr(roi_heads, attr) + except AttributeError: + # The head may not be implemented in certain ROIHeads + pass + + if len(old.keys()) == 0: + yield + else: + for attr in old.keys(): + setattr(roi_heads, attr, False) + yield + for attr in old.keys(): + setattr(roi_heads, attr, old[attr]) + + def _batch_inference(self, batched_inputs, detected_instances=None, do_postprocess=True): + """ + Execute inference on a list of inputs, + using batch size = self.batch_size, instead of the length of the list. 
+ + Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference` + """ + if detected_instances is None: + detected_instances = [None] * len(batched_inputs) + + outputs = [] + inputs, instances = [], [] + for idx, input, instance in zip(count(), batched_inputs, detected_instances): + inputs.append(input) + instances.append(instance) + if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1: + outputs.extend( + self.model.inference( + inputs, + instances if instances[0] is not None else None, + do_postprocess=do_postprocess, + ) + ) + inputs, instances = [], [] + return outputs + + def __call__(self, batched_inputs): + """ + Same input/output format as :meth:`GeneralizedRCNN.forward` + """ + return [self._inference_one_image(x) for x in batched_inputs] + + def _detector_postprocess(self, outputs, aug_vars): + return detector_postprocess(outputs, aug_vars["height"], aug_vars["width"]) + + def _inference_one_image(self, input): + """ + Args: + input (dict): one dataset dict + + Returns: + dict: one output dict + """ + + augmented_inputs, aug_vars = self._get_augmented_inputs(input) + # Detect boxes from all augmented versions + with self._turn_off_roi_heads(["mask_on", "keypoint_on"]): + # temporarily disable roi heads + all_boxes, all_scores, all_classes = self._get_augmented_boxes( + augmented_inputs, aug_vars + ) + merged_instances = self._merge_detections( + all_boxes, all_scores, all_classes, (aug_vars["height"], aug_vars["width"]) + ) + + if self.cfg.MODEL.MASK_ON: + # Use the detected boxes to obtain new fields + augmented_instances = self._rescale_detected_boxes( + augmented_inputs, merged_instances, aug_vars + ) + # run forward on the detected boxes + outputs = self._batch_inference( + augmented_inputs, augmented_instances, do_postprocess=False + ) + # Delete now useless variables to avoid being out of memory + del augmented_inputs, augmented_instances, merged_instances + # average the predictions + outputs[0].pred_masks = self._reduce_pred_masks(outputs, aug_vars) + # postprocess + output = self._detector_postprocess(outputs[0], aug_vars) + return {"instances": output} + else: + return {"instances": merged_instances} + + def _get_augmented_inputs(self, input): + augmented_inputs = self.tta_mapper(input) + + do_hflip = [k.pop("horiz_flip", False) for k in augmented_inputs] + heights = [k["height"] for k in augmented_inputs] + widths = [k["width"] for k in augmented_inputs] + assert ( + len(set(heights)) == 1 and len(set(widths)) == 1 + ), "Augmented version of the inputs should have the same original resolution!" 
+ height = heights[0] + width = widths[0] + aug_vars = {"height": height, "width": width, "do_hflip": do_hflip} + + return augmented_inputs, aug_vars + + def _get_augmented_boxes(self, augmented_inputs, aug_vars): + # 1: forward with all augmented images + outputs = self._batch_inference(augmented_inputs, do_postprocess=False) + # 2: union the results + all_boxes = [] + all_scores = [] + all_classes = [] + for idx, output in enumerate(outputs): + rescaled_output = self._detector_postprocess(output, aug_vars) + pred_boxes = rescaled_output.pred_boxes.tensor + if aug_vars["do_hflip"][idx]: + pred_boxes[:, [0, 2]] = aug_vars["width"] - pred_boxes[:, [2, 0]] + all_boxes.append(pred_boxes) + all_scores.extend(rescaled_output.scores) + all_classes.extend(rescaled_output.pred_classes) + all_boxes = torch.cat(all_boxes, dim=0).cpu() + return all_boxes, all_scores, all_classes + + def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw): + # select from the union of all results + num_boxes = len(all_boxes) + num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES + # +1 because fast_rcnn_inference expects background scores as well + all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device) + for idx, cls, score in zip(count(), all_classes, all_scores): + all_scores_2d[idx, cls] = score + + merged_instances, _ = fast_rcnn_inference_single_image( + all_boxes, + all_scores_2d, + shape_hw, + 1e-8, + self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST, + self.cfg.TEST.DETECTIONS_PER_IMAGE, + ) + + return merged_instances + + def _rescale_detected_boxes(self, augmented_inputs, merged_instances, aug_vars): + augmented_instances = [] + for idx, input in enumerate(augmented_inputs): + actual_height, actual_width = input["image"].shape[1:3] + scale_x = actual_width * 1.0 / aug_vars["width"] + scale_y = actual_height * 1.0 / aug_vars["height"] + pred_boxes = merged_instances.pred_boxes.clone() + pred_boxes.tensor[:, 0::2] *= scale_x + pred_boxes.tensor[:, 1::2] *= scale_y + if aug_vars["do_hflip"][idx]: + pred_boxes.tensor[:, [0, 2]] = actual_width - pred_boxes.tensor[:, [2, 0]] + + aug_instances = Instances( + image_size=(actual_height, actual_width), + pred_boxes=pred_boxes, + pred_classes=merged_instances.pred_classes, + scores=merged_instances.scores, + ) + augmented_instances.append(aug_instances) + return augmented_instances + + def _reduce_pred_masks(self, outputs, aug_vars): + for idx, output in enumerate(outputs): + if aug_vars["do_hflip"][idx]: + output.pred_masks = output.pred_masks.flip(dims=[3]) + all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0) + avg_pred_masks = torch.mean(all_pred_masks, dim=0) + return avg_pred_masks diff --git a/detectron2/solver/__init__.py b/detectron2/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..10f84e12d029a07d5c7d3ac29e18b572a92ef03c --- /dev/null +++ b/detectron2/solver/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .build import build_lr_scheduler, build_optimizer +from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/detectron2/solver/build.py b/detectron2/solver/build.py new file mode 100644 index 0000000000000000000000000000000000000000..6d9d0ee5df1a6135c1a3df0151dfe0e36aa9971a --- /dev/null +++ b/detectron2/solver/build.py @@ -0,0 +1,165 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from enum import Enum +from typing import Any, Callable, Dict, Iterable, List, Set, Type, Union +import torch + +from detectron2.config import CfgNode + +from .lr_scheduler import WarmupCosineLR, WarmupMultiStepLR + +_GradientClipperInput = Union[torch.Tensor, Iterable[torch.Tensor]] +_GradientClipper = Callable[[_GradientClipperInput], None] + + +class GradientClipType(Enum): + VALUE = "value" + NORM = "norm" + + +def _create_gradient_clipper(cfg: CfgNode) -> _GradientClipper: + """ + Creates gradient clipping closure to clip by value or by norm, + according to the provided config. + """ + cfg = cfg.clone() + + def clip_grad_norm(p: _GradientClipperInput): + torch.nn.utils.clip_grad_norm_(p, cfg.CLIP_VALUE, cfg.NORM_TYPE) + + def clip_grad_value(p: _GradientClipperInput): + torch.nn.utils.clip_grad_value_(p, cfg.CLIP_VALUE) + + _GRADIENT_CLIP_TYPE_TO_CLIPPER = { + GradientClipType.VALUE: clip_grad_value, + GradientClipType.NORM: clip_grad_norm, + } + return _GRADIENT_CLIP_TYPE_TO_CLIPPER[GradientClipType(cfg.CLIP_TYPE)] + + +def _generate_optimizer_class_with_gradient_clipping( + optimizer_type: Type[torch.optim.Optimizer], gradient_clipper: _GradientClipper +) -> Type[torch.optim.Optimizer]: + """ + Dynamically creates a new type that inherits the type of a given instance + and overrides the `step` method to add gradient clipping + """ + + def optimizer_wgc_step(self, closure=None): + for group in self.param_groups: + for p in group["params"]: + gradient_clipper(p) + super(type(self), self).step(closure) + + OptimizerWithGradientClip = type( + optimizer_type.__name__ + "WithGradientClip", + (optimizer_type,), + {"step": optimizer_wgc_step}, + ) + return OptimizerWithGradientClip + + +def maybe_add_gradient_clipping( + cfg: CfgNode, optimizer: torch.optim.Optimizer +) -> torch.optim.Optimizer: + """ + If gradient clipping is enabled through config options, wraps the existing + optimizer instance of some type OptimizerType to become an instance + of the new dynamically created class OptimizerTypeWithGradientClip + that inherits OptimizerType and overrides the `step` method to + include gradient clipping. + + Args: + cfg: CfgNode + configuration options + optimizer: torch.optim.Optimizer + existing optimizer instance + + Return: + optimizer: torch.optim.Optimizer + either the unmodified optimizer instance (if gradient clipping is + disabled), or the same instance with adjusted __class__ to override + the `step` method and include gradient clipping + """ + if not cfg.SOLVER.CLIP_GRADIENTS.ENABLED: + return optimizer + grad_clipper = _create_gradient_clipper(cfg.SOLVER.CLIP_GRADIENTS) + OptimizerWithGradientClip = _generate_optimizer_class_with_gradient_clipping( + type(optimizer), grad_clipper + ) + optimizer.__class__ = OptimizerWithGradientClip + return optimizer + + +def build_optimizer(cfg: CfgNode, model: torch.nn.Module) -> torch.optim.Optimizer: + """ + Build an optimizer from config. 
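+
+    Example (a minimal sketch; a toy module and the default config stand in for a
+    real model and a real training config)::
+
+        from detectron2.config import get_cfg
+        cfg = get_cfg()
+        model = torch.nn.Linear(8, 8)
+        optimizer = build_optimizer(cfg, model)         # SGD with per-param lr / weight decay
+        scheduler = build_lr_scheduler(cfg, optimizer)  # WarmupMultiStepLR by default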
+ """ + norm_module_types = ( + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.SyncBatchNorm, + # NaiveSyncBatchNorm inherits from BatchNorm2d + torch.nn.GroupNorm, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.LocalResponseNorm, + ) + params: List[Dict[str, Any]] = [] + memo: Set[torch.nn.parameter.Parameter] = set() + for module in model.modules(): + for key, value in module.named_parameters(recurse=False): + if not value.requires_grad: + continue + # Avoid duplicating parameters + if value in memo: + continue + memo.add(value) + lr = cfg.SOLVER.BASE_LR + weight_decay = cfg.SOLVER.WEIGHT_DECAY + if isinstance(module, norm_module_types): + weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM + elif key == "bias": + # NOTE: unlike Detectron v1, we now default BIAS_LR_FACTOR to 1.0 + # and WEIGHT_DECAY_BIAS to WEIGHT_DECAY so that bias optimizer + # hyperparameters are by default exactly the same as for regular + # weights. + lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR + weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS + params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] + + optimizer = torch.optim.SGD( + params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM, nesterov=cfg.SOLVER.NESTEROV + ) + optimizer = maybe_add_gradient_clipping(cfg, optimizer) + return optimizer + + +def build_lr_scheduler( + cfg: CfgNode, optimizer: torch.optim.Optimizer +) -> torch.optim.lr_scheduler._LRScheduler: + """ + Build a LR scheduler from config. + """ + name = cfg.SOLVER.LR_SCHEDULER_NAME + if name == "WarmupMultiStepLR": + return WarmupMultiStepLR( + optimizer, + cfg.SOLVER.STEPS, + cfg.SOLVER.GAMMA, + warmup_factor=cfg.SOLVER.WARMUP_FACTOR, + warmup_iters=cfg.SOLVER.WARMUP_ITERS, + warmup_method=cfg.SOLVER.WARMUP_METHOD, + ) + elif name == "WarmupCosineLR": + return WarmupCosineLR( + optimizer, + cfg.SOLVER.MAX_ITER, + warmup_factor=cfg.SOLVER.WARMUP_FACTOR, + warmup_iters=cfg.SOLVER.WARMUP_ITERS, + warmup_method=cfg.SOLVER.WARMUP_METHOD, + ) + else: + raise ValueError("Unknown LR scheduler: {}".format(name)) diff --git a/detectron2/solver/lr_scheduler.py b/detectron2/solver/lr_scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..6148d86785dae03ed2611792fb28da387d1103b8 --- /dev/null +++ b/detectron2/solver/lr_scheduler.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +from bisect import bisect_right +from typing import List +import torch + +# NOTE: PyTorch's LR scheduler interface uses names that assume the LR changes +# only on epoch boundaries. We typically use iteration based schedules instead. +# As a result, "epoch" (e.g., as in self.last_epoch) should be understood to mean +# "iteration" instead. + +# FIXME: ideally this would be achieved with a CombinedLRScheduler, separating +# MultiStepLR with WarmupLR but the current LRScheduler design doesn't allow it. + + +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer: torch.optim.Optimizer, + milestones: List[int], + gamma: float = 0.1, + warmup_factor: float = 0.001, + warmup_iters: int = 1000, + warmup_method: str = "linear", + last_epoch: int = -1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + "Milestones should be a list of" " increasing integers. 
Got {}", milestones + ) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super().__init__(optimizer, last_epoch) + + def get_lr(self) -> List[float]: + warmup_factor = _get_warmup_factor_at_iter( + self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor + ) + return [ + base_lr * warmup_factor * self.gamma ** bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] + + def _compute_values(self) -> List[float]: + # The new interface + return self.get_lr() + + +class WarmupCosineLR(torch.optim.lr_scheduler._LRScheduler): + def __init__( + self, + optimizer: torch.optim.Optimizer, + max_iters: int, + warmup_factor: float = 0.001, + warmup_iters: int = 1000, + warmup_method: str = "linear", + last_epoch: int = -1, + ): + self.max_iters = max_iters + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super().__init__(optimizer, last_epoch) + + def get_lr(self) -> List[float]: + warmup_factor = _get_warmup_factor_at_iter( + self.warmup_method, self.last_epoch, self.warmup_iters, self.warmup_factor + ) + # Different definitions of half-cosine with warmup are possible. For + # simplicity we multiply the standard half-cosine schedule by the warmup + # factor. An alternative is to start the period of the cosine at warmup_iters + # instead of at 0. In the case that warmup_iters << max_iters the two are + # very close to each other. + return [ + base_lr + * warmup_factor + * 0.5 + * (1.0 + math.cos(math.pi * self.last_epoch / self.max_iters)) + for base_lr in self.base_lrs + ] + + def _compute_values(self) -> List[float]: + # The new interface + return self.get_lr() + + +def _get_warmup_factor_at_iter( + method: str, iter: int, warmup_iters: int, warmup_factor: float +) -> float: + """ + Return the learning rate warmup factor at a specific iteration. + See :paper:`in1k1h` for more details. + + Args: + method (str): warmup method; either "constant" or "linear". + iter (int): iteration at which to calculate the warmup factor. + warmup_iters (int): the number of warmup iterations. + warmup_factor (float): the base warmup factor (the meaning changes according + to the method used). + + Returns: + float: the effective warmup factor at the given iteration. + """ + if iter >= warmup_iters: + return 1.0 + + if method == "constant": + return warmup_factor + elif method == "linear": + alpha = iter / warmup_iters + return warmup_factor * (1 - alpha) + alpha + else: + raise ValueError("Unknown warmup method: {}".format(method)) diff --git a/detectron2/structures/__init__.py b/detectron2/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..618f526753b5813b86645023271b67b421ea4cb5 --- /dev/null +++ b/detectron2/structures/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from .boxes import Boxes, BoxMode, pairwise_iou +from .image_list import ImageList + +from .instances import Instances +from .keypoints import Keypoints, heatmaps_to_keypoints +from .masks import BitMasks, PolygonMasks, rasterize_polygons_within_box, polygons_to_bitmask +from .rotated_boxes import RotatedBoxes +from .rotated_boxes import pairwise_iou as pairwise_iou_rotated + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/detectron2/structures/boxes.py b/detectron2/structures/boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..e625803e23ec6c0f71ada847ba7bef8e15c8fa40 --- /dev/null +++ b/detectron2/structures/boxes.py @@ -0,0 +1,367 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +import numpy as np +from enum import IntEnum, unique +from typing import Iterator, List, Tuple, Union +import torch + +_RawBoxType = Union[List[float], Tuple[float, ...], torch.Tensor, np.ndarray] + + +@unique +class BoxMode(IntEnum): + """ + Enum of different ways to represent a box. + """ + + XYXY_ABS = 0 + """ + (x0, y0, x1, y1) in absolute floating points coordinates. + The coordinates in range [0, width or height]. + """ + XYWH_ABS = 1 + """ + (x0, y0, w, h) in absolute floating points coordinates. + """ + XYXY_REL = 2 + """ + Not yet supported! + (x0, y0, x1, y1) in range [0, 1]. They are relative to the size of the image. + """ + XYWH_REL = 3 + """ + Not yet supported! + (x0, y0, w, h) in range [0, 1]. They are relative to the size of the image. + """ + XYWHA_ABS = 4 + """ + (xc, yc, w, h, a) in absolute floating points coordinates. + (xc, yc) is the center of the rotated box, and the angle a is in degrees ccw. + """ + + @staticmethod + def convert(box: _RawBoxType, from_mode: "BoxMode", to_mode: "BoxMode") -> _RawBoxType: + """ + Args: + box: can be a k-tuple, k-list or an Nxk array/tensor, where k = 4 or 5 + from_mode, to_mode (BoxMode) + + Returns: + The converted box of the same type. + """ + if from_mode == to_mode: + return box + + original_type = type(box) + is_numpy = isinstance(box, np.ndarray) + single_box = isinstance(box, (list, tuple)) + if single_box: + assert len(box) == 4 or len(box) == 5, ( + "BoxMode.convert takes either a k-tuple/list or an Nxk array/tensor," + " where k == 4 or 5" + ) + arr = torch.tensor(box)[None, :] + else: + # avoid modifying the input box + if is_numpy: + arr = torch.from_numpy(np.asarray(box)).clone() + else: + arr = box.clone() + + assert to_mode.value not in [ + BoxMode.XYXY_REL, + BoxMode.XYWH_REL, + ] and from_mode.value not in [ + BoxMode.XYXY_REL, + BoxMode.XYWH_REL, + ], "Relative mode not yet supported!" 
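+
+        # Worked example (illustrative numbers):
+        #     BoxMode.convert([10, 10, 20, 30], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+        # returns [10, 10, 30, 40], i.e. (x1, y1) = (x0 + w, y0 + h).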
+ + if from_mode == BoxMode.XYWHA_ABS and to_mode == BoxMode.XYXY_ABS: + assert ( + arr.shape[-1] == 5 + ), "The last dimension of input shape must be 5 for XYWHA format" + original_dtype = arr.dtype + arr = arr.double() + + w = arr[:, 2] + h = arr[:, 3] + a = arr[:, 4] + c = torch.abs(torch.cos(a * math.pi / 180.0)) + s = torch.abs(torch.sin(a * math.pi / 180.0)) + # This basically computes the horizontal bounding rectangle of the rotated box + new_w = c * w + s * h + new_h = c * h + s * w + + # convert center to top-left corner + arr[:, 0] -= new_w / 2.0 + arr[:, 1] -= new_h / 2.0 + # bottom-right corner + arr[:, 2] = arr[:, 0] + new_w + arr[:, 3] = arr[:, 1] + new_h + + arr = arr[:, :4].to(dtype=original_dtype) + elif from_mode == BoxMode.XYWH_ABS and to_mode == BoxMode.XYWHA_ABS: + original_dtype = arr.dtype + arr = arr.double() + arr[:, 0] += arr[:, 2] / 2.0 + arr[:, 1] += arr[:, 3] / 2.0 + angles = torch.zeros((arr.shape[0], 1), dtype=arr.dtype) + arr = torch.cat((arr, angles), axis=1).to(dtype=original_dtype) + else: + if to_mode == BoxMode.XYXY_ABS and from_mode == BoxMode.XYWH_ABS: + arr[:, 2] += arr[:, 0] + arr[:, 3] += arr[:, 1] + elif from_mode == BoxMode.XYXY_ABS and to_mode == BoxMode.XYWH_ABS: + arr[:, 2] -= arr[:, 0] + arr[:, 3] -= arr[:, 1] + else: + raise NotImplementedError( + "Conversion from BoxMode {} to {} is not supported yet".format( + from_mode, to_mode + ) + ) + + if single_box: + return original_type(arr.flatten().tolist()) + if is_numpy: + return arr.numpy() + else: + return arr + + +class Boxes: + """ + This structure stores a list of boxes as a Nx4 torch.Tensor. + It supports some common methods about boxes + (`area`, `clip`, `nonempty`, etc), + and also behaves like a Tensor + (support indexing, `to(device)`, `.device`, and iteration over all boxes) + + Attributes: + tensor (torch.Tensor): float matrix of Nx4. Each row is (x1, y1, x2, y2). + """ + + BoxSizeType = Union[List[int], Tuple[int, int]] + + def __init__(self, tensor: torch.Tensor): + """ + Args: + tensor (Tensor[float]): a Nx4 matrix. Each row is (x1, y1, x2, y2). + """ + device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does not depend on + # the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, 4)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == 4, tensor.size() + + self.tensor = tensor + + def clone(self) -> "Boxes": + """ + Clone the Boxes. + + Returns: + Boxes + """ + return Boxes(self.tensor.clone()) + + def to(self, device: str) -> "Boxes": + return Boxes(self.tensor.to(device)) + + def area(self) -> torch.Tensor: + """ + Computes the area of all the boxes. + + Returns: + torch.Tensor: a vector with areas of each box. + """ + box = self.tensor + area = (box[:, 2] - box[:, 0]) * (box[:, 3] - box[:, 1]) + return area + + def clip(self, box_size: BoxSizeType) -> None: + """ + Clip (in place) the boxes by limiting x coordinates to the range [0, width] + and y coordinates to the range [0, height]. + + Args: + box_size (height, width): The clipping box's size. + """ + assert torch.isfinite(self.tensor).all(), "Box tensor contains infinite or NaN!" 
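+        # e.g. with box_size = (480, 640), a box [-5., 2., 700., 300.] is clamped
+        # in place to [0., 2., 640., 300.].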
+ h, w = box_size + self.tensor[:, 0].clamp_(min=0, max=w) + self.tensor[:, 1].clamp_(min=0, max=h) + self.tensor[:, 2].clamp_(min=0, max=w) + self.tensor[:, 3].clamp_(min=0, max=h) + + def nonempty(self, threshold: float = 0.0) -> torch.Tensor: + """ + Find boxes that are non-empty. + A box is considered empty, if either of its side is no larger than threshold. + + Returns: + Tensor: + a binary vector which represents whether each box is empty + (False) or non-empty (True). + """ + box = self.tensor + widths = box[:, 2] - box[:, 0] + heights = box[:, 3] - box[:, 1] + keep = (widths > threshold) & (heights > threshold) + return keep + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Boxes": + """ + Returns: + Boxes: Create a new :class:`Boxes` by indexing. + + The following usage are allowed: + + 1. `new_boxes = boxes[3]`: return a `Boxes` which contains only one box. + 2. `new_boxes = boxes[2:10]`: return a slice of boxes. + 3. `new_boxes = boxes[vector]`, where vector is a torch.BoolTensor + with `length = len(boxes)`. Nonzero elements in the vector will be selected. + + Note that the returned Boxes might share storage with this Boxes, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return Boxes(self.tensor[item].view(1, -1)) + b = self.tensor[item] + assert b.dim() == 2, "Indexing on Boxes with {} failed to return a matrix!".format(item) + return Boxes(b) + + def __len__(self) -> int: + return self.tensor.shape[0] + + def __repr__(self) -> str: + return "Boxes(" + str(self.tensor) + ")" + + def inside_box(self, box_size: BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor: + """ + Args: + box_size (height, width): Size of the reference box. + boundary_threshold (int): Boxes that extend beyond the reference box + boundary by more than boundary_threshold are considered "outside". + + Returns: + a binary vector, indicating whether each box is inside the reference box. + """ + height, width = box_size + inds_inside = ( + (self.tensor[..., 0] >= -boundary_threshold) + & (self.tensor[..., 1] >= -boundary_threshold) + & (self.tensor[..., 2] < width + boundary_threshold) + & (self.tensor[..., 3] < height + boundary_threshold) + ) + return inds_inside + + def get_centers(self) -> torch.Tensor: + """ + Returns: + The box centers in a Nx2 array of (x, y). + """ + return (self.tensor[:, :2] + self.tensor[:, 2:]) / 2 + + def scale(self, scale_x: float, scale_y: float) -> None: + """ + Scale the box with horizontal and vertical scaling factors + """ + self.tensor[:, 0::2] *= scale_x + self.tensor[:, 1::2] *= scale_y + + @classmethod + def cat(cls, boxes_list: List["Boxes"]) -> "Boxes": + """ + Concatenates a list of Boxes into a single Boxes + + Arguments: + boxes_list (list[Boxes]) + + Returns: + Boxes: the concatenated Boxes + """ + assert isinstance(boxes_list, (list, tuple)) + if len(boxes_list) == 0: + return cls(torch.empty(0)) + assert all(isinstance(box, Boxes) for box in boxes_list) + + # use torch.cat (v.s. layers.cat) so the returned boxes never share storage with input + cat_boxes = cls(torch.cat([b.tensor for b in boxes_list], dim=0)) + return cat_boxes + + @property + def device(self) -> torch.device: + return self.tensor.device + + def __iter__(self) -> Iterator[torch.Tensor]: + """ + Yield a box as a Tensor of shape (4,) at a time. 
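+
+        Example (a minimal sketch)::
+
+            boxes = Boxes(torch.tensor([[0., 0., 10., 10.], [5., 5., 20., 20.]]))
+            for box in boxes:
+                print(box.shape)   # torch.Size([4])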
+ """ + yield from self.tensor + + +# implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py +# with slight modifications +def pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: + """ + Given two lists of boxes of size N and M, + compute the IoU (intersection over union) + between __all__ N x M pairs of boxes. + The box order must be (xmin, ymin, xmax, ymax). + + Args: + boxes1,boxes2 (Boxes): two `Boxes`. Contains N & M boxes, respectively. + + Returns: + Tensor: IoU, sized [N,M]. + """ + area1 = boxes1.area() + area2 = boxes2.area() + + boxes1, boxes2 = boxes1.tensor, boxes2.tensor + + width_height = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) - torch.max( + boxes1[:, None, :2], boxes2[:, :2] + ) # [N,M,2] + + width_height.clamp_(min=0) # [N,M,2] + inter = width_height.prod(dim=2) # [N,M] + del width_height + + # handle empty boxes + iou = torch.where( + inter > 0, + inter / (area1[:, None] + area2 - inter), + torch.zeros(1, dtype=inter.dtype, device=inter.device), + ) + return iou + + +def matched_boxlist_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor: + """ + Compute pairwise intersection over union (IOU) of two sets of matched + boxes. The box order must be (xmin, ymin, xmax, ymax). + Similar to boxlist_iou, but computes only diagonal elements of the matrix + Arguments: + boxes1: (Boxes) bounding boxes, sized [N,4]. + boxes2: (Boxes) bounding boxes, sized [N,4]. + Returns: + (tensor) iou, sized [N]. + """ + assert len(boxes1) == len( + boxes2 + ), "boxlists should have the same" "number of entries, got {}, {}".format( + len(boxes1), len(boxes2) + ) + area1 = boxes1.area() # [N] + area2 = boxes2.area() # [N] + box1, box2 = boxes1.tensor, boxes2.tensor + lt = torch.max(box1[:, :2], box2[:, :2]) # [N,2] + rb = torch.min(box1[:, 2:], box2[:, 2:]) # [N,2] + wh = (rb - lt).clamp(min=0) # [N,2] + inter = wh[:, 0] * wh[:, 1] # [N] + iou = inter / (area1 + area2 - inter) # [N] + return iou diff --git a/detectron2/structures/image_list.py b/detectron2/structures/image_list.py new file mode 100644 index 0000000000000000000000000000000000000000..2d89224b64402badf7f0b113188b5f653df912ac --- /dev/null +++ b/detectron2/structures/image_list.py @@ -0,0 +1,113 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +from __future__ import division +from typing import Any, List, Sequence, Tuple, Union +import torch +from torch.nn import functional as F + + +class ImageList(object): + """ + Structure that holds a list of images (of possibly + varying sizes) as a single tensor. + This works by padding the images to the same size, + and storing in a field the original sizes of each image + + Attributes: + image_sizes (list[tuple[int, int]]): each tuple is (h, w) + """ + + def __init__(self, tensor: torch.Tensor, image_sizes: List[Tuple[int, int]]): + """ + Arguments: + tensor (Tensor): of shape (N, H, W) or (N, C_1, ..., C_K, H, W) where K >= 1 + image_sizes (list[tuple[int, int]]): Each tuple is (h, w). It can + be smaller than (H, W) due to padding. + """ + self.tensor = tensor + self.image_sizes = image_sizes + + def __len__(self) -> int: + return len(self.image_sizes) + + def __getitem__(self, idx: Union[int, slice]) -> torch.Tensor: + """ + Access the individual image in its original size. 
+ + Returns: + Tensor: an image of shape (H, W) or (C_1, ..., C_K, H, W) where K >= 1 + """ + size = self.image_sizes[idx] + return self.tensor[idx, ..., : size[0], : size[1]] # type: ignore + + def to(self, *args: Any, **kwargs: Any) -> "ImageList": + cast_tensor = self.tensor.to(*args, **kwargs) + return ImageList(cast_tensor, self.image_sizes) + + @property + def device(self) -> torch.device: + return self.tensor.device + + @staticmethod + def from_tensors( + tensors: Sequence[torch.Tensor], size_divisibility: int = 0, pad_value: float = 0.0 + ) -> "ImageList": + """ + Args: + tensors: a tuple or list of `torch.Tensors`, each of shape (Hi, Wi) or + (C_1, ..., C_K, Hi, Wi) where K >= 1. The Tensors will be padded + to the same shape with `pad_value`. + size_divisibility (int): If `size_divisibility > 0`, add padding to ensure + the common height and width is divisible by `size_divisibility`. + This depends on the model and many models need a divisibility of 32. + pad_value (float): value to pad + + Returns: + an `ImageList`. + """ + assert len(tensors) > 0 + assert isinstance(tensors, (tuple, list)) + for t in tensors: + assert isinstance(t, torch.Tensor), type(t) + assert t.shape[1:-2] == tensors[0].shape[1:-2], t.shape + # per dimension maximum (H, W) or (C_1, ..., C_K, H, W) where K >= 1 among all tensors + max_size = ( + # In tracing mode, x.shape[i] is Tensor, and should not be converted + # to int: this will cause the traced graph to have hard-coded shapes. + # Instead we should make max_size a Tensor that depends on these tensors. + # Using torch.stack twice seems to be the best way to convert + # list[list[ScalarTensor]] to a Tensor + torch.stack( + [ + torch.stack([torch.as_tensor(dim) for dim in size]) + for size in [tuple(img.shape) for img in tensors] + ] + ) + .max(0) + .values + ) + + if size_divisibility > 0: + stride = size_divisibility + # the last two dims are H,W, both subject to divisibility requirement + max_size = torch.cat([max_size[:-2], (max_size[-2:] + (stride - 1)) // stride * stride]) + + image_sizes = [tuple(im.shape[-2:]) for im in tensors] + + if len(tensors) == 1: + # This seems slightly (2%) faster. + # TODO: check whether it's faster for multiple images as well + image_size = image_sizes[0] + padding_size = [0, max_size[-1] - image_size[1], 0, max_size[-2] - image_size[0]] + if all(x == 0 for x in padding_size): # https://github.com/pytorch/pytorch/issues/31734 + batched_imgs = tensors[0].unsqueeze(0) + else: + padded = F.pad(tensors[0], padding_size, value=pad_value) + batched_imgs = padded.unsqueeze_(0) + else: + # max_size can be a tensor in tracing mode, therefore use tuple() + batch_shape = (len(tensors),) + tuple(max_size) + batched_imgs = tensors[0].new_full(batch_shape, pad_value) + for img, pad_img in zip(tensors, batched_imgs): + pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img) + + return ImageList(batched_imgs.contiguous(), image_sizes) diff --git a/detectron2/structures/instances.py b/detectron2/structures/instances.py new file mode 100644 index 0000000000000000000000000000000000000000..373de08c01517c0f78b14d94da7ff702daaf375d --- /dev/null +++ b/detectron2/structures/instances.py @@ -0,0 +1,185 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import itertools +from typing import Any, Dict, List, Tuple, Union +import torch + + +class Instances: + """ + This class represents a list of instances in an image. + It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields". 
+ All fields must have the same ``__len__`` which is the number of instances. + + All other (non-field) attributes of this class are considered private: + they must start with '_' and are not modifiable by a user. + + Some basic usage: + + 1. Set/Get a field: + + .. code-block:: python + + instances.gt_boxes = Boxes(...) + print(instances.pred_masks) # a tensor of shape (N, H, W) + print('gt_masks' in instances) + + 2. ``len(instances)`` returns the number of instances + 3. Indexing: ``instances[indices]`` will apply the indexing on all the fields + and returns a new :class:`Instances`. + Typically, ``indices`` is a integer vector of indices, + or a binary mask of length ``num_instances``, + """ + + def __init__(self, image_size: Tuple[int, int], **kwargs: Any): + """ + Args: + image_size (height, width): the spatial size of the image. + kwargs: fields to add to this `Instances`. + """ + self._image_size = image_size + self._fields: Dict[str, Any] = {} + for k, v in kwargs.items(): + self.set(k, v) + + @property + def image_size(self) -> Tuple[int, int]: + """ + Returns: + tuple: height, width + """ + return self._image_size + + def __setattr__(self, name: str, val: Any) -> None: + if name.startswith("_"): + super().__setattr__(name, val) + else: + self.set(name, val) + + def __getattr__(self, name: str) -> Any: + if name == "_fields" or name not in self._fields: + raise AttributeError("Cannot find field '{}' in the given Instances!".format(name)) + return self._fields[name] + + def set(self, name: str, value: Any) -> None: + """ + Set the field named `name` to `value`. + The length of `value` must be the number of instances, + and must agree with other existing fields in this object. + """ + data_len = len(value) + if len(self._fields): + assert ( + len(self) == data_len + ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self)) + self._fields[name] = value + + def has(self, name: str) -> bool: + """ + Returns: + bool: whether the field called `name` exists. + """ + return name in self._fields + + def remove(self, name: str) -> None: + """ + Remove the field called `name`. + """ + del self._fields[name] + + def get(self, name: str) -> Any: + """ + Returns the field called `name`. + """ + return self._fields[name] + + def get_fields(self) -> Dict[str, Any]: + """ + Returns: + dict: a dict which maps names (str) to data of the fields + + Modifying the returned dict will modify this instance. + """ + return self._fields + + # Tensor-like methods + def to(self, device: str) -> "Instances": + """ + Returns: + Instances: all fields are called with a `to(device)`, if the field has this method. + """ + ret = Instances(self._image_size) + for k, v in self._fields.items(): + if hasattr(v, "to"): + v = v.to(device) + ret.set(k, v) + return ret + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances": + """ + Args: + item: an index-like object and will be used to index all the fields. + + Returns: + If `item` is a string, return the data in the corresponding field. + Otherwise, returns an `Instances` where all fields are indexed by `item`. 
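A short sketch of how the `Instances` container defined above is typically used: set a few same-length fields, then filter all of them at once with a boolean mask. Field names such as `pred_boxes`, `scores`, and `pred_classes` are conventions only and are not enforced by the class:

```python
import torch
from detectron2.structures.boxes import Boxes
from detectron2.structures.instances import Instances

inst = Instances(image_size=(480, 640))
inst.pred_boxes = Boxes(torch.tensor([[0.0, 0.0, 10.0, 10.0],
                                      [5.0, 5.0, 20.0, 20.0],
                                      [2.0, 2.0, 4.0, 4.0]]))
inst.scores = torch.tensor([0.90, 0.20, 0.75])
inst.pred_classes = torch.tensor([0, 1, 0])

keep = inst.scores > 0.5       # boolean mask over the 3 instances
filtered = inst[keep]          # the same mask is applied to every field
print(len(filtered))           # 2
print(filtered.pred_classes)   # tensor([0, 0])
print(filtered.has("scores"))  # True
```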
+ """ + if type(item) == int: + if item >= len(self) or item < -len(self): + raise IndexError("Instances index out of range!") + else: + item = slice(item, None, len(self)) + + ret = Instances(self._image_size) + for k, v in self._fields.items(): + ret.set(k, v[item]) + return ret + + def __len__(self) -> int: + for v in self._fields.values(): + return len(v) + raise NotImplementedError("Empty Instances does not support __len__!") + + def __iter__(self): + raise NotImplementedError("`Instances` object is not iterable!") + + @staticmethod + def cat(instance_lists: List["Instances"]) -> "Instances": + """ + Args: + instance_lists (list[Instances]) + + Returns: + Instances + """ + assert all(isinstance(i, Instances) for i in instance_lists) + assert len(instance_lists) > 0 + if len(instance_lists) == 1: + return instance_lists[0] + + image_size = instance_lists[0].image_size + for i in instance_lists[1:]: + assert i.image_size == image_size + ret = Instances(image_size) + for k in instance_lists[0]._fields.keys(): + values = [i.get(k) for i in instance_lists] + v0 = values[0] + if isinstance(v0, torch.Tensor): + values = torch.cat(values, dim=0) + elif isinstance(v0, list): + values = list(itertools.chain(*values)) + elif hasattr(type(v0), "cat"): + values = type(v0).cat(values) + else: + raise ValueError("Unsupported type {} for concatenation".format(type(v0))) + ret.set(k, values) + return ret + + def __str__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self)) + s += "image_height={}, ".format(self._image_size[0]) + s += "image_width={}, ".format(self._image_size[1]) + s += "fields=[{}])".format(", ".join((f"{k}: {v}" for k, v in self._fields.items()))) + return s + + __repr__ = __str__ diff --git a/detectron2/structures/keypoints.py b/detectron2/structures/keypoints.py new file mode 100644 index 0000000000000000000000000000000000000000..2242815f31dfe88aaabbf4b49f724c999a71912d --- /dev/null +++ b/detectron2/structures/keypoints.py @@ -0,0 +1,209 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +from typing import Any, List, Tuple, Union +import torch + +from detectron2.layers import interpolate + + +class Keypoints: + """ + Stores keypoint annotation data. GT Instances have a `gt_keypoints` property + containing the x,y location and visibility flag of each keypoint. This tensor has shape + (N, K, 3) where N is the number of instances and K is the number of keypoints per instance. + + The visibility flag follows the COCO format and must be one of three integers: + * v=0: not labeled (in which case x=y=0) + * v=1: labeled but not visible + * v=2: labeled and visible + """ + + def __init__(self, keypoints: Union[torch.Tensor, np.ndarray, List[List[float]]]): + """ + Arguments: + keypoints: A Tensor, numpy array, or list of the x, y, and visibility of each keypoint. + The shape should be (N, K, 3) where N is the number of + instances, and K is the number of keypoints per instance. 
+ """ + device = keypoints.device if isinstance(keypoints, torch.Tensor) else torch.device("cpu") + keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=device) + assert keypoints.dim() == 3 and keypoints.shape[2] == 3, keypoints.shape + self.tensor = keypoints + + def __len__(self) -> int: + return self.tensor.size(0) + + def to(self, *args: Any, **kwargs: Any) -> "Keypoints": + return type(self)(self.tensor.to(*args, **kwargs)) + + @property + def device(self) -> torch.device: + return self.tensor.device + + def to_heatmap(self, boxes: torch.Tensor, heatmap_size: int) -> torch.Tensor: + """ + Arguments: + boxes: Nx4 tensor, the boxes to draw the keypoints to + + Returns: + heatmaps: + A tensor of shape (N, K) containing an integer spatial label + in the range [0, heatmap_size**2 - 1] for each keypoint in the input. + valid: + A tensor of shape (N, K) containing whether each keypoint is in the roi or not. + """ + return _keypoints_to_heatmap(self.tensor, boxes, heatmap_size) + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Keypoints": + """ + Create a new `Keypoints` by indexing on this `Keypoints`. + + The following usage are allowed: + + 1. `new_kpts = kpts[3]`: return a `Keypoints` which contains only one instance. + 2. `new_kpts = kpts[2:10]`: return a slice of key points. + 3. `new_kpts = kpts[vector]`, where vector is a torch.ByteTensor + with `length = len(kpts)`. Nonzero elements in the vector will be selected. + + Note that the returned Keypoints might share storage with this Keypoints, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return Keypoints([self.tensor[item]]) + return Keypoints(self.tensor[item]) + + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={})".format(len(self.tensor)) + return s + + +# TODO make this nicer, this is a direct translation from C2 (but removing the inner loop) +def _keypoints_to_heatmap( + keypoints: torch.Tensor, rois: torch.Tensor, heatmap_size: int +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Encode keypoint locations into a target heatmap for use in SoftmaxWithLoss across space. + + Maps keypoints from the half-open interval [x1, x2) on continuous image coordinates to the + closed interval [0, heatmap_size - 1] on discrete image coordinates. We use the + continuous-discrete conversion from Heckbert 1990 ("What is the coordinate of a pixel?"): + d = floor(c) and c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate. + + Arguments: + keypoints: tensor of keypoint locations in of shape (N, K, 3). + rois: Nx4 tensor of rois in xyxy format + heatmap_size: integer side length of square heatmap. + + Returns: + heatmaps: A tensor of shape (N, K) containing an integer spatial label + in the range [0, heatmap_size**2 - 1] for each keypoint in the input. + valid: A tensor of shape (N, K) containing whether each keypoint is in + the roi or not. 
+ """ + + if rois.numel() == 0: + return rois.new().long(), rois.new().long() + offset_x = rois[:, 0] + offset_y = rois[:, 1] + scale_x = heatmap_size / (rois[:, 2] - rois[:, 0]) + scale_y = heatmap_size / (rois[:, 3] - rois[:, 1]) + + offset_x = offset_x[:, None] + offset_y = offset_y[:, None] + scale_x = scale_x[:, None] + scale_y = scale_y[:, None] + + x = keypoints[..., 0] + y = keypoints[..., 1] + + x_boundary_inds = x == rois[:, 2][:, None] + y_boundary_inds = y == rois[:, 3][:, None] + + x = (x - offset_x) * scale_x + x = x.floor().long() + y = (y - offset_y) * scale_y + y = y.floor().long() + + x[x_boundary_inds] = heatmap_size - 1 + y[y_boundary_inds] = heatmap_size - 1 + + valid_loc = (x >= 0) & (y >= 0) & (x < heatmap_size) & (y < heatmap_size) + vis = keypoints[..., 2] > 0 + valid = (valid_loc & vis).long() + + lin_ind = y * heatmap_size + x + heatmaps = lin_ind * valid + + return heatmaps, valid + + +@torch.no_grad() +def heatmaps_to_keypoints(maps: torch.Tensor, rois: torch.Tensor) -> torch.Tensor: + """ + Extract predicted keypoint locations from heatmaps. + + Args: + maps (Tensor): (#ROIs, #keypoints, POOL_H, POOL_W). The predicted heatmap of logits for + each ROI and each keypoint. + rois (Tensor): (#ROIs, 4). The box of each ROI. + + Returns: + Tensor of shape (#ROIs, #keypoints, 4) with the last dimension corresponding to + (x, y, logit, score) for each keypoint. + + When converting discrete pixel indices in an NxN image to a continuous keypoint coordinate, + we maintain consistency with :meth:`Keypoints.to_heatmap` by using the conversion from + Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a continuous coordinate. + """ + offset_x = rois[:, 0] + offset_y = rois[:, 1] + + widths = (rois[:, 2] - rois[:, 0]).clamp(min=1) + heights = (rois[:, 3] - rois[:, 1]).clamp(min=1) + widths_ceil = widths.ceil() + heights_ceil = heights.ceil() + + num_rois, num_keypoints = maps.shape[:2] + xy_preds = maps.new_zeros(rois.shape[0], num_keypoints, 4) + + width_corrections = widths / widths_ceil + height_corrections = heights / heights_ceil + + keypoints_idx = torch.arange(num_keypoints, device=maps.device) + + for i in range(num_rois): + outsize = (int(heights_ceil[i]), int(widths_ceil[i])) + roi_map = interpolate(maps[[i]], size=outsize, mode="bicubic", align_corners=False).squeeze( + 0 + ) # #keypoints x H x W + + # softmax over the spatial region + max_score, _ = roi_map.view(num_keypoints, -1).max(1) + max_score = max_score.view(num_keypoints, 1, 1) + tmp_full_resolution = (roi_map - max_score).exp_() + tmp_pool_resolution = (maps[i] - max_score).exp_() + # Produce scores over the region H x W, but normalize with POOL_H x POOL_W, + # so that the scores of objects of different absolute sizes will be more comparable + roi_map_scores = tmp_full_resolution / tmp_pool_resolution.sum((1, 2), keepdim=True) + + w = roi_map.shape[2] + pos = roi_map.view(num_keypoints, -1).argmax(1) + + x_int = pos % w + y_int = (pos - x_int) // w + + assert ( + roi_map_scores[keypoints_idx, y_int, x_int] + == roi_map_scores.view(num_keypoints, -1).max(1)[0] + ).all() + + x = (x_int.float() + 0.5) * width_corrections[i] + y = (y_int.float() + 0.5) * height_corrections[i] + + xy_preds[i, :, 0] = x + offset_x[i] + xy_preds[i, :, 1] = y + offset_y[i] + xy_preds[i, :, 2] = roi_map[keypoints_idx, y_int, x_int] + xy_preds[i, :, 3] = roi_map_scores[keypoints_idx, y_int, x_int] + + return xy_preds diff --git a/detectron2/structures/masks.py b/detectron2/structures/masks.py new file mode 100644 
index 0000000000000000000000000000000000000000..e363baf3d8cfc4694558fc12bbd2e9d65507b9d9 --- /dev/null +++ b/detectron2/structures/masks.py @@ -0,0 +1,424 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import itertools +import numpy as np +from typing import Any, Iterator, List, Union +import pycocotools.mask as mask_utils +import torch + +from detectron2.layers.roi_align import ROIAlign + +from .boxes import Boxes + + +def polygon_area(x, y): + # Using the shoelace formula + # https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1))) + + +def polygons_to_bitmask(polygons: List[np.ndarray], height: int, width: int) -> np.ndarray: + """ + Args: + polygons (list[ndarray]): each array has shape (Nx2,) + height, width (int) + + Returns: + ndarray: a bool mask of shape (height, width) + """ + assert len(polygons) > 0, "COCOAPI does not support empty polygons" + rles = mask_utils.frPyObjects(polygons, height, width) + rle = mask_utils.merge(rles) + return mask_utils.decode(rle).astype(np.bool) + + +def rasterize_polygons_within_box( + polygons: List[np.ndarray], box: np.ndarray, mask_size: int +) -> torch.Tensor: + """ + Rasterize the polygons into a mask image and + crop the mask content in the given box. + The cropped mask is resized to (mask_size, mask_size). + + This function is used when generating training targets for mask head in Mask R-CNN. + Given original ground-truth masks for an image, new ground-truth mask + training targets in the size of `mask_size x mask_size` + must be provided for each predicted box. This function will be called to + produce such targets. + + Args: + polygons (list[ndarray[float]]): a list of polygons, which represents an instance. + box: 4-element numpy array + mask_size (int): + + Returns: + Tensor: BoolTensor of shape (mask_size, mask_size) + """ + # 1. Shift the polygons w.r.t the boxes + w, h = box[2] - box[0], box[3] - box[1] + + polygons = copy.deepcopy(polygons) + for p in polygons: + p[0::2] = p[0::2] - box[0] + p[1::2] = p[1::2] - box[1] + + # 2. Rescale the polygons to the new box size + # max() to avoid division by small number + ratio_h = mask_size / max(h, 0.1) + ratio_w = mask_size / max(w, 0.1) + + if ratio_h == ratio_w: + for p in polygons: + p *= ratio_h + else: + for p in polygons: + p[0::2] *= ratio_w + p[1::2] *= ratio_h + + # 3. Rasterize the polygons with coco api + mask = polygons_to_bitmask(polygons, mask_size, mask_size) + mask = torch.from_numpy(mask) + return mask + + +class BitMasks: + """ + This class stores the segmentation masks for all objects in one image, in + the form of bitmaps. + + Attributes: + tensor: bool Tensor of N,H,W, representing N instances in the image. + """ + + def __init__(self, tensor: Union[torch.Tensor, np.ndarray]): + """ + Args: + tensor: bool Tensor of N,H,W, representing N instances in the image. 
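A brief sketch of the polygon-to-bitmask path above (it requires pycocotools, as in the import at the top of this file). The exact pixel count of the rasterized square follows COCO's rasterization rule, so treat the printed value as indicative:

```python
import numpy as np
import torch
from detectron2.structures.masks import BitMasks, polygons_to_bitmask

# One instance made of a single 4x4 square polygon, flattened as [x0, y0, x1, y1, ...].
square = [np.array([1.0, 1.0, 5.0, 1.0, 5.0, 5.0, 1.0, 5.0])]
mask = polygons_to_bitmask(square, height=8, width=8)  # (8, 8) bool ndarray

bitmasks = BitMasks(torch.from_numpy(mask)[None])      # add the leading instance dimension
print(len(bitmasks), bitmasks.tensor.shape)            # 1 torch.Size([1, 8, 8])
print(bitmasks.nonempty())                             # tensor([True])
print(int(mask.sum()))                                 # roughly 16 pixels for the 4x4 square
```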
+ """ + device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.bool, device=device) + assert tensor.dim() == 3, tensor.size() + self.image_size = tensor.shape[1:] + self.tensor = tensor + + def to(self, device: str) -> "BitMasks": + return BitMasks(self.tensor.to(device)) + + @property + def device(self) -> torch.device: + return self.tensor.device + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "BitMasks": + """ + Returns: + BitMasks: Create a new :class:`BitMasks` by indexing. + + The following usage are allowed: + + 1. `new_masks = masks[3]`: return a `BitMasks` which contains only one mask. + 2. `new_masks = masks[2:10]`: return a slice of masks. + 3. `new_masks = masks[vector]`, where vector is a torch.BoolTensor + with `length = len(masks)`. Nonzero elements in the vector will be selected. + + Note that the returned object might share storage with this object, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return BitMasks(self.tensor[item].view(1, -1)) + m = self.tensor[item] + assert m.dim() == 3, "Indexing on BitMasks with {} returns a tensor with shape {}!".format( + item, m.shape + ) + return BitMasks(m) + + def __iter__(self) -> torch.Tensor: + yield from self.tensor + + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={})".format(len(self.tensor)) + return s + + def __len__(self) -> int: + return self.tensor.shape[0] + + def nonempty(self) -> torch.Tensor: + """ + Find masks that are non-empty. + + Returns: + Tensor: a BoolTensor which represents + whether each mask is empty (False) or non-empty (True). + """ + return self.tensor.flatten(1).any(dim=1) + + @staticmethod + def from_polygon_masks( + polygon_masks: Union["PolygonMasks", List[List[np.ndarray]]], height: int, width: int + ) -> "BitMasks": + """ + Args: + polygon_masks (list[list[ndarray]] or PolygonMasks) + height, width (int) + """ + if isinstance(polygon_masks, PolygonMasks): + polygon_masks = polygon_masks.polygons + masks = [polygons_to_bitmask(p, height, width) for p in polygon_masks] + return BitMasks(torch.stack([torch.from_numpy(x) for x in masks])) + + def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor: + """ + Crop each bitmask by the given box, and resize results to (mask_size, mask_size). + This can be used to prepare training targets for Mask R-CNN. + It has less reconstruction error compared to rasterization with polygons. + However we observe no difference in accuracy, + but BitMasks requires more memory to store all the masks. + + Args: + boxes (Tensor): Nx4 tensor storing the boxes for each mask + mask_size (int): the size of the rasterized mask. + + Returns: + Tensor: + A bool tensor of shape (N, mask_size, mask_size), where + N is the number of predicted boxes for this image. 
+ """ + assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self)) + device = self.tensor.device + + batch_inds = torch.arange(len(boxes), device=device).to(dtype=boxes.dtype)[:, None] + rois = torch.cat([batch_inds, boxes], dim=1) # Nx5 + + bit_masks = self.tensor.to(dtype=torch.float32) + rois = rois.to(device=device) + output = ( + ROIAlign((mask_size, mask_size), 1.0, 0, aligned=True) + .forward(bit_masks[:, None, :, :], rois) + .squeeze(1) + ) + output = output >= 0.5 + return output + + def get_bounding_boxes(self) -> None: + # not needed now + raise NotImplementedError + + @staticmethod + def cat(bitmasks_list: List["BitMasks"]) -> "BitMasks": + """ + Concatenates a list of BitMasks into a single BitMasks + + Arguments: + bitmasks_list (list[BitMasks]) + + Returns: + BitMasks: the concatenated BitMasks + """ + assert isinstance(bitmasks_list, (list, tuple)) + assert len(bitmasks_list) > 0 + assert all(isinstance(bitmask, BitMasks) for bitmask in bitmasks_list) + + cat_bitmasks = type(bitmasks_list[0])(torch.cat([bm.tensor for bm in bitmasks_list], dim=0)) + return cat_bitmasks + + +class PolygonMasks: + """ + This class stores the segmentation masks for all objects in one image, in the form of polygons. + + Attributes: + polygons: list[list[ndarray]]. Each ndarray is a float64 vector representing a polygon. + """ + + def __init__(self, polygons: List[List[Union[torch.Tensor, np.ndarray]]]): + """ + Arguments: + polygons (list[list[np.ndarray]]): The first + level of the list correspond to individual instances, + the second level to all the polygons that compose the + instance, and the third level to the polygon coordinates. + The third level array should have the format of + [x0, y0, x1, y1, ..., xn, yn] (n >= 3). + """ + assert isinstance(polygons, list), ( + "Cannot create PolygonMasks: Expect a list of list of polygons per image. " + "Got '{}' instead.".format(type(polygons)) + ) + + def _make_array(t: Union[torch.Tensor, np.ndarray]) -> np.ndarray: + # Use float64 for higher precision, because why not? + # Always put polygons on CPU (self.to is a no-op) since they + # are supposed to be small tensors. + # May need to change this assumption if GPU placement becomes useful + if isinstance(t, torch.Tensor): + t = t.cpu().numpy() + return np.asarray(t).astype("float64") + + def process_polygons( + polygons_per_instance: List[Union[torch.Tensor, np.ndarray]] + ) -> List[np.ndarray]: + assert isinstance(polygons_per_instance, list), ( + "Cannot create polygons: Expect a list of polygons per instance. " + "Got '{}' instead.".format(type(polygons_per_instance)) + ) + # transform the polygon to a tensor + polygons_per_instance = [_make_array(p) for p in polygons_per_instance] + for polygon in polygons_per_instance: + assert len(polygon) % 2 == 0 and len(polygon) >= 6 + return polygons_per_instance + + self.polygons: List[List[np.ndarray]] = [ + process_polygons(polygons_per_instance) for polygons_per_instance in polygons + ] + + def to(self, *args: Any, **kwargs: Any) -> "PolygonMasks": + return self + + @property + def device(self) -> torch.device: + return torch.device("cpu") + + def get_bounding_boxes(self) -> Boxes: + """ + Returns: + Boxes: tight bounding boxes around polygon masks. 
+ """ + boxes = torch.zeros(len(self.polygons), 4, dtype=torch.float32) + for idx, polygons_per_instance in enumerate(self.polygons): + minxy = torch.as_tensor([float("inf"), float("inf")], dtype=torch.float32) + maxxy = torch.zeros(2, dtype=torch.float32) + for polygon in polygons_per_instance: + coords = torch.from_numpy(polygon).view(-1, 2).to(dtype=torch.float32) + minxy = torch.min(minxy, torch.min(coords, dim=0).values) + maxxy = torch.max(maxxy, torch.max(coords, dim=0).values) + boxes[idx, :2] = minxy + boxes[idx, 2:] = maxxy + return Boxes(boxes) + + def nonempty(self) -> torch.Tensor: + """ + Find masks that are non-empty. + + Returns: + Tensor: + a BoolTensor which represents whether each mask is empty (False) or not (True). + """ + keep = [1 if len(polygon) > 0 else 0 for polygon in self.polygons] + return torch.from_numpy(np.asarray(keep, dtype=np.bool)) + + def __getitem__(self, item: Union[int, slice, List[int], torch.BoolTensor]) -> "PolygonMasks": + """ + Support indexing over the instances and return a `PolygonMasks` object. + `item` can be: + + 1. An integer. It will return an object with only one instance. + 2. A slice. It will return an object with the selected instances. + 3. A list[int]. It will return an object with the selected instances, + correpsonding to the indices in the list. + 4. A vector mask of type BoolTensor, whose length is num_instances. + It will return an object with the instances whose mask is nonzero. + """ + if isinstance(item, int): + selected_polygons = [self.polygons[item]] + elif isinstance(item, slice): + selected_polygons = self.polygons[item] + elif isinstance(item, list): + selected_polygons = [self.polygons[i] for i in item] + elif isinstance(item, torch.Tensor): + # Polygons is a list, so we have to move the indices back to CPU. + if item.dtype == torch.bool: + assert item.dim() == 1, item.shape + item = item.nonzero().squeeze(1).cpu().numpy().tolist() + elif item.dtype in [torch.int32, torch.int64]: + item = item.cpu().numpy().tolist() + else: + raise ValueError("Unsupported tensor dtype={} for indexing!".format(item.dtype)) + selected_polygons = [self.polygons[i] for i in item] + return PolygonMasks(selected_polygons) + + def __iter__(self) -> Iterator[List[np.ndarray]]: + """ + Yields: + list[ndarray]: the polygons for one instance. + Each Tensor is a float64 vector representing a polygon. + """ + return iter(self.polygons) + + def __repr__(self) -> str: + s = self.__class__.__name__ + "(" + s += "num_instances={})".format(len(self.polygons)) + return s + + def __len__(self) -> int: + return len(self.polygons) + + def crop_and_resize(self, boxes: torch.Tensor, mask_size: int) -> torch.Tensor: + """ + Crop each mask by the given box, and resize results to (mask_size, mask_size). + This can be used to prepare training targets for Mask R-CNN. + + Args: + boxes (Tensor): Nx4 tensor storing the boxes for each mask + mask_size (int): the size of the rasterized mask. + + Returns: + Tensor: A bool tensor of shape (N, mask_size, mask_size), where + N is the number of predicted boxes for this image. 
+ """ + assert len(boxes) == len(self), "{} != {}".format(len(boxes), len(self)) + + device = boxes.device + # Put boxes on the CPU, as the polygon representation is not efficient GPU-wise + # (several small tensors for representing a single instance mask) + boxes = boxes.to(torch.device("cpu")) + + results = [ + rasterize_polygons_within_box(poly, box.numpy(), mask_size) + for poly, box in zip(self.polygons, boxes) + ] + """ + poly: list[list[float]], the polygons for one instance + box: a tensor of shape (4,) + """ + if len(results) == 0: + return torch.empty(0, mask_size, mask_size, dtype=torch.bool, device=device) + return torch.stack(results, dim=0).to(device=device) + + def area(self): + """ + Computes area of the mask. + Only works with Polygons, using the shoelace formula: + https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates + + Returns: + Tensor: a vector, area for each instance + """ + + area = [] + for polygons_per_instance in self.polygons: + area_per_instance = 0 + for p in polygons_per_instance: + area_per_instance += polygon_area(p[0::2], p[1::2]) + area.append(area_per_instance) + + return torch.tensor(area) + + @staticmethod + def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks": + """ + Concatenates a list of PolygonMasks into a single PolygonMasks + + Arguments: + polymasks_list (list[PolygonMasks]) + + Returns: + PolygonMasks: the concatenated PolygonMasks + """ + assert isinstance(polymasks_list, (list, tuple)) + assert len(polymasks_list) > 0 + assert all(isinstance(polymask, PolygonMasks) for polymask in polymasks_list) + + cat_polymasks = type(polymasks_list[0])( + list(itertools.chain.from_iterable(pm.polygons for pm in polymasks_list)) + ) + return cat_polymasks diff --git a/detectron2/structures/rotated_boxes.py b/detectron2/structures/rotated_boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..823cfb62a13d0ff060099d1b930bc900a4ca009b --- /dev/null +++ b/detectron2/structures/rotated_boxes.py @@ -0,0 +1,481 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +from typing import Iterator, Union +import torch + +from detectron2.layers.rotated_boxes import pairwise_iou_rotated + +from .boxes import Boxes + + +class RotatedBoxes(Boxes): + """ + This structure stores a list of rotated boxes as a Nx5 torch.Tensor. + It supports some common methods about boxes + (`area`, `clip`, `nonempty`, etc), + and also behaves like a Tensor + (support indexing, `to(device)`, `.device`, and iteration over all boxes) + """ + + def __init__(self, tensor: torch.Tensor): + """ + Args: + tensor (Tensor[float]): a Nx5 matrix. Each row is + (x_center, y_center, width, height, angle), + in which angle is represented in degrees. + While there's no strict range restriction for it, + the recommended principal range is between [-180, 180) degrees. + + Assume we have a horizontal box B = (x_center, y_center, width, height), + where width is along the x-axis and height is along the y-axis. + The rotated box B_rot (x_center, y_center, width, height, angle) + can be seen as: + + 1. When angle == 0: + B_rot == B + 2. When angle > 0: + B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CCW; + 3. When angle < 0: + B_rot is obtained by rotating B w.r.t its center by :math:`|angle|` degrees CW. 
+ + Mathematically, since the right-handed coordinate system for image space + is (y, x), where y is top->down and x is left->right, the 4 vertices of the + rotated rectangle :math:`(yr_i, xr_i)` (i = 1, 2, 3, 4) can be obtained from + the vertices of the horizontal rectangle (y_i, x_i) (i = 1, 2, 3, 4) + in the following way (:math:`\\theta = angle*\\pi/180` is the angle in radians, + (y_c, x_c) is the center of the rectangle): + + .. math:: + + yr_i = \\cos(\\theta) (y_i - y_c) - \\sin(\\theta) (x_i - x_c) + y_c, + + xr_i = \\sin(\\theta) (y_i - y_c) + \\cos(\\theta) (x_i - x_c) + x_c, + + which is the standard rigid-body rotation transformation. + + Intuitively, the angle is + (1) the rotation angle from y-axis in image space + to the height vector (top->down in the box's local coordinate system) + of the box in CCW, and + (2) the rotation angle from x-axis in image space + to the width vector (left->right in the box's local coordinate system) + of the box in CCW. + + More intuitively, consider the following horizontal box ABCD represented + in (x1, y1, x2, y2): (3, 2, 7, 4), + covering the [3, 7] x [2, 4] region of the continuous coordinate system + which looks like this: + + .. code:: none + + O--------> x + | + | A---B + | | | + | D---C + | + v y + + Note that each capital letter represents one 0-dimensional geometric point + instead of a 'square pixel' here. + + In the example above, using (x, y) to represent a point we have: + + .. math:: + + O = (0, 0), A = (3, 2), B = (7, 2), C = (7, 4), D = (3, 4) + + We name vector AB = vector DC as the width vector in box's local coordinate system, and + vector AD = vector BC as the height vector in box's local coordinate system. Initially, + when angle = 0 degree, they're aligned with the positive directions of x-axis and y-axis + in the image space, respectively. + + For better illustration, we denote the center of the box as E, + + .. code:: none + + O--------> x + | + | A---B + | | E | + | D---C + | + v y + + where the center E = ((3+7)/2, (2+4)/2) = (5, 3). + + Also, + + .. math:: + + width = |AB| = |CD| = 7 - 3 = 4, + height = |AD| = |BC| = 4 - 2 = 2. + + Therefore, the corresponding representation for the same shape in rotated box in + (x_center, y_center, width, height, angle) format is: + + (5, 3, 4, 2, 0), + + Now, let's consider (5, 3, 4, 2, 90), which is rotated by 90 degrees + CCW (counter-clockwise) by definition. It looks like this: + + .. code:: none + + O--------> x + | B-C + | | | + | |E| + | | | + | A-D + v y + + The center E is still located at the same point (5, 3), while the vertices + ABCD are rotated by 90 degrees CCW with regard to E: + A = (4, 5), B = (4, 1), C = (6, 1), D = (6, 5) + + Here, 90 degrees can be seen as the CCW angle to rotate from y-axis to + vector AD or vector BC (the top->down height vector in box's local coordinate system), + or the CCW angle to rotate from x-axis to vector AB or vector DC (the left->right + width vector in box's local coordinate system). + + .. math:: + + width = |AB| = |CD| = 5 - 1 = 4, + height = |AD| = |BC| = 6 - 4 = 2. + + Next, how about (5, 3, 4, 2, -90), which is rotated by 90 degrees CW (clockwise) + by definition? It looks like this: + + .. code:: none + + O--------> x + | D-A + | | | + | |E| + | | | + | C-B + v y + + The center E is still located at the same point (5, 3), while the vertices + ABCD are rotated by 90 degrees CW with regard to E: + A = (6, 1), B = (6, 5), C = (4, 5), D = (4, 1) + + .. 
math:: + + width = |AB| = |CD| = 5 - 1 = 4, + height = |AD| = |BC| = 6 - 4 = 2. + + This covers exactly the same region as (5, 3, 4, 2, 90) does, and their IoU + will be 1. However, these two will generate different RoI Pooling results and + should not be treated as an identical box. + + On the other hand, it's easy to see that (X, Y, W, H, A) is identical to + (X, Y, W, H, A+360N), for any integer N. For example (5, 3, 4, 2, 270) would be + identical to (5, 3, 4, 2, -90), because rotating the shape 270 degrees CCW is + equivalent to rotating the same shape 90 degrees CW. + + We could rotate further to get (5, 3, 4, 2, 180), or (5, 3, 4, 2, -180): + + .. code:: none + + O--------> x + | + | C---D + | | E | + | B---A + | + v y + + .. math:: + + A = (7, 4), B = (3, 4), C = (3, 2), D = (7, 2), + + width = |AB| = |CD| = 7 - 3 = 4, + height = |AD| = |BC| = 4 - 2 = 2. + + Finally, this is a very inaccurate (heavily quantized) illustration of + how (5, 3, 4, 2, 60) looks like in case anyone wonders: + + .. code:: none + + O--------> x + | B\ + | / C + | /E / + | A / + | `D + v y + + It's still a rectangle with center of (5, 3), width of 4 and height of 2, + but its angle (and thus orientation) is somewhere between + (5, 3, 4, 2, 0) and (5, 3, 4, 2, 90). + """ + device = tensor.device if isinstance(tensor, torch.Tensor) else torch.device("cpu") + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) + if tensor.numel() == 0: + # Use reshape, so we don't end up creating a new tensor that does not depend on + # the inputs (and consequently confuses jit) + tensor = tensor.reshape((0, 5)).to(dtype=torch.float32, device=device) + assert tensor.dim() == 2 and tensor.size(-1) == 5, tensor.size() + + self.tensor = tensor + + def clone(self) -> "RotatedBoxes": + """ + Clone the RotatedBoxes. + + Returns: + RotatedBoxes + """ + return RotatedBoxes(self.tensor.clone()) + + def to(self, device: str) -> "RotatedBoxes": + return RotatedBoxes(self.tensor.to(device)) + + def area(self) -> torch.Tensor: + """ + Computes the area of all the boxes. + + Returns: + torch.Tensor: a vector with areas of each box. + """ + box = self.tensor + area = box[:, 2] * box[:, 3] + return area + + def normalize_angles(self) -> None: + """ + Restrict angles to the range of [-180, 180) degrees + """ + self.tensor[:, 4] = (self.tensor[:, 4] + 180.0) % 360.0 - 180.0 + + def clip(self, box_size: Boxes.BoxSizeType, clip_angle_threshold: float = 1.0) -> None: + """ + Clip (in place) the boxes by limiting x coordinates to the range [0, width] + and y coordinates to the range [0, height]. + + For RRPN: + Only clip boxes that are almost horizontal with a tolerance of + clip_angle_threshold to maintain backward compatibility. + + Rotated boxes beyond this threshold are not clipped for two reasons: + + 1. There are potentially multiple ways to clip a rotated box to make it + fit within the image. + 2. It's tricky to make the entire rectangular box fit within the image + and still be able to not leave out pixels of interest. + + Therefore we rely on ops like RoIAlignRotated to safely handle this. + + Args: + box_size (height, width): The clipping box's size. + clip_angle_threshold: + Iff. abs(normalized(angle)) <= clip_angle_threshold (in degrees), + we do the clipping as horizontal boxes. 
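The worked example in the `RotatedBoxes` docstring above, the box (5, 3, 4, 2, 90) with corners A=(4, 5), B=(4, 1), C=(6, 1), D=(6, 5), can be checked numerically. The helper below is a hypothetical illustration and not part of this patch; it just applies the rotation formula from the docstring:

```python
import math
import torch

def rotated_box_corners(cx, cy, w, h, angle_deg):
    # Hypothetical helper (not part of this patch): corner coordinates of a
    # (x_center, y_center, width, height, angle) box, using the convention above
    # that positive angles rotate the box CCW in image coordinates (y points down).
    t = math.radians(angle_deg)
    c, s = math.cos(t), math.sin(t)
    local = torch.tensor([[-w / 2, -h / 2],   # A
                          [ w / 2, -h / 2],   # B
                          [ w / 2,  h / 2],   # C
                          [-w / 2,  h / 2]])  # D
    rot = torch.tensor([[c, -s],
                        [s,  c]])             # row-vector form of the docstring's rotation
    return local @ rot + torch.tensor([cx, cy])

print(rotated_box_corners(5, 3, 4, 2, 0))
# [[3., 2.], [7., 2.], [7., 4.], [3., 4.]]  -> the horizontal ABCD example
print(rotated_box_corners(5, 3, 4, 2, 90))
# [[4., 5.], [4., 1.], [6., 1.], [6., 5.]]  -> the 90-degree CCW example (up to float noise)
```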
+ """ + h, w = box_size + + # normalize angles to be within (-180, 180] degrees + self.normalize_angles() + + idx = torch.where(torch.abs(self.tensor[:, 4]) <= clip_angle_threshold)[0] + + # convert to (x1, y1, x2, y2) + x1 = self.tensor[idx, 0] - self.tensor[idx, 2] / 2.0 + y1 = self.tensor[idx, 1] - self.tensor[idx, 3] / 2.0 + x2 = self.tensor[idx, 0] + self.tensor[idx, 2] / 2.0 + y2 = self.tensor[idx, 1] + self.tensor[idx, 3] / 2.0 + + # clip + x1.clamp_(min=0, max=w) + y1.clamp_(min=0, max=h) + x2.clamp_(min=0, max=w) + y2.clamp_(min=0, max=h) + + # convert back to (xc, yc, w, h) + self.tensor[idx, 0] = (x1 + x2) / 2.0 + self.tensor[idx, 1] = (y1 + y2) / 2.0 + # make sure widths and heights do not increase due to numerical errors + self.tensor[idx, 2] = torch.min(self.tensor[idx, 2], x2 - x1) + self.tensor[idx, 3] = torch.min(self.tensor[idx, 3], y2 - y1) + + def nonempty(self, threshold: float = 0.0) -> torch.Tensor: + """ + Find boxes that are non-empty. + A box is considered empty, if either of its side is no larger than threshold. + + Returns: + Tensor: a binary vector which represents + whether each box is empty (False) or non-empty (True). + """ + box = self.tensor + widths = box[:, 2] + heights = box[:, 3] + keep = (widths > threshold) & (heights > threshold) + return keep + + def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "RotatedBoxes": + """ + Returns: + RotatedBoxes: Create a new :class:`RotatedBoxes` by indexing. + + The following usage are allowed: + + 1. `new_boxes = boxes[3]`: return a `RotatedBoxes` which contains only one box. + 2. `new_boxes = boxes[2:10]`: return a slice of boxes. + 3. `new_boxes = boxes[vector]`, where vector is a torch.ByteTensor + with `length = len(boxes)`. Nonzero elements in the vector will be selected. + + Note that the returned RotatedBoxes might share storage with this RotatedBoxes, + subject to Pytorch's indexing semantics. + """ + if isinstance(item, int): + return RotatedBoxes(self.tensor[item].view(1, -1)) + b = self.tensor[item] + assert b.dim() == 2, "Indexing on RotatedBoxes with {} failed to return a matrix!".format( + item + ) + return RotatedBoxes(b) + + def __len__(self) -> int: + return self.tensor.shape[0] + + def __repr__(self) -> str: + return "RotatedBoxes(" + str(self.tensor) + ")" + + def inside_box(self, box_size: Boxes.BoxSizeType, boundary_threshold: int = 0) -> torch.Tensor: + """ + Args: + box_size (height, width): Size of the reference box covering + [0, width] x [0, height] + boundary_threshold (int): Boxes that extend beyond the reference box + boundary by more than boundary_threshold are considered "outside". + + For RRPN, it might not be necessary to call this function since it's common + for rotated box to extend to outside of the image boundaries + (the clip function only clips the near-horizontal boxes) + + Returns: + a binary vector, indicating whether each box is inside the reference box. 
+ """ + height, width = box_size + + cnt_x = self.tensor[..., 0] + cnt_y = self.tensor[..., 1] + half_w = self.tensor[..., 2] / 2.0 + half_h = self.tensor[..., 3] / 2.0 + a = self.tensor[..., 4] + c = torch.abs(torch.cos(a * math.pi / 180.0)) + s = torch.abs(torch.sin(a * math.pi / 180.0)) + # This basically computes the horizontal bounding rectangle of the rotated box + max_rect_dx = c * half_w + s * half_h + max_rect_dy = c * half_h + s * half_w + + inds_inside = ( + (cnt_x - max_rect_dx >= -boundary_threshold) + & (cnt_y - max_rect_dy >= -boundary_threshold) + & (cnt_x + max_rect_dx < width + boundary_threshold) + & (cnt_y + max_rect_dy < height + boundary_threshold) + ) + + return inds_inside + + def get_centers(self) -> torch.Tensor: + """ + Returns: + The box centers in a Nx2 array of (x, y). + """ + return self.tensor[:, :2] + + def scale(self, scale_x: float, scale_y: float) -> None: + """ + Scale the rotated box with horizontal and vertical scaling factors + Note: when scale_factor_x != scale_factor_y, + the rotated box does not preserve the rectangular shape when the angle + is not a multiple of 90 degrees under resize transformation. + Instead, the shape is a parallelogram (that has skew) + Here we make an approximation by fitting a rotated rectangle to the parallelogram. + """ + self.tensor[:, 0] *= scale_x + self.tensor[:, 1] *= scale_y + theta = self.tensor[:, 4] * math.pi / 180.0 + c = torch.cos(theta) + s = torch.sin(theta) + + # In image space, y is top->down and x is left->right + # Consider the local coordintate system for the rotated box, + # where the box center is located at (0, 0), and the four vertices ABCD are + # A(-w / 2, -h / 2), B(w / 2, -h / 2), C(w / 2, h / 2), D(-w / 2, h / 2) + # the midpoint of the left edge AD of the rotated box E is: + # E = (A+D)/2 = (-w / 2, 0) + # the midpoint of the top edge AB of the rotated box F is: + # F(0, -h / 2) + # To get the old coordinates in the global system, apply the rotation transformation + # (Note: the right-handed coordinate system for image space is yOx): + # (old_x, old_y) = (s * y + c * x, c * y - s * x) + # E(old) = (s * 0 + c * (-w/2), c * 0 - s * (-w/2)) = (-c * w / 2, s * w / 2) + # F(old) = (s * (-h / 2) + c * 0, c * (-h / 2) - s * 0) = (-s * h / 2, -c * h / 2) + # After applying the scaling factor (sfx, sfy): + # E(new) = (-sfx * c * w / 2, sfy * s * w / 2) + # F(new) = (-sfx * s * h / 2, -sfy * c * h / 2) + # The new width after scaling tranformation becomes: + + # w(new) = |E(new) - O| * 2 + # = sqrt[(sfx * c * w / 2)^2 + (sfy * s * w / 2)^2] * 2 + # = sqrt[(sfx * c)^2 + (sfy * s)^2] * w + # i.e., scale_factor_w = sqrt[(sfx * c)^2 + (sfy * s)^2] + # + # For example, + # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_w == scale_factor_x; + # when |angle| = 90, c = 0, |s| = 1, scale_factor_w == scale_factor_y + self.tensor[:, 2] *= torch.sqrt((scale_x * c) ** 2 + (scale_y * s) ** 2) + + # h(new) = |F(new) - O| * 2 + # = sqrt[(sfx * s * h / 2)^2 + (sfy * c * h / 2)^2] * 2 + # = sqrt[(sfx * s)^2 + (sfy * c)^2] * h + # i.e., scale_factor_h = sqrt[(sfx * s)^2 + (sfy * c)^2] + # + # For example, + # when angle = 0 or 180, |c| = 1, s = 0, scale_factor_h == scale_factor_y; + # when |angle| = 90, c = 0, |s| = 1, scale_factor_h == scale_factor_x + self.tensor[:, 3] *= torch.sqrt((scale_x * s) ** 2 + (scale_y * c) ** 2) + + # The angle is the rotation angle from y-axis in image space to the height + # vector (top->down in the box's local coordinate system) of the box in CCW. 
+ # + # angle(new) = angle_yOx(O - F(new)) + # = angle_yOx( (sfx * s * h / 2, sfy * c * h / 2) ) + # = atan2(sfx * s * h / 2, sfy * c * h / 2) + # = atan2(sfx * s, sfy * c) + # + # For example, + # when sfx == sfy, angle(new) == atan2(s, c) == angle(old) + self.tensor[:, 4] = torch.atan2(scale_x * s, scale_y * c) * 180 / math.pi + + @property + def device(self) -> str: + return self.tensor.device + + def __iter__(self) -> Iterator[torch.Tensor]: + """ + Yield a box as a Tensor of shape (5,) at a time. + """ + yield from self.tensor + + +def pairwise_iou(boxes1: RotatedBoxes, boxes2: RotatedBoxes) -> None: + """ + Given two lists of rotated boxes of size N and M, + compute the IoU (intersection over union) + between __all__ N x M pairs of boxes. + The box order must be (x_center, y_center, width, height, angle). + + Args: + boxes1, boxes2 (RotatedBoxes): + two `RotatedBoxes`. Contains N & M rotated boxes, respectively. + + Returns: + Tensor: IoU, sized [N,M]. + """ + + return pairwise_iou_rotated(boxes1.tensor, boxes2.tensor) diff --git a/detectron2/utils/README.md b/detectron2/utils/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9765b24a730b77556104187ac3ef5439ab0859fd --- /dev/null +++ b/detectron2/utils/README.md @@ -0,0 +1,5 @@ +# Utility functions + +This folder contain utility functions that are not used in the +core library, but are useful for building models or training +code using the config system. diff --git a/detectron2/utils/__init__.py b/detectron2/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec --- /dev/null +++ b/detectron2/utils/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved diff --git a/detectron2/utils/analysis.py b/detectron2/utils/analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..c48e376c242f57f480280538ae770520d14110f8 --- /dev/null +++ b/detectron2/utils/analysis.py @@ -0,0 +1,164 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# -*- coding: utf-8 -*- + +import logging +import typing +import torch +from fvcore.nn import activation_count, flop_count, parameter_count, parameter_count_table +from torch import nn + +from detectron2.structures import BitMasks, Boxes, ImageList, Instances + +from .logger import log_first_n + +__all__ = [ + "activation_count_operators", + "flop_count_operators", + "parameter_count_table", + "parameter_count", +] + +FLOPS_MODE = "flops" +ACTIVATIONS_MODE = "activations" + + +# some extra ops to ignore from counting. +_IGNORED_OPS = [ + "aten::add", + "aten::add_", + "aten::batch_norm", + "aten::constant_pad_nd", + "aten::div", + "aten::div_", + "aten::exp", + "aten::log2", + "aten::max_pool2d", + "aten::meshgrid", + "aten::mul", + "aten::mul_", + "aten::nonzero_numpy", + "aten::relu", + "aten::relu_", + "aten::rsub", + "aten::sigmoid", + "aten::sigmoid_", + "aten::softmax", + "aten::sort", + "aten::sqrt", + "aten::sub", + "aten::upsample_nearest2d", + "prim::PythonOp", + "torchvision::nms", +] + + +def flop_count_operators( + model: nn.Module, inputs: list, **kwargs +) -> typing.DefaultDict[str, float]: + """ + Implement operator-level flops counting using jit. + This is a wrapper of fvcore.nn.flop_count, that supports standard detection models + in detectron2. + + Note: + The function runs the input through the model to compute flops. 
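A hedged usage sketch of the `flop_count_operators` wrapper defined here. The `TinyDetector` module below is a made-up stand-in for a real detectron2 model; the only contract it honors is the one the wrapper relies on, namely a forward that accepts `list[dict]` with an `image` tensor and a batch size of 1:

```python
import torch
from torch import nn
from detectron2.utils.analysis import flop_count_operators  # module added by this patch

class TinyDetector(nn.Module):
    """Toy stand-in for a detectron2 model: consumes list[dict] with an 'image' tensor."""
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(3, 8, kernel_size=3, padding=1)

    def forward(self, batched_inputs):
        x = batched_inputs[0]["image"].unsqueeze(0).float()
        return self.conv(x)

model = TinyDetector().eval()
inputs = [{"image": torch.zeros(3, 64, 64)}]  # the wrapper asserts batch size 1
print(flop_count_operators(model, inputs))    # per-operator GFlops, e.g. {'conv': ...}
```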
+ The flops of a detection model is often input-dependent, for example, + the flops of box & mask head depends on the number of proposals & + the number of detected objects. + Therefore, the flops counting using a single input may not accurately + reflect the computation cost of a model. + + Args: + model: a detectron2 model that takes `list[dict]` as input. + inputs (list[dict]): inputs to model, in detectron2's standard format. + """ + return _wrapper_count_operators(model=model, inputs=inputs, mode=FLOPS_MODE, **kwargs) + + +def activation_count_operators( + model: nn.Module, inputs: list, **kwargs +) -> typing.DefaultDict[str, float]: + """ + Implement operator-level activations counting using jit. + This is a wrapper of fvcore.nn.activation_count, that supports standard detection models + in detectron2. + + Note: + The function runs the input through the model to compute activations. + The activations of a detection model is often input-dependent, for example, + the activations of box & mask head depends on the number of proposals & + the number of detected objects. + + Args: + model: a detectron2 model that takes `list[dict]` as input. + inputs (list[dict]): inputs to model, in detectron2's standard format. + """ + return _wrapper_count_operators(model=model, inputs=inputs, mode=ACTIVATIONS_MODE, **kwargs) + + +def _flatten_to_tuple(outputs): + result = [] + if isinstance(outputs, torch.Tensor): + result.append(outputs) + elif isinstance(outputs, (list, tuple)): + for v in outputs: + result.extend(_flatten_to_tuple(v)) + elif isinstance(outputs, dict): + for _, v in outputs.items(): + result.extend(_flatten_to_tuple(v)) + elif isinstance(outputs, Instances): + result.extend(_flatten_to_tuple(outputs.get_fields())) + elif isinstance(outputs, (Boxes, BitMasks, ImageList)): + result.append(outputs.tensor) + else: + log_first_n( + logging.WARN, + f"Output of type {type(outputs)} not included in flops/activations count.", + n=10, + ) + return tuple(result) + + +def _wrapper_count_operators( + model: nn.Module, inputs: list, mode: str, **kwargs +) -> typing.DefaultDict[str, float]: + + # ignore some ops + supported_ops = {k: lambda *args, **kwargs: {} for k in _IGNORED_OPS} + supported_ops.update(kwargs.pop("supported_ops", {})) + kwargs["supported_ops"] = supported_ops + + assert len(inputs) == 1, "Please use batch size=1" + tensor_input = inputs[0]["image"] + + class WrapModel(nn.Module): + def __init__(self, model): + super().__init__() + if isinstance( + model, (nn.parallel.distributed.DistributedDataParallel, nn.DataParallel) + ): + self.model = model.module + else: + self.model = model + + def forward(self, image): + # jit requires the input/output to be Tensors + inputs = [{"image": image}] + outputs = self.model.forward(inputs) + # Only the subgraph that computes the returned tuple of tensor will be + # counted. So we flatten everything we found to tuple of tensors. 
+ return _flatten_to_tuple(outputs) + + old_train = model.training + with torch.no_grad(): + if mode == FLOPS_MODE: + ret = flop_count(WrapModel(model).train(False), (tensor_input,), **kwargs) + elif mode == ACTIVATIONS_MODE: + ret = activation_count(WrapModel(model).train(False), (tensor_input,), **kwargs) + else: + raise NotImplementedError("Count for mode {} is not supported yet.".format(mode)) + # compatible with change in fvcore + if isinstance(ret, tuple): + ret = ret[0] + model.train(old_train) + return ret diff --git a/detectron2/utils/collect_env.py b/detectron2/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..c25b99cb0ab626cc4f4dabca5eb81f710011f2e3 --- /dev/null +++ b/detectron2/utils/collect_env.py @@ -0,0 +1,160 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import importlib +import numpy as np +import os +import re +import subprocess +import sys +from collections import defaultdict +import PIL +import torch +import torchvision +from tabulate import tabulate + +__all__ = ["collect_env_info"] + + +def collect_torch_env(): + try: + import torch.__config__ + + return torch.__config__.show() + except ImportError: + # compatible with older versions of pytorch + from torch.utils.collect_env import get_pretty_env_info + + return get_pretty_env_info() + + +def get_env_module(): + var_name = "DETECTRON2_ENV_MODULE" + return var_name, os.environ.get(var_name, "") + + +def detect_compute_compatibility(CUDA_HOME, so_file): + try: + cuobjdump = os.path.join(CUDA_HOME, "bin", "cuobjdump") + if os.path.isfile(cuobjdump): + output = subprocess.check_output( + "'{}' --list-elf '{}'".format(cuobjdump, so_file), shell=True + ) + output = output.decode("utf-8").strip().split("\n") + sm = [] + for line in output: + line = re.findall(r"\.sm_[0-9]*\.", line)[0] + sm.append(line.strip(".")) + sm = sorted(set(sm)) + return ", ".join(sm) + else: + return so_file + "; cannot find cuobjdump" + except Exception: + # unhandled failure + return so_file + + +def collect_env_info(): + has_cuda = torch.cuda.is_available() + # NOTE: the use of CUDA_HOME requires the CUDA build deps, though in + # theory detectron2 should be made runnable with only the CUDA runtime + from torch.utils.cpp_extension import CUDA_HOME + + data = [] + data.append(("sys.platform", sys.platform)) + data.append(("Python", sys.version.replace("\n", ""))) + data.append(("numpy", np.__version__)) + + try: + import detectron2 # noqa + + data.append( + ("detectron2", detectron2.__version__ + " @" + os.path.dirname(detectron2.__file__)) + ) + except ImportError: + data.append(("detectron2", "failed to import")) + else: + try: + from detectron2 import _C + except ImportError: + data.append(("detectron2._C", "failed to import")) + else: + data.append(("detectron2 compiler", _C.get_compiler_version())) + data.append(("detectron2 CUDA compiler", _C.get_cuda_version())) + if has_cuda: + data.append( + ("detectron2 arch flags", detect_compute_compatibility(CUDA_HOME, _C.__file__)) + ) + + data.append(get_env_module()) + data.append(("PyTorch", torch.__version__ + " @" + os.path.dirname(torch.__file__))) + data.append(("PyTorch debug build", torch.version.debug)) + + data.append(("CUDA available", has_cuda)) + if has_cuda: + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, devids in devices.items(): + data.append(("GPU " + ",".join(devids), name)) + + from torch.utils.cpp_extension import 
CUDA_HOME + + data.append(("CUDA_HOME", str(CUDA_HOME))) + + if CUDA_HOME is not None and os.path.isdir(CUDA_HOME): + try: + nvcc = os.path.join(CUDA_HOME, "bin", "nvcc") + nvcc = subprocess.check_output("'{}' -V | tail -n1".format(nvcc), shell=True) + nvcc = nvcc.decode("utf-8").strip() + except subprocess.SubprocessError: + nvcc = "Not Available" + data.append(("NVCC", nvcc)) + + cuda_arch_list = os.environ.get("TORCH_CUDA_ARCH_LIST", None) + if cuda_arch_list: + data.append(("TORCH_CUDA_ARCH_LIST", cuda_arch_list)) + data.append(("Pillow", PIL.__version__)) + + try: + data.append( + ( + "torchvision", + str(torchvision.__version__) + " @" + os.path.dirname(torchvision.__file__), + ) + ) + if has_cuda: + try: + torchvision_C = importlib.util.find_spec("torchvision._C").origin + msg = detect_compute_compatibility(CUDA_HOME, torchvision_C) + data.append(("torchvision arch flags", msg)) + except ImportError: + data.append(("torchvision._C", "failed to find")) + except AttributeError: + data.append(("torchvision", "unknown")) + + try: + import fvcore + + data.append(("fvcore", fvcore.__version__)) + except ImportError: + pass + + try: + import cv2 + + data.append(("cv2", cv2.__version__)) + except ImportError: + pass + env_str = tabulate(data) + "\n" + env_str += collect_torch_env() + return env_str + + +if __name__ == "__main__": + try: + import detectron2 # noqa + except ImportError: + print(collect_env_info()) + else: + from detectron2.utils.collect_env import collect_env_info + + print(collect_env_info()) diff --git a/detectron2/utils/colormap.py b/detectron2/utils/colormap.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf1455e4ce9e077961143c8d734a7298d28476d --- /dev/null +++ b/detectron2/utils/colormap.py @@ -0,0 +1,140 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +""" +An awesome colormap for really neat visualizations. +Copied from Detectron, and removed gray colors. 
+""" + +import numpy as np + +__all__ = ["colormap", "random_color"] + +# fmt: off +# RGB: +_COLORS = np.array( + [ + 0.000, 0.447, 0.741, + 0.850, 0.325, 0.098, + 0.929, 0.694, 0.125, + 0.494, 0.184, 0.556, + 0.466, 0.674, 0.188, + 0.301, 0.745, 0.933, + 0.635, 0.078, 0.184, + 0.300, 0.300, 0.300, + 0.600, 0.600, 0.600, + 1.000, 0.000, 0.000, + 1.000, 0.500, 0.000, + 0.749, 0.749, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 1.000, + 0.667, 0.000, 1.000, + 0.333, 0.333, 0.000, + 0.333, 0.667, 0.000, + 0.333, 1.000, 0.000, + 0.667, 0.333, 0.000, + 0.667, 0.667, 0.000, + 0.667, 1.000, 0.000, + 1.000, 0.333, 0.000, + 1.000, 0.667, 0.000, + 1.000, 1.000, 0.000, + 0.000, 0.333, 0.500, + 0.000, 0.667, 0.500, + 0.000, 1.000, 0.500, + 0.333, 0.000, 0.500, + 0.333, 0.333, 0.500, + 0.333, 0.667, 0.500, + 0.333, 1.000, 0.500, + 0.667, 0.000, 0.500, + 0.667, 0.333, 0.500, + 0.667, 0.667, 0.500, + 0.667, 1.000, 0.500, + 1.000, 0.000, 0.500, + 1.000, 0.333, 0.500, + 1.000, 0.667, 0.500, + 1.000, 1.000, 0.500, + 0.000, 0.333, 1.000, + 0.000, 0.667, 1.000, + 0.000, 1.000, 1.000, + 0.333, 0.000, 1.000, + 0.333, 0.333, 1.000, + 0.333, 0.667, 1.000, + 0.333, 1.000, 1.000, + 0.667, 0.000, 1.000, + 0.667, 0.333, 1.000, + 0.667, 0.667, 1.000, + 0.667, 1.000, 1.000, + 1.000, 0.000, 1.000, + 1.000, 0.333, 1.000, + 1.000, 0.667, 1.000, + 0.333, 0.000, 0.000, + 0.500, 0.000, 0.000, + 0.667, 0.000, 0.000, + 0.833, 0.000, 0.000, + 1.000, 0.000, 0.000, + 0.000, 0.167, 0.000, + 0.000, 0.333, 0.000, + 0.000, 0.500, 0.000, + 0.000, 0.667, 0.000, + 0.000, 0.833, 0.000, + 0.000, 1.000, 0.000, + 0.000, 0.000, 0.167, + 0.000, 0.000, 0.333, + 0.000, 0.000, 0.500, + 0.000, 0.000, 0.667, + 0.000, 0.000, 0.833, + 0.000, 0.000, 1.000, + 0.000, 0.000, 0.000, + 0.143, 0.143, 0.143, + 0.857, 0.857, 0.857, + 1.000, 1.000, 1.000 + ] +).astype(np.float32).reshape(-1, 3) +# fmt: on + + +def colormap(rgb=False, maximum=255): + """ + Args: + rgb (bool): whether to return RGB colors or BGR colors. + maximum (int): either 255 or 1 + + Returns: + ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1] + """ + assert maximum in [255, 1], maximum + c = _COLORS * maximum + if not rgb: + c = c[:, ::-1] + return c + + +def random_color(rgb=False, maximum=255): + """ + Args: + rgb (bool): whether to return RGB colors or BGR colors. + maximum (int): either 255 or 1 + + Returns: + ndarray: a vector of 3 numbers + """ + idx = np.random.randint(0, len(_COLORS)) + ret = _COLORS[idx] * maximum + if not rgb: + ret = ret[::-1] + return ret + + +if __name__ == "__main__": + import cv2 + + size = 100 + H, W = 10, 10 + canvas = np.random.rand(H * size, W * size, 3).astype("float32") + for h in range(H): + for w in range(W): + idx = h * W + w + if idx >= len(_COLORS): + break + canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx] + cv2.imshow("a", canvas) + cv2.waitKey(0) diff --git a/detectron2/utils/comm.py b/detectron2/utils/comm.py new file mode 100644 index 0000000000000000000000000000000000000000..8cc7b3dac5a45db87fa91ac86fce50805ecf1bad --- /dev/null +++ b/detectron2/utils/comm.py @@ -0,0 +1,263 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +This file contains primitives for multi-gpu communication. +This is useful when doing distributed training. 
+""" + +import functools +import logging +import numpy as np +import pickle +import torch +import torch.distributed as dist + +_LOCAL_PROCESS_GROUP = None +""" +A torch process group which only includes processes that on the same machine as the current process. +This variable is set when processes are spawned by `launch()` in "engine/launch.py". +""" + + +def get_world_size() -> int: + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank() -> int: + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + return dist.get_rank() + + +def get_local_rank() -> int: + """ + Returns: + The rank of the current process within the local (per-machine) process group. + """ + if not dist.is_available(): + return 0 + if not dist.is_initialized(): + return 0 + assert _LOCAL_PROCESS_GROUP is not None + return dist.get_rank(group=_LOCAL_PROCESS_GROUP) + + +def get_local_size() -> int: + """ + Returns: + The size of the per-machine process group, + i.e. the number of processes per machine. + """ + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size(group=_LOCAL_PROCESS_GROUP) + + +def is_main_process() -> bool: + return get_rank() == 0 + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +@functools.lru_cache() +def _get_global_gloo_group(): + """ + Return a process group based on gloo backend, containing all the ranks + The result is cached. + """ + if dist.get_backend() == "nccl": + return dist.new_group(backend="gloo") + else: + return dist.group.WORLD + + +def _serialize_to_tensor(data, group): + backend = dist.get_backend(group) + assert backend in ["gloo", "nccl"] + device = torch.device("cpu" if backend == "gloo" else "cuda") + + buffer = pickle.dumps(data) + if len(buffer) > 1024 ** 3: + logger = logging.getLogger(__name__) + logger.warning( + "Rank {} trying to all-gather {:.2f} GB of data on device {}".format( + get_rank(), len(buffer) / (1024 ** 3), device + ) + ) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to(device=device) + return tensor + + +def _pad_to_largest_tensor(tensor, group): + """ + Returns: + list[int]: size of the tensor, on each rank + Tensor: padded tensor that has the max size + """ + world_size = dist.get_world_size(group=group) + assert ( + world_size >= 1 + ), "comm.gather/all_gather must be called from ranks within the given group!" + local_size = torch.tensor([tensor.numel()], dtype=torch.int64, device=tensor.device) + size_list = [ + torch.zeros([1], dtype=torch.int64, device=tensor.device) for _ in range(world_size) + ] + dist.all_gather(size_list, local_size, group=group) + size_list = [int(size.item()) for size in size_list] + + max_size = max(size_list) + + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + if local_size != max_size: + padding = torch.zeros((max_size - local_size,), dtype=torch.uint8, device=tensor.device) + tensor = torch.cat((tensor, padding), dim=0) + return size_list, tensor + + +def all_gather(data, group=None): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors). 
+ + Args: + data: any picklable object + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + + Returns: + list[data]: list of data gathered from each rank + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group) == 1: + return [data] + + tensor = _serialize_to_tensor(data, group) + + size_list, tensor = _pad_to_largest_tensor(tensor, group) + max_size = max(size_list) + + # receiving Tensor from all ranks + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list + ] + dist.all_gather(tensor_list, tensor, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def gather(data, dst=0, group=None): + """ + Run gather on arbitrary picklable data (not necessarily tensors). + + Args: + data: any picklable object + dst (int): destination rank + group: a torch process group. By default, will use a group which + contains all ranks on gloo backend. + + Returns: + list[data]: on dst, a list of data gathered from each rank. Otherwise, + an empty list. + """ + if get_world_size() == 1: + return [data] + if group is None: + group = _get_global_gloo_group() + if dist.get_world_size(group=group) == 1: + return [data] + rank = dist.get_rank(group=group) + + tensor = _serialize_to_tensor(data, group) + size_list, tensor = _pad_to_largest_tensor(tensor, group) + + # receiving Tensor from all ranks + if rank == dst: + max_size = max(size_list) + tensor_list = [ + torch.empty((max_size,), dtype=torch.uint8, device=tensor.device) for _ in size_list + ] + dist.gather(tensor, tensor_list, dst=dst, group=group) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + return data_list + else: + dist.gather(tensor, [], dst=dst, group=group) + return [] + + +def shared_random_seed(): + """ + Returns: + int: a random number that is the same across all workers. + If workers need a shared RNG, they can use this shared seed to + create one. + + All workers must call this function, otherwise it will deadlock. + """ + ints = np.random.randint(2 ** 31) + all_ints = all_gather(ints) + return all_ints[0] + + +def reduce_dict(input_dict, average=True): + """ + Reduce the values in the dictionary from all processes so that process with rank + 0 has the reduced results. + + Args: + input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor. + average (bool): whether to do average or sum + + Returns: + a dict with the same keys as input_dict, after reduction. 
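+
+    A usage sketch (the loss names are illustrative; values must be scalar CUDA
+    tensors as stated above):
+
+    .. code-block:: python
+
+        loss_dict = {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg}
+        reduced = reduce_dict(loss_dict)  # on rank 0: averaged across ranks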
+ """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.reduce(values, dst=0) + if dist.get_rank() == 0 and average: + # only main process gets accumulated, so only divide by + # world_size in this case + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict diff --git a/detectron2/utils/env.py b/detectron2/utils/env.py new file mode 100644 index 0000000000000000000000000000000000000000..6769cae4cfb71ae05c605cb9e30eb12ee58c6ee7 --- /dev/null +++ b/detectron2/utils/env.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import importlib +import importlib.util +import logging +import numpy as np +import os +import random +import sys +from datetime import datetime +import torch + +__all__ = ["seed_all_rng"] + + +def seed_all_rng(seed=None): + """ + Set the random seed for the RNG in torch, numpy and python. + + Args: + seed (int): if None, will use a strong random seed. + """ + if seed is None: + seed = ( + os.getpid() + + int(datetime.now().strftime("%S%f")) + + int.from_bytes(os.urandom(2), "big") + ) + logger = logging.getLogger(__name__) + logger.info("Using a generated random seed {}".format(seed)) + np.random.seed(seed) + torch.set_rng_state(torch.manual_seed(seed).get_state()) + random.seed(seed) + + +# from https://stackoverflow.com/questions/67631/how-to-import-a-module-given-the-full-path +def _import_file(module_name, file_path, make_importable=False): + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + if make_importable: + sys.modules[module_name] = module + return module + + +def _configure_libraries(): + """ + Configurations for some libraries. + """ + # An environment option to disable `import cv2` globally, + # in case it leads to negative performance impact + disable_cv2 = int(os.environ.get("DETECTRON2_DISABLE_CV2", False)) + if disable_cv2: + sys.modules["cv2"] = None + else: + # Disable opencl in opencv since its interaction with cuda often has negative effects + # This envvar is supported after OpenCV 3.4.0 + os.environ["OPENCV_OPENCL_RUNTIME"] = "disabled" + try: + import cv2 + + if int(cv2.__version__.split(".")[0]) >= 3: + cv2.ocl.setUseOpenCL(False) + except ImportError: + pass + + def get_version(module, digit=2): + return tuple(map(int, module.__version__.split(".")[:digit])) + + # fmt: off + assert get_version(torch) >= (1, 4), "Requires torch>=1.4" + import fvcore + assert get_version(fvcore, 3) >= (0, 1, 1), "Requires fvcore>=0.1.1" + import yaml + assert get_version(yaml) >= (5, 1), "Requires pyyaml>=5.1" + # fmt: on + + +_ENV_SETUP_DONE = False + + +def setup_environment(): + """Perform environment setup work. The default setup is a no-op, but this + function allows the user to specify a Python source file or a module in + the $DETECTRON2_ENV_MODULE environment variable, that performs + custom setup work that may be necessary to their computing environment. 
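+
+    A hypothetical sketch of pointing it at a local file:
+
+    .. code-block:: python
+
+        os.environ["DETECTRON2_ENV_MODULE"] = "/path/to/my_env.py"  # must define setup_environment()
+        setup_environment()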
+ """ + global _ENV_SETUP_DONE + if _ENV_SETUP_DONE: + return + _ENV_SETUP_DONE = True + + _configure_libraries() + + custom_module_path = os.environ.get("DETECTRON2_ENV_MODULE") + + if custom_module_path: + setup_custom_environment(custom_module_path) + else: + # The default setup is a no-op + pass + + +def setup_custom_environment(custom_module): + """ + Load custom environment setup by importing a Python source file or a + module, and run the setup function. + """ + if custom_module.endswith(".py"): + module = _import_file("detectron2.utils.env.custom_module", custom_module) + else: + module = importlib.import_module(custom_module) + assert hasattr(module, "setup_environment") and callable(module.setup_environment), ( + "Custom environment module defined in {} does not have the " + "required callable attribute 'setup_environment'." + ).format(custom_module) + module.setup_environment() diff --git a/detectron2/utils/events.py b/detectron2/utils/events.py new file mode 100644 index 0000000000000000000000000000000000000000..a3c57edb05016d2df041d756f59e90dfabddd718 --- /dev/null +++ b/detectron2/utils/events.py @@ -0,0 +1,432 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import datetime +import json +import logging +import os +import time +from collections import defaultdict +from contextlib import contextmanager +import torch +from fvcore.common.file_io import PathManager +from fvcore.common.history_buffer import HistoryBuffer + +_CURRENT_STORAGE_STACK = [] + + +def get_event_storage(): + """ + Returns: + The :class:`EventStorage` object that's currently being used. + Throws an error if no :class:`EventStorage` is currently enabled. + """ + assert len( + _CURRENT_STORAGE_STACK + ), "get_event_storage() has to be called inside a 'with EventStorage(...)' context!" + return _CURRENT_STORAGE_STACK[-1] + + +class EventWriter: + """ + Base class for writers that obtain events from :class:`EventStorage` and process them. + """ + + def write(self): + raise NotImplementedError + + def close(self): + pass + + +class JSONWriter(EventWriter): + """ + Write scalars to a json file. + + It saves scalars as one json per line (instead of a big json) for easy parsing. + + Examples parsing such a json file: + + .. code-block:: none + + $ cat metrics.json | jq -s '.[0:2]' + [ + { + "data_time": 0.008433341979980469, + "iteration": 20, + "loss": 1.9228371381759644, + "loss_box_reg": 0.050025828182697296, + "loss_classifier": 0.5316952466964722, + "loss_mask": 0.7236229181289673, + "loss_rpn_box": 0.0856662318110466, + "loss_rpn_cls": 0.48198649287223816, + "lr": 0.007173333333333333, + "time": 0.25401854515075684 + }, + { + "data_time": 0.007216215133666992, + "iteration": 40, + "loss": 1.282649278640747, + "loss_box_reg": 0.06222952902317047, + "loss_classifier": 0.30682939291000366, + "loss_mask": 0.6970193982124329, + "loss_rpn_box": 0.038663312792778015, + "loss_rpn_cls": 0.1471673548221588, + "lr": 0.007706666666666667, + "time": 0.2490077018737793 + } + ] + + $ cat metrics.json | jq '.loss_mask' + 0.7126231789588928 + 0.689423680305481 + 0.6776131987571716 + ... + + """ + + def __init__(self, json_file, window_size=20): + """ + Args: + json_file (str): path to the json file. New data will be appended if the file exists. + window_size (int): the window size of median smoothing for the scalars whose + `smoothing_hint` are True. 
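+
+        A construction sketch (the output path is illustrative):
+
+        .. code-block:: python
+
+            writer = JSONWriter("./output/metrics.json", window_size=20)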
+ """ + self._file_handle = PathManager.open(json_file, "a") + self._window_size = window_size + + def write(self): + storage = get_event_storage() + to_save = {"iteration": storage.iter} + to_save.update(storage.latest_with_smoothing_hint(self._window_size)) + self._file_handle.write(json.dumps(to_save, sort_keys=True) + "\n") + self._file_handle.flush() + try: + os.fsync(self._file_handle.fileno()) + except AttributeError: + pass + + def close(self): + self._file_handle.close() + + +class TensorboardXWriter(EventWriter): + """ + Write all scalars to a tensorboard file. + """ + + def __init__(self, log_dir: str, window_size: int = 20, **kwargs): + """ + Args: + log_dir (str): the directory to save the output events + window_size (int): the scalars will be median-smoothed by this window size + + kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` + """ + self._window_size = window_size + from torch.utils.tensorboard import SummaryWriter + + self._writer = SummaryWriter(log_dir, **kwargs) + + def write(self): + storage = get_event_storage() + for k, v in storage.latest_with_smoothing_hint(self._window_size).items(): + self._writer.add_scalar(k, v, storage.iter) + + # storage.put_{image,histogram} is only meant to be used by + # tensorboard writer. So we access its internal fields directly from here. + if len(storage._vis_data) >= 1: + for img_name, img, step_num in storage._vis_data: + self._writer.add_image(img_name, img, step_num) + # Storage stores all image data and rely on this writer to clear them. + # As a result it assumes only one writer will use its image data. + # An alternative design is to let storage store limited recent + # data (e.g. only the most recent image) that all writers can access. + # In that case a writer may not see all image data if its period is long. + storage.clear_images() + + if len(storage._histograms) >= 1: + for params in storage._histograms: + self._writer.add_histogram_raw(**params) + storage.clear_histograms() + + def close(self): + if hasattr(self, "_writer"): # doesn't exist when the code fails at import + self._writer.close() + + +class CommonMetricPrinter(EventWriter): + """ + Print **common** metrics to the terminal, including + iteration time, ETA, memory, all losses, and the learning rate. + + To print something different, please implement a similar printer by yourself. + """ + + def __init__(self, max_iter): + """ + Args: + max_iter (int): the maximum number of iterations to train. + Used to compute ETA. 
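+
+        A construction sketch (the iteration count is illustrative; `write()`
+        must later be called inside a `with EventStorage(...)` context):
+
+        .. code-block:: python
+
+            printer = CommonMetricPrinter(max_iter=90000)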
+ """ + self.logger = logging.getLogger(__name__) + self._max_iter = max_iter + self._last_write = None + + def write(self): + storage = get_event_storage() + iteration = storage.iter + + try: + data_time = storage.history("data_time").avg(20) + except KeyError: + # they may not exist in the first few iterations (due to warmup) + # or when SimpleTrainer is not used + data_time = None + + eta_string = None + try: + iter_time = storage.history("time").global_avg() + eta_seconds = storage.history("time").median(1000) * (self._max_iter - iteration) + storage.put_scalar("eta_seconds", eta_seconds, smoothing_hint=False) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + except KeyError: + iter_time = None + # estimate eta on our own - more noisy + if self._last_write is not None: + estimate_iter_time = (time.perf_counter() - self._last_write[1]) / ( + iteration - self._last_write[0] + ) + eta_seconds = estimate_iter_time * (self._max_iter - iteration) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + self._last_write = (iteration, time.perf_counter()) + + try: + lr = "{:.6f}".format(storage.history("lr").latest()) + except KeyError: + lr = "N/A" + + if torch.cuda.is_available(): + max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0 + else: + max_mem_mb = None + + # NOTE: max_mem is parsed by grep in "dev/parse_results.sh" + self.logger.info( + " {eta}iter: {iter} {losses} {time}{data_time}lr: {lr} {memory}".format( + eta=f"eta: {eta_string} " if eta_string else "", + iter=iteration, + losses=" ".join( + [ + "{}: {:.3f}".format(k, v.median(20)) + for k, v in storage.histories().items() + if "loss" in k + ] + ), + time="time: {:.4f} ".format(iter_time) if iter_time is not None else "", + data_time="data_time: {:.4f} ".format(data_time) if data_time is not None else "", + lr=lr, + memory="max_mem: {:.0f}M".format(max_mem_mb) if max_mem_mb is not None else "", + ) + ) + + +class EventStorage: + """ + The user-facing class that provides metric storage functionalities. + + In the future we may add support for storing / logging other types of data if needed. + """ + + def __init__(self, start_iter=0): + """ + Args: + start_iter (int): the iteration number to start with + """ + self._history = defaultdict(HistoryBuffer) + self._smoothing_hints = {} + self._latest_scalars = {} + self._iter = start_iter + self._current_prefix = "" + self._vis_data = [] + self._histograms = [] + + def put_image(self, img_name, img_tensor): + """ + Add an `img_tensor` associated with `img_name`, to be shown on + tensorboard. + + Args: + img_name (str): The name of the image to put into tensorboard. + img_tensor (torch.Tensor or numpy.array): An `uint8` or `float` + Tensor of shape `[channel, height, width]` where `channel` is + 3. The image format should be RGB. The elements in img_tensor + can either have values in [0, 1] (float32) or [0, 255] (uint8). + The `img_tensor` will be visualized in tensorboard. + """ + self._vis_data.append((img_name, img_tensor, self._iter)) + + def put_scalar(self, name, value, smoothing_hint=True): + """ + Add a scalar `value` to the `HistoryBuffer` associated with `name`. + + Args: + smoothing_hint (bool): a 'hint' on whether this scalar is noisy and should be + smoothed when logged. The hint will be accessible through + :meth:`EventStorage.smoothing_hints`. A writer may ignore the hint + and apply custom smoothing rule. + + It defaults to True because most scalars we save need to be smoothed to + provide any useful signal. 
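+
+        A usage sketch (the scalar name and value are illustrative):
+
+        .. code-block:: python
+
+            storage = get_event_storage()
+            storage.put_scalar("lr", 0.02, smoothing_hint=False)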
+ """ + name = self._current_prefix + name + history = self._history[name] + value = float(value) + history.update(value, self._iter) + self._latest_scalars[name] = value + + existing_hint = self._smoothing_hints.get(name) + if existing_hint is not None: + assert ( + existing_hint == smoothing_hint + ), "Scalar {} was put with a different smoothing_hint!".format(name) + else: + self._smoothing_hints[name] = smoothing_hint + + def put_scalars(self, *, smoothing_hint=True, **kwargs): + """ + Put multiple scalars from keyword arguments. + + Examples: + + storage.put_scalars(loss=my_loss, accuracy=my_accuracy, smoothing_hint=True) + """ + for k, v in kwargs.items(): + self.put_scalar(k, v, smoothing_hint=smoothing_hint) + + def put_histogram(self, hist_name, hist_tensor, bins=1000): + """ + Create a histogram from a tensor. + + Args: + hist_name (str): The name of the histogram to put into tensorboard. + hist_tensor (torch.Tensor): A Tensor of arbitrary shape to be converted + into a histogram. + bins (int): Number of histogram bins. + """ + ht_min, ht_max = hist_tensor.min().item(), hist_tensor.max().item() + + # Create a histogram with PyTorch + hist_counts = torch.histc(hist_tensor, bins=bins) + hist_edges = torch.linspace(start=ht_min, end=ht_max, steps=bins + 1, dtype=torch.float32) + + # Parameter for the add_histogram_raw function of SummaryWriter + hist_params = dict( + tag=hist_name, + min=ht_min, + max=ht_max, + num=len(hist_tensor), + sum=float(hist_tensor.sum()), + sum_squares=float(torch.sum(hist_tensor ** 2)), + bucket_limits=hist_edges[1:].tolist(), + bucket_counts=hist_counts.tolist(), + global_step=self._iter, + ) + self._histograms.append(hist_params) + + def history(self, name): + """ + Returns: + HistoryBuffer: the scalar history for name + """ + ret = self._history.get(name, None) + if ret is None: + raise KeyError("No history metric available for {}!".format(name)) + return ret + + def histories(self): + """ + Returns: + dict[name -> HistoryBuffer]: the HistoryBuffer for all scalars + """ + return self._history + + def latest(self): + """ + Returns: + dict[name -> number]: the scalars that's added in the current iteration. + """ + return self._latest_scalars + + def latest_with_smoothing_hint(self, window_size=20): + """ + Similar to :meth:`latest`, but the returned values + are either the un-smoothed original latest value, + or a median of the given window_size, + depend on whether the smoothing_hint is True. + + This provides a default behavior that other writers can use. + """ + result = {} + for k, v in self._latest_scalars.items(): + result[k] = self._history[k].median(window_size) if self._smoothing_hints[k] else v + return result + + def smoothing_hints(self): + """ + Returns: + dict[name -> bool]: the user-provided hint on whether the scalar + is noisy and needs smoothing. + """ + return self._smoothing_hints + + def step(self): + """ + User should call this function at the beginning of each iteration, to + notify the storage of the start of a new iteration. + The storage will then be able to associate the new data with the + correct iteration number. 
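+
+        A sketch of the expected calling pattern (`start_iter` and `max_iter`
+        are supplied by the caller):
+
+        .. code-block:: python
+
+            with EventStorage(start_iter) as storage:
+                for iteration in range(start_iter, max_iter):
+                    storage.step()
+                    # ... compute metrics and call storage.put_scalar(...) here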
+ """ + self._iter += 1 + self._latest_scalars = {} + + @property + def iter(self): + return self._iter + + @property + def iteration(self): + # for backward compatibility + return self._iter + + def __enter__(self): + _CURRENT_STORAGE_STACK.append(self) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + assert _CURRENT_STORAGE_STACK[-1] == self + _CURRENT_STORAGE_STACK.pop() + + @contextmanager + def name_scope(self, name): + """ + Yields: + A context within which all the events added to this storage + will be prefixed by the name scope. + """ + old_prefix = self._current_prefix + self._current_prefix = name.rstrip("/") + "/" + yield + self._current_prefix = old_prefix + + def clear_images(self): + """ + Delete all the stored images for visualization. This should be called + after images are written to tensorboard. + """ + self._vis_data = [] + + def clear_histograms(self): + """ + Delete all the stored histograms for visualization. + This should be called after histograms are written to tensorboard. + """ + self._histograms = [] diff --git a/detectron2/utils/logger.py b/detectron2/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..b6496d9d6096f557ffa684be80342ec220c6014c --- /dev/null +++ b/detectron2/utils/logger.py @@ -0,0 +1,221 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import functools +import logging +import os +import sys +import time +from collections import Counter +from fvcore.common.file_io import PathManager +from tabulate import tabulate +from termcolor import colored + + +class _ColorfulFormatter(logging.Formatter): + def __init__(self, *args, **kwargs): + self._root_name = kwargs.pop("root_name") + "." + self._abbrev_name = kwargs.pop("abbrev_name", "") + if len(self._abbrev_name): + self._abbrev_name = self._abbrev_name + "." + super(_ColorfulFormatter, self).__init__(*args, **kwargs) + + def formatMessage(self, record): + record.name = record.name.replace(self._root_name, self._abbrev_name) + log = super(_ColorfulFormatter, self).formatMessage(record) + if record.levelno == logging.WARNING: + prefix = colored("WARNING", "red", attrs=["blink"]) + elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL: + prefix = colored("ERROR", "red", attrs=["blink", "underline"]) + else: + return log + return prefix + " " + log + + +@functools.lru_cache() # so that calling setup_logger multiple times won't add many handlers +def setup_logger( + output=None, distributed_rank=0, *, color=True, name="detectron2", abbrev_name=None +): + """ + Initialize the detectron2 logger and set its verbosity level to "DEBUG". + + Args: + output (str): a file name or a directory to save log. If None, will not save log file. + If ends with ".txt" or ".log", assumed to be a file name. + Otherwise, logs will be saved to `output/log.txt`. + name (str): the root module name of this logger + abbrev_name (str): an abbreviation of the module, to avoid long names in logs. + Set to "" to not log the root module in logs. + By default, will abbreviate "detectron2" to "d2" and leave other + modules unchanged. 
+ + Returns: + logging.Logger: a logger + """ + logger = logging.getLogger(name) + logger.setLevel(logging.DEBUG) + logger.propagate = False + + if abbrev_name is None: + abbrev_name = "d2" if name == "detectron2" else name + + plain_formatter = logging.Formatter( + "[%(asctime)s] %(name)s %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S" + ) + # stdout logging: master only + if distributed_rank == 0: + ch = logging.StreamHandler(stream=sys.stdout) + ch.setLevel(logging.DEBUG) + if color: + formatter = _ColorfulFormatter( + colored("[%(asctime)s %(name)s]: ", "green") + "%(message)s", + datefmt="%m/%d %H:%M:%S", + root_name=name, + abbrev_name=str(abbrev_name), + ) + else: + formatter = plain_formatter + ch.setFormatter(formatter) + logger.addHandler(ch) + + # file logging: all workers + if output is not None: + if output.endswith(".txt") or output.endswith(".log"): + filename = output + else: + filename = os.path.join(output, "log.txt") + if distributed_rank > 0: + filename = filename + ".rank{}".format(distributed_rank) + PathManager.mkdirs(os.path.dirname(filename)) + + fh = logging.StreamHandler(_cached_log_stream(filename)) + fh.setLevel(logging.DEBUG) + fh.setFormatter(plain_formatter) + logger.addHandler(fh) + + return logger + + +# cache the opened file object, so that different calls to `setup_logger` +# with the same file name can safely write to the same file. +@functools.lru_cache(maxsize=None) +def _cached_log_stream(filename): + return PathManager.open(filename, "a") + + +""" +Below are some other convenient logging methods. +They are mainly adopted from +https://github.com/abseil/abseil-py/blob/master/absl/logging/__init__.py +""" + + +def _find_caller(): + """ + Returns: + str: module name of the caller + tuple: a hashable key to be used to identify different callers + """ + frame = sys._getframe(2) + while frame: + code = frame.f_code + if os.path.join("utils", "logger.") not in code.co_filename: + mod_name = frame.f_globals["__name__"] + if mod_name == "__main__": + mod_name = "detectron2" + return mod_name, (code.co_filename, frame.f_lineno, code.co_name) + frame = frame.f_back + + +_LOG_COUNTER = Counter() +_LOG_TIMER = {} + + +def log_first_n(lvl, msg, n=1, *, name=None, key="caller"): + """ + Log only for the first n times. + + Args: + lvl (int): the logging level + msg (str): + n (int): + name (str): name of the logger to use. Will use the caller's module by default. + key (str or tuple[str]): the string(s) can be one of "caller" or + "message", which defines how to identify duplicated logs. + For example, if called with `n=1, key="caller"`, this function + will only log the first call from the same caller, regardless of + the message content. + If called with `n=1, key="message"`, this function will log the + same content only once, even if they are called from different places. + If called with `n=1, key=("caller", "message")`, this function + will not log only if the same caller has logged the same message before. + """ + if isinstance(key, str): + key = (key,) + assert len(key) > 0 + + caller_module, caller_key = _find_caller() + hash_key = () + if "caller" in key: + hash_key = hash_key + caller_key + if "message" in key: + hash_key = hash_key + (msg,) + + _LOG_COUNTER[hash_key] += 1 + if _LOG_COUNTER[hash_key] <= n: + logging.getLogger(name or caller_module).log(lvl, msg) + + +def log_every_n(lvl, msg, n=1, *, name=None): + """ + Log once per n times. + + Args: + lvl (int): the logging level + msg (str): + n (int): + name (str): name of the logger to use. 
Will use the caller's module by default. + """ + caller_module, key = _find_caller() + _LOG_COUNTER[key] += 1 + if n == 1 or _LOG_COUNTER[key] % n == 1: + logging.getLogger(name or caller_module).log(lvl, msg) + + +def log_every_n_seconds(lvl, msg, n=1, *, name=None): + """ + Log no more than once per n seconds. + + Args: + lvl (int): the logging level + msg (str): + n (int): + name (str): name of the logger to use. Will use the caller's module by default. + """ + caller_module, key = _find_caller() + last_logged = _LOG_TIMER.get(key, None) + current_time = time.time() + if last_logged is None or current_time - last_logged >= n: + logging.getLogger(name or caller_module).log(lvl, msg) + _LOG_TIMER[key] = current_time + + +def create_small_table(small_dict): + """ + Create a small table using the keys of small_dict as headers. This is only + suitable for small dictionaries. + + Args: + small_dict (dict): a result dictionary of only a few items. + + Returns: + str: the table as a string. + """ + keys, values = tuple(zip(*small_dict.items())) + table = tabulate( + [values], + headers=keys, + tablefmt="pipe", + floatfmt=".3f", + stralign="center", + numalign="center", + ) + return table diff --git a/detectron2/utils/memory.py b/detectron2/utils/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..d495a1681f460668c96f64454e31e7f2fca8137a --- /dev/null +++ b/detectron2/utils/memory.py @@ -0,0 +1,86 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import logging +from contextlib import contextmanager +from functools import wraps +import torch + +__all__ = ["retry_if_cuda_oom"] + + +@contextmanager +def _ignore_torch_cuda_oom(): + """ + A context which ignores CUDA OOM exception from pytorch. + """ + try: + yield + except RuntimeError as e: + # NOTE: the string may change? + if "CUDA out of memory. " in str(e): + pass + else: + raise + + +def retry_if_cuda_oom(func): + """ + Makes a function retry itself after encountering + pytorch's CUDA OOM error. + It will first retry after calling `torch.cuda.empty_cache()`. + + If that still fails, it will then retry by trying to convert inputs to CPUs. + In this case, it expects the function to dispatch to CPU implementation. + The return values may become CPU tensors as well and it's user's + responsibility to convert it back to CUDA tensor if needed. + + Args: + func: a stateless callable that takes tensor-like objects as arguments + + Returns: + a callable which retries `func` if OOM is encountered. + + Examples: + + .. code-block:: python + + output = retry_if_cuda_oom(some_torch_function)(input1, input2) + # output may be on CPU even if inputs are on GPU + + Note: + 1. When converting inputs to CPU, it will only look at each argument and check + if it has `.device` and `.to` for conversion. Nested structures of tensors + are not supported. + + 2. Since the function might be called more than once, it has to be + stateless. + """ + + def maybe_to_cpu(x): + try: + like_gpu_tensor = x.device.type == "cuda" and hasattr(x, "to") + except AttributeError: + like_gpu_tensor = False + if like_gpu_tensor: + return x.to(device="cpu") + else: + return x + + @wraps(func) + def wrapped(*args, **kwargs): + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # Clear cache and retry + torch.cuda.empty_cache() + with _ignore_torch_cuda_oom(): + return func(*args, **kwargs) + + # Try on CPU. This slows down the code significantly, therefore print a notice. 
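+        # (The CPU fallback only converts top-level tensor-like arguments via
+        # `maybe_to_cpu`; the wrapped function must accept CPU inputs for this
+        # last attempt to succeed.)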
+ logger = logging.getLogger(__name__) + logger.info("Attempting to copy inputs of {} to CPU due to CUDA OOM".format(str(func))) + new_args = (maybe_to_cpu(x) for x in args) + new_kwargs = {k: maybe_to_cpu(v) for k, v in kwargs.items()} + return func(*new_args, **new_kwargs) + + return wrapped diff --git a/detectron2/utils/registry.py b/detectron2/utils/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..fea1de961f0dbdacc934e11b9af5647b2a008051 --- /dev/null +++ b/detectron2/utils/registry.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# Keep this module for backward compatibility. +from fvcore.common.registry import Registry # noqa + +__all__ = ["Registry"] diff --git a/detectron2/utils/serialize.py b/detectron2/utils/serialize.py new file mode 100644 index 0000000000000000000000000000000000000000..734a62c2c4ecfd520eb9e8b941857b6f7e17d4c8 --- /dev/null +++ b/detectron2/utils/serialize.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import cloudpickle + + +class PicklableWrapper(object): + """ + Wrap an object to make it more picklable, note that it uses + heavy weight serialization libraries that are slower than pickle. + It's best to use it only on closures (which are usually not picklable). + + This is a simplified version of + https://github.com/joblib/joblib/blob/master/joblib/externals/loky/cloudpickle_wrapper.py + """ + + def __init__(self, obj): + self._obj = obj + + def __reduce__(self): + s = cloudpickle.dumps(self._obj) + return cloudpickle.loads, (s,) + + def __call__(self, *args, **kwargs): + return self._obj(*args, **kwargs) + + def __getattr__(self, attr): + # Ensure that the wrapped object can be used seamlessly as the previous object. + if attr not in ["_obj"]: + return getattr(self._obj, attr) + return getattr(self, attr) diff --git a/detectron2/utils/video_visualizer.py b/detectron2/utils/video_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..0144b679d09bbb8049c30eb849099422355b492c --- /dev/null +++ b/detectron2/utils/video_visualizer.py @@ -0,0 +1,235 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +import pycocotools.mask as mask_util + +from detectron2.utils.visualizer import ( + ColorMode, + Visualizer, + _create_text_labels, + _PanopticPrediction, +) + +from .colormap import random_color + + +class _DetectedInstance: + """ + Used to store data about detected objects in video frame, + in order to transfer color to objects in the future frames. + + Attributes: + label (int): + bbox (tuple[float]): + mask_rle (dict): + color (tuple[float]): RGB colors in range (0, 1) + ttl (int): time-to-live for the instance. For example, if ttl=2, + the instance color can be transferred to objects in the next two frames. + """ + + __slots__ = ["label", "bbox", "mask_rle", "color", "ttl"] + + def __init__(self, label, bbox, mask_rle, color, ttl): + self.label = label + self.bbox = bbox + self.mask_rle = mask_rle + self.color = color + self.ttl = ttl + + +class VideoVisualizer: + def __init__(self, metadata, instance_mode=ColorMode.IMAGE): + """ + Args: + metadata (MetadataCatalog): image metadata. + """ + self.metadata = metadata + self._old_instances = [] + assert instance_mode in [ + ColorMode.IMAGE, + ColorMode.IMAGE_BW, + ], "Other mode not supported yet." 
+ self._instance_mode = instance_mode + + def draw_instance_predictions(self, frame, predictions): + """ + Draw instance-level prediction results on an image. + + Args: + frame (ndarray): an RGB image of shape (H, W, C), in the range [0, 255]. + predictions (Instances): the output of an instance detection/segmentation + model. Following fields will be used to draw: + "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). + + Returns: + output (VisImage): image object with visualizations. + """ + frame_visualizer = Visualizer(frame, self.metadata) + num_instances = len(predictions) + if num_instances == 0: + return frame_visualizer.output + + boxes = predictions.pred_boxes.tensor.numpy() if predictions.has("pred_boxes") else None + scores = predictions.scores if predictions.has("scores") else None + classes = predictions.pred_classes.numpy() if predictions.has("pred_classes") else None + keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None + + if predictions.has("pred_masks"): + masks = predictions.pred_masks + # mask IOU is not yet enabled + # masks_rles = mask_util.encode(np.asarray(masks.permute(1, 2, 0), order="F")) + # assert len(masks_rles) == num_instances + else: + masks = None + + detected = [ + _DetectedInstance(classes[i], boxes[i], mask_rle=None, color=None, ttl=8) + for i in range(num_instances) + ] + colors = self._assign_colors(detected) + + labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) + + if self._instance_mode == ColorMode.IMAGE_BW: + # any() returns uint8 tensor + frame_visualizer.output.img = frame_visualizer._create_grayscale_image( + (masks.any(dim=0) > 0).numpy() if masks is not None else None + ) + alpha = 0.3 + else: + alpha = 0.5 + + frame_visualizer.overlay_instances( + boxes=None if masks is not None else boxes, # boxes are a bit distracting + masks=masks, + labels=labels, + keypoints=keypoints, + assigned_colors=colors, + alpha=alpha, + ) + + return frame_visualizer.output + + def draw_sem_seg(self, frame, sem_seg, area_threshold=None): + """ + Args: + sem_seg (ndarray or Tensor): semantic segmentation of shape (H, W), + each value is the integer label. + area_threshold (Optional[int]): only draw segmentations larger than the threshold + """ + # don't need to do anything special + frame_visualizer = Visualizer(frame, self.metadata) + frame_visualizer.draw_sem_seg(sem_seg, area_threshold=None) + return frame_visualizer.output + + def draw_panoptic_seg_predictions( + self, frame, panoptic_seg, segments_info, area_threshold=None, alpha=0.5 + ): + frame_visualizer = Visualizer(frame, self.metadata) + pred = _PanopticPrediction(panoptic_seg, segments_info) + + if self._instance_mode == ColorMode.IMAGE_BW: + frame_visualizer.output.img = frame_visualizer._create_grayscale_image( + pred.non_empty_mask() + ) + + # draw mask for all semantic segments first i.e. 
"stuff" + for mask, sinfo in pred.semantic_masks(): + category_idx = sinfo["category_id"] + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] + except AttributeError: + mask_color = None + + frame_visualizer.draw_binary_mask( + mask, + color=mask_color, + text=self.metadata.stuff_classes[category_idx], + alpha=alpha, + area_threshold=area_threshold, + ) + + all_instances = list(pred.instance_masks()) + if len(all_instances) == 0: + return frame_visualizer.output + # draw mask for all instances second + masks, sinfo = list(zip(*all_instances)) + num_instances = len(masks) + masks_rles = mask_util.encode( + np.asarray(np.asarray(masks).transpose(1, 2, 0), dtype=np.uint8, order="F") + ) + assert len(masks_rles) == num_instances + + category_ids = [x["category_id"] for x in sinfo] + detected = [ + _DetectedInstance(category_ids[i], bbox=None, mask_rle=masks_rles[i], color=None, ttl=8) + for i in range(num_instances) + ] + colors = self._assign_colors(detected) + labels = [self.metadata.thing_classes[k] for k in category_ids] + + frame_visualizer.overlay_instances( + boxes=None, + masks=masks, + labels=labels, + keypoints=None, + assigned_colors=colors, + alpha=alpha, + ) + return frame_visualizer.output + + def _assign_colors(self, instances): + """ + Naive tracking heuristics to assign same color to the same instance, + will update the internal state of tracked instances. + + Returns: + list[tuple[float]]: list of colors. + """ + + # Compute iou with either boxes or masks: + is_crowd = np.zeros((len(instances),), dtype=np.bool) + if instances[0].bbox is None: + assert instances[0].mask_rle is not None + # use mask iou only when box iou is None + # because box seems good enough + rles_old = [x.mask_rle for x in self._old_instances] + rles_new = [x.mask_rle for x in instances] + ious = mask_util.iou(rles_old, rles_new, is_crowd) + threshold = 0.5 + else: + boxes_old = [x.bbox for x in self._old_instances] + boxes_new = [x.bbox for x in instances] + ious = mask_util.iou(boxes_old, boxes_new, is_crowd) + threshold = 0.6 + if len(ious) == 0: + ious = np.zeros((len(self._old_instances), len(instances)), dtype="float32") + + # Only allow matching instances of the same label: + for old_idx, old in enumerate(self._old_instances): + for new_idx, new in enumerate(instances): + if old.label != new.label: + ious[old_idx, new_idx] = 0 + + matched_new_per_old = np.asarray(ious).argmax(axis=1) + max_iou_per_old = np.asarray(ious).max(axis=1) + + # Try to find match for each old instance: + extra_instances = [] + for idx, inst in enumerate(self._old_instances): + if max_iou_per_old[idx] > threshold: + newidx = matched_new_per_old[idx] + if instances[newidx].color is None: + instances[newidx].color = inst.color + continue + # If an old instance does not match any new instances, + # keep it for the next frame in case it is just missed by the detector + inst.ttl -= 1 + if inst.ttl > 0: + extra_instances.append(inst) + + # Assign random color to newly-detected instances: + for inst in instances: + if inst.color is None: + inst.color = random_color(rgb=True, maximum=1) + self._old_instances = instances[:] + extra_instances + return [d.color for d in instances] diff --git a/detectron2/utils/visualizer.py b/detectron2/utils/visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..05da03da1338f0d4d39785718c796c70998c2d7e --- /dev/null +++ b/detectron2/utils/visualizer.py @@ -0,0 +1,1143 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import colorsys +import logging +import math +import numpy as np +from enum import Enum, unique +import cv2 +import matplotlib as mpl +import matplotlib.colors as mplc +import matplotlib.figure as mplfigure +import pycocotools.mask as mask_util +import torch +from fvcore.common.file_io import PathManager +from matplotlib.backends.backend_agg import FigureCanvasAgg +from PIL import Image + +from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes + +from .colormap import random_color + +logger = logging.getLogger(__name__) + +__all__ = ["ColorMode", "VisImage", "Visualizer"] + + +_SMALL_OBJECT_AREA_THRESH = 1000 +_LARGE_MASK_AREA_THRESH = 120000 +_OFF_WHITE = (1.0, 1.0, 240.0 / 255) +_BLACK = (0, 0, 0) +_RED = (1.0, 0, 0) + +_KEYPOINT_THRESHOLD = 0.05 + + +@unique +class ColorMode(Enum): + """ + Enum of different color modes to use for instance visualizations. + """ + + IMAGE = 0 + """ + Picks a random color for every instance and overlay segmentations with low opacity. + """ + SEGMENTATION = 1 + """ + Let instances of the same category have similar colors + (from metadata.thing_colors), and overlay them with + high opacity. This provides more attention on the quality of segmentation. + """ + IMAGE_BW = 2 + """ + Same as IMAGE, but convert all areas without masks to gray-scale. + Only available for drawing per-instance mask predictions. + """ + + +class GenericMask: + """ + Attribute: + polygons (list[ndarray]): list[ndarray]: polygons for this mask. + Each ndarray has format [x, y, x, y, ...] + mask (ndarray): a binary mask + """ + + def __init__(self, mask_or_polygons, height, width): + self._mask = self._polygons = self._has_holes = None + self.height = height + self.width = width + + m = mask_or_polygons + if isinstance(m, dict): + # RLEs + assert "counts" in m and "size" in m + if isinstance(m["counts"], list): # uncompressed RLEs + h, w = m["size"] + assert h == height and w == width + m = mask_util.frPyObjects(m, h, w) + self._mask = mask_util.decode(m)[:, :] + return + + if isinstance(m, list): # list[ndarray] + self._polygons = [np.asarray(x).reshape(-1) for x in m] + return + + if isinstance(m, np.ndarray): # assumed to be a binary mask + assert m.shape[1] != 2, m.shape + assert m.shape == (height, width), m.shape + self._mask = m.astype("uint8") + return + + raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m))) + + @property + def mask(self): + if self._mask is None: + self._mask = self.polygons_to_mask(self._polygons) + return self._mask + + @property + def polygons(self): + if self._polygons is None: + self._polygons, self._has_holes = self.mask_to_polygons(self._mask) + return self._polygons + + @property + def has_holes(self): + if self._has_holes is None: + if self._mask is not None: + self._polygons, self._has_holes = self.mask_to_polygons(self._mask) + else: + self._has_holes = False # if original format is polygon, does not have holes + return self._has_holes + + def mask_to_polygons(self, mask): + # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level + # hierarchy. External contours (boundary) of the object are placed in hierarchy-1. + # Internal contours (holes) are placed in hierarchy-2. + # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours. 
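+        # Note: cv2.findContours returns (contours, hierarchy) in OpenCV >= 4 but
+        # (image, contours, hierarchy) in OpenCV 3.x; indexing from the end below
+        # (res[-1] for the hierarchy, res[-2] for the contours) works with both.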
+ mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr + res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) + hierarchy = res[-1] + if hierarchy is None: # empty mask + return [], False + has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0 + res = res[-2] + res = [x.flatten() for x in res] + res = [x for x in res if len(x) >= 6] + return res, has_holes + + def polygons_to_mask(self, polygons): + rle = mask_util.frPyObjects(polygons, self.height, self.width) + rle = mask_util.merge(rle) + return mask_util.decode(rle)[:, :] + + def area(self): + return self.mask.sum() + + def bbox(self): + p = mask_util.frPyObjects(self.polygons, self.height, self.width) + p = mask_util.merge(p) + bbox = mask_util.toBbox(p) + bbox[2] += bbox[0] + bbox[3] += bbox[1] + return bbox + + +class _PanopticPrediction: + def __init__(self, panoptic_seg, segments_info): + self._seg = panoptic_seg + + self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info + segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True) + areas = areas.numpy() + sorted_idxs = np.argsort(-areas) + self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs] + self._seg_ids = self._seg_ids.tolist() + for sid, area in zip(self._seg_ids, self._seg_areas): + if sid in self._sinfo: + self._sinfo[sid]["area"] = float(area) + + def non_empty_mask(self): + """ + Returns: + (H, W) array, a mask for all pixels that have a prediction + """ + empty_ids = [] + for id in self._seg_ids: + if id not in self._sinfo: + empty_ids.append(id) + if len(empty_ids) == 0: + return np.zeros(self._seg.shape, dtype=np.uint8) + assert ( + len(empty_ids) == 1 + ), ">1 ids corresponds to no labels. This is currently not supported" + return (self._seg != empty_ids[0]).numpy().astype(np.bool) + + def semantic_masks(self): + for sid in self._seg_ids: + sinfo = self._sinfo.get(sid) + if sinfo is None or sinfo["isthing"]: + # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions. + continue + yield (self._seg == sid).numpy().astype(np.bool), sinfo + + def instance_masks(self): + for sid in self._seg_ids: + sinfo = self._sinfo.get(sid) + if sinfo is None or not sinfo["isthing"]: + continue + mask = (self._seg == sid).numpy().astype(np.bool) + if mask.sum() > 0: + yield mask, sinfo + + +def _create_text_labels(classes, scores, class_names): + """ + Args: + classes (list[int] or None): + scores (list[float] or None): + class_names (list[str] or None): + + Returns: + list[str] or None + """ + labels = None + if classes is not None and class_names is not None and len(class_names) > 1: + labels = [class_names[i] for i in classes] + if scores is not None: + if labels is None: + labels = ["{:.0f}%".format(s * 100) for s in scores] + else: + labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] + return labels + + +class VisImage: + def __init__(self, img, scale=1.0): + """ + Args: + img (ndarray): an RGB image of shape (H, W, 3). + scale (float): scale the input image + """ + self.img = img + self.scale = scale + self.width, self.height = img.shape[1], img.shape[0] + self._setup_figure(img) + + def _setup_figure(self, img): + """ + Args: + Same as in :meth:`__init__()`. + + Returns: + fig (matplotlib.pyplot.figure): top level container for all the image plot elements. + ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system. 
+ """ + fig = mplfigure.Figure(frameon=False) + self.dpi = fig.get_dpi() + # add a small 1e-2 to avoid precision lost due to matplotlib's truncation + # (https://github.com/matplotlib/matplotlib/issues/15363) + fig.set_size_inches( + (self.width * self.scale + 1e-2) / self.dpi, + (self.height * self.scale + 1e-2) / self.dpi, + ) + self.canvas = FigureCanvasAgg(fig) + # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig) + ax = fig.add_axes([0.0, 0.0, 1.0, 1.0]) + ax.axis("off") + ax.set_xlim(0.0, self.width) + ax.set_ylim(self.height) + + self.fig = fig + self.ax = ax + + def save(self, filepath): + """ + Args: + filepath (str): a string that contains the absolute path, including the file name, where + the visualized image will be saved. + """ + if filepath.lower().endswith(".jpg") or filepath.lower().endswith(".png"): + # faster than matplotlib's imshow + cv2.imwrite(filepath, self.get_image()[:, :, ::-1]) + else: + # support general formats (e.g. pdf) + self.ax.imshow(self.img, interpolation="nearest") + self.fig.savefig(filepath) + + def get_image(self): + """ + Returns: + ndarray: + the visualized image of shape (H, W, 3) (RGB) in uint8 type. + The shape is scaled w.r.t the input image using the given `scale` argument. + """ + canvas = self.canvas + s, (width, height) = canvas.print_to_buffer() + if (self.width, self.height) != (width, height): + img = cv2.resize(self.img, (width, height)) + else: + img = self.img + + # buf = io.BytesIO() # works for cairo backend + # canvas.print_rgba(buf) + # width, height = self.width, self.height + # s = buf.getvalue() + + buffer = np.frombuffer(s, dtype="uint8") + + # imshow is slow. blend manually (still quite slow) + img_rgba = buffer.reshape(height, width, 4) + rgb, alpha = np.split(img_rgba, [3], axis=2) + + try: + import numexpr as ne # fuse them with numexpr + + visualized_image = ne.evaluate("img * (1 - alpha / 255.0) + rgb * (alpha / 255.0)") + except ImportError: + alpha = alpha.astype("float32") / 255.0 + visualized_image = img * (1 - alpha) + rgb * alpha + + visualized_image = visualized_image.astype("uint8") + + return visualized_image + + +class Visualizer: + def __init__(self, img_rgb, metadata, scale=1.0, instance_mode=ColorMode.IMAGE): + """ + Args: + img_rgb: a numpy array of shape (H, W, C), where H and W correspond to + the height and width of the image respectively. C is the number of + color channels. The image is required to be in RGB format since that + is a requirement of the Matplotlib library. The image is also expected + to be in the range [0, 255]. + metadata (MetadataCatalog): image metadata. + """ + self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) + self.metadata = metadata + self.output = VisImage(self.img, scale=scale) + self.cpu_device = torch.device("cpu") + + # too small texts are useless, therefore clamp to 9 + self._default_font_size = max( + np.sqrt(self.output.height * self.output.width) // 90, 10 // scale + ) + self._instance_mode = instance_mode + + def draw_instance_predictions(self, predictions): + """ + Draw instance-level prediction results on an image. + + Args: + predictions (Instances): the output of an instance detection/segmentation + model. Following fields will be used to draw: + "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). + + Returns: + output (VisImage): image object with visualizations. 
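+
+        A usage sketch (assumes `outputs` came from a model wrapper such as
+        `DefaultPredictor`, `metadata` matches the dataset it was trained on,
+        and `img_bgr` was read with OpenCV):
+
+        .. code-block:: python
+
+            v = Visualizer(img_bgr[:, :, ::-1], metadata)  # Visualizer expects RGB
+            vis = v.draw_instance_predictions(outputs["instances"].to("cpu"))
+            cv2.imwrite("result.png", vis.get_image()[:, :, ::-1])  # back to BGR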
+ """ + boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None + scores = predictions.scores if predictions.has("scores") else None + classes = predictions.pred_classes if predictions.has("pred_classes") else None + labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) + keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None + + if predictions.has("pred_masks"): + masks = np.asarray(predictions.pred_masks) + masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] + else: + masks = None + + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes + ] + alpha = 0.8 + else: + colors = None + alpha = 0.5 + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.img = self._create_grayscale_image( + (predictions.pred_masks.any(dim=0) > 0).numpy() + ) + alpha = 0.3 + + self.overlay_instances( + masks=masks, + boxes=boxes, + labels=labels, + keypoints=keypoints, + assigned_colors=colors, + alpha=alpha, + ) + return self.output + + def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8): + """ + Draw semantic segmentation predictions/labels. + + Args: + sem_seg (Tensor or ndarray): the segmentation of shape (H, W). + Each value is the integer label of the pixel. + area_threshold (int): segments with less than `area_threshold` are not drawn. + alpha (float): the larger it is, the more opaque the segmentations are. + + Returns: + output (VisImage): image object with visualizations. + """ + if isinstance(sem_seg, torch.Tensor): + sem_seg = sem_seg.numpy() + labels, areas = np.unique(sem_seg, return_counts=True) + sorted_idxs = np.argsort(-areas).tolist() + labels = labels[sorted_idxs] + for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] + except (AttributeError, IndexError): + mask_color = None + + binary_mask = (sem_seg == label).astype(np.uint8) + text = self.metadata.stuff_classes[label] + self.draw_binary_mask( + binary_mask, + color=mask_color, + edge_color=_OFF_WHITE, + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + return self.output + + def draw_panoptic_seg_predictions( + self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7 + ): + """ + Draw panoptic prediction results on an image. + + Args: + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each + segment. + segments_info (list[dict]): Describe each segment in `panoptic_seg`. + Each dict contains keys "id", "category_id", "isthing". + area_threshold (int): stuff segments with less than `area_threshold` are not drawn. + + Returns: + output (VisImage): image object with visualizations. + """ + pred = _PanopticPrediction(panoptic_seg, segments_info) + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.img = self._create_grayscale_image(pred.non_empty_mask()) + + # draw mask for all semantic segments first i.e. 
"stuff" + for mask, sinfo in pred.semantic_masks(): + category_idx = sinfo["category_id"] + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] + except AttributeError: + mask_color = None + + text = self.metadata.stuff_classes[category_idx] + self.draw_binary_mask( + mask, + color=mask_color, + edge_color=_OFF_WHITE, + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + + # draw mask for all instances second + all_instances = list(pred.instance_masks()) + if len(all_instances) == 0: + return self.output + masks, sinfo = list(zip(*all_instances)) + category_ids = [x["category_id"] for x in sinfo] + + try: + scores = [x["score"] for x in sinfo] + except KeyError: + scores = None + labels = _create_text_labels(category_ids, scores, self.metadata.thing_classes) + + try: + colors = [random_color(rgb=True, maximum=1) for k in category_ids] + except AttributeError: + colors = None + self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha) + + return self.output + + def draw_dataset_dict(self, dic): + """ + Draw annotations/segmentaions in Detectron2 Dataset format. + + Args: + dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. + + Returns: + output (VisImage): image object with visualizations. + """ + annos = dic.get("annotations", None) + if annos: + if "segmentation" in annos[0]: + masks = [x["segmentation"] for x in annos] + else: + masks = None + if "keypoints" in annos[0]: + keypts = [x["keypoints"] for x in annos] + keypts = np.array(keypts).reshape(len(annos), -1, 3) + else: + keypts = None + + boxes = [BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) for x in annos] + + labels = [x["category_id"] for x in annos] + colors = None + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): + colors = [ + self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in labels + ] + names = self.metadata.get("thing_classes", None) + if names: + labels = [names[i] for i in labels] + labels = [ + "{}".format(i) + ("|crowd" if a.get("iscrowd", 0) else "") + for i, a in zip(labels, annos) + ] + self.overlay_instances( + labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors + ) + + sem_seg = dic.get("sem_seg", None) + if sem_seg is None and "sem_seg_file_name" in dic: + with PathManager.open(dic["sem_seg_file_name"], "rb") as f: + sem_seg = Image.open(f) + sem_seg = np.asarray(sem_seg, dtype="uint8") + if sem_seg is not None: + self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) + return self.output + + def overlay_instances( + self, + *, + boxes=None, + labels=None, + masks=None, + keypoints=None, + assigned_colors=None, + alpha=0.5 + ): + """ + Args: + boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, + or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, + or a :class:`RotatedBoxes`, + or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image, + labels (list[str]): the text to be displayed for each instance. + masks (masks-like object): Supported types are: + + * :class:`detectron2.structures.PolygonMasks`, + :class:`detectron2.structures.BitMasks`. + * list[list[ndarray]]: contains the segmentation masks for all objects in one image. + The first level of the list corresponds to individual instances. 
The second + level to all the polygon that compose the instance, and the third level + to the polygon coordinates. The third level should have the format of + [x0, y0, x1, y1, ..., xn, yn] (n >= 3). + * list[ndarray]: each ndarray is a binary mask of shape (H, W). + * list[dict]: each dict is a COCO-style RLE. + keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), + where the N is the number of instances and K is the number of keypoints. + The last dimension corresponds to (x, y, visibility or score). + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + + Returns: + output (VisImage): image object with visualizations. + """ + num_instances = None + if boxes is not None: + boxes = self._convert_boxes(boxes) + num_instances = len(boxes) + if masks is not None: + masks = self._convert_masks(masks) + if num_instances: + assert len(masks) == num_instances + else: + num_instances = len(masks) + if keypoints is not None: + if num_instances: + assert len(keypoints) == num_instances + else: + num_instances = len(keypoints) + keypoints = self._convert_keypoints(keypoints) + if labels is not None: + assert len(labels) == num_instances + if assigned_colors is None: + assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + if num_instances == 0: + return self.output + if boxes is not None and boxes.shape[1] == 5: + return self.overlay_rotated_instances( + boxes=boxes, labels=labels, assigned_colors=assigned_colors + ) + + # Display in largest to smallest order to reduce occlusion. + areas = None + if boxes is not None: + areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) + elif masks is not None: + areas = np.asarray([x.area() for x in masks]) + + if areas is not None: + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] if boxes is not None else None + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None + assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] + keypoints = keypoints[sorted_idxs] if keypoints is not None else None + + for i in range(num_instances): + color = assigned_colors[i] + if boxes is not None: + self.draw_box(boxes[i], edge_color=color) + + if masks is not None: + for segment in masks[i].polygons: + self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) + + if labels is not None: + # first get a box + if boxes is not None: + x0, y0, x1, y1 = boxes[i] + text_pos = (x0, y0) # if drawing boxes, put text on the box corner. + horiz_align = "left" + elif masks is not None: + x0, y0, x1, y1 = masks[i].bbox() + + # draw text in the center (defined by median) when box is not drawn + # median is less sensitive to outliers. + text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] + horiz_align = "center" + else: + continue # drawing the box confidence for keypoints isn't very useful. 
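+                # At this point (x0, y0, x1, y1) and text_pos come from either the
+                # box or the mask's bounding box; the code below only adjusts the
+                # text placement and font size.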
+ # for small objects, draw text at the side to avoid occlusion + instance_area = (y1 - y0) * (x1 - x0) + if ( + instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale + or y1 - y0 < 40 * self.output.scale + ): + if y1 >= self.output.height - 5: + text_pos = (x1, y0) + else: + text_pos = (x0, y1) + + height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) + * 0.5 + * self._default_font_size + ) + self.draw_text( + labels[i], + text_pos, + color=lighter_color, + horizontal_alignment=horiz_align, + font_size=font_size, + ) + + # draw keypoints + if keypoints is not None: + for keypoints_per_instance in keypoints: + self.draw_and_connect_keypoints(keypoints_per_instance) + + return self.output + + def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): + """ + Args: + boxes (ndarray): an Nx5 numpy array of + (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image. + labels (list[str]): the text to be displayed for each instance. + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + + Returns: + output (VisImage): image object with visualizations. + """ + + num_instances = len(boxes) + + if assigned_colors is None: + assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + if num_instances == 0: + return self.output + + # Display in largest to smallest order to reduce occlusion. + if boxes is not None: + areas = boxes[:, 2] * boxes[:, 3] + + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + colors = [assigned_colors[idx] for idx in sorted_idxs] + + for i in range(num_instances): + self.draw_rotated_box_with_label( + boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None + ) + + return self.output + + def draw_and_connect_keypoints(self, keypoints): + """ + Draws keypoints of an instance and follows the rules for keypoint connections + to draw lines between appropriate keypoints. This follows color heuristics for + line color. + + Args: + keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints + and the last dimension corresponds to (x, y, probability). + + Returns: + output (VisImage): image object with visualizations. + """ + visible = {} + keypoint_names = self.metadata.get("keypoint_names") + for idx, keypoint in enumerate(keypoints): + # draw keypoint + x, y, prob = keypoint + if prob > _KEYPOINT_THRESHOLD: + self.draw_circle((x, y), color=_RED) + if keypoint_names: + keypoint_name = keypoint_names[idx] + visible[keypoint_name] = (x, y) + + if self.metadata.get("keypoint_connection_rules"): + for kp0, kp1, color in self.metadata.keypoint_connection_rules: + if kp0 in visible and kp1 in visible: + x0, y0 = visible[kp0] + x1, y1 = visible[kp1] + color = tuple(x / 255.0 for x in color) + self.draw_line([x0, x1], [y0, y1], color=color) + + # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip + # Note that this strategy is specific to person keypoints. 
+ # For other keypoints, it should just do nothing + try: + ls_x, ls_y = visible["left_shoulder"] + rs_x, rs_y = visible["right_shoulder"] + mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 + except KeyError: + pass + else: + # draw line from nose to mid-shoulder + nose_x, nose_y = visible.get("nose", (None, None)) + if nose_x is not None: + self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) + + try: + # draw line from mid-shoulder to mid-hip + lh_x, lh_y = visible["left_hip"] + rh_x, rh_y = visible["right_hip"] + except KeyError: + pass + else: + mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 + self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) + return self.output + + """ + Primitive drawing functions: + """ + + def draw_text( + self, + text, + position, + *, + font_size=None, + color="g", + horizontal_alignment="center", + rotation=0 + ): + """ + Args: + text (str): class label + position (tuple): a tuple of the x and y coordinates to place text on image. + font_size (int, optional): font of the text. If not provided, a font size + proportional to the image width is calculated and used. + color: color of the text. Refer to `matplotlib.colors` for full list + of formats that are accepted. + horizontal_alignment (str): see `matplotlib.text.Text` + rotation: rotation angle in degrees CCW + + Returns: + output (VisImage): image object with text drawn. + """ + if not font_size: + font_size = self._default_font_size + + # since the text background is dark, we don't want the text to be dark + color = np.maximum(list(mplc.to_rgb(color)), 0.2) + color[np.argmax(color)] = max(0.8, np.max(color)) + + x, y = position + self.output.ax.text( + x, + y, + text, + size=font_size * self.output.scale, + family="sans-serif", + bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, + verticalalignment="top", + horizontalalignment=horizontal_alignment, + color=color, + zorder=10, + rotation=rotation, + ) + return self.output + + def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"): + """ + Args: + box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 + are the coordinates of the image's top left corner. x1 and y1 are the + coordinates of the image's bottom right corner. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + + Returns: + output (VisImage): image object with box drawn. + """ + x0, y0, x1, y1 = box_coord + width = x1 - x0 + height = y1 - y0 + + linewidth = max(self._default_font_size / 4, 1) + + self.output.ax.add_patch( + mpl.patches.Rectangle( + (x0, y0), + width, + height, + fill=False, + edgecolor=edge_color, + linewidth=linewidth * self.output.scale, + alpha=alpha, + linestyle=line_style, + ) + ) + return self.output + + def draw_rotated_box_with_label( + self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None + ): + """ + Args: + rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), + where cnt_x and cnt_y are the center coordinates of the box. + w and h are the width and height of the box. angle represents how + many degrees the box is rotated CCW with regard to the 0-degree box. + alpha (float): blending efficient. Smaller values lead to more transparent masks. 
+ edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + label (string): label for rotated box. It will not be rendered when set to None. + + Returns: + output (VisImage): image object with box drawn. + """ + cnt_x, cnt_y, w, h, angle = rotated_box + area = w * h + # use thinner lines when the box is small + linewidth = self._default_font_size / ( + 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 + ) + + theta = angle * math.pi / 180.0 + c = math.cos(theta) + s = math.sin(theta) + rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] + # x: left->right ; y: top->down + rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] + for k in range(4): + j = (k + 1) % 4 + self.draw_line( + [rotated_rect[k][0], rotated_rect[j][0]], + [rotated_rect[k][1], rotated_rect[j][1]], + color=edge_color, + linestyle="--" if k == 1 else line_style, + linewidth=linewidth, + ) + + if label is not None: + text_pos = rotated_rect[1] # topleft corner + + height_ratio = h / np.sqrt(self.output.height * self.output.width) + label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size + ) + self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) + + return self.output + + def draw_circle(self, circle_coord, color, radius=3): + """ + Args: + circle_coord (list(int) or tuple(int)): contains the x and y coordinates + of the center of the circle. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + radius (int): radius of the circle. + + Returns: + output (VisImage): image object with box drawn. + """ + x, y = circle_coord + self.output.ax.add_patch( + mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) + ) + return self.output + + def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): + """ + Args: + x_data (list[int]): a list containing x values of all the points being drawn. + Length of list should match the length of y_data. + y_data (list[int]): a list containing y values of all the points being drawn. + Length of list should match the length of x_data. + color: color of the line. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + linestyle: style of the line. Refer to `matplotlib.lines.Line2D` + for a full list of formats that are accepted. + linewidth (float or None): width of the line. When it's None, + a default value will be computed and used. + + Returns: + output (VisImage): image object with line drawn. + """ + if linewidth is None: + linewidth = self._default_font_size / 3 + linewidth = max(linewidth, 1) + self.output.ax.add_line( + mpl.lines.Line2D( + x_data, + y_data, + linewidth=linewidth * self.output.scale, + color=color, + linestyle=linestyle, + ) + ) + return self.output + + def draw_binary_mask( + self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=4096 + ): + """ + Args: + binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and + W is the image width. Each value in the array is either a 0 or 1 value of uint8 + type. + color: color of the mask. Refer to `matplotlib.colors` for a full list of + formats that are accepted. If None, will pick a random color. 
+ edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. + text (str): if None, will be drawn in the object's center of mass. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + area_threshold (float): a connected component small than this will not be shown. + + Returns: + output (VisImage): image object with mask drawn. + """ + if color is None: + color = random_color(rgb=True, maximum=1) + if area_threshold is None: + area_threshold = 4096 + + has_valid_segment = False + binary_mask = binary_mask.astype("uint8") # opencv needs uint8 + mask = GenericMask(binary_mask, self.output.height, self.output.width) + shape2d = (binary_mask.shape[0], binary_mask.shape[1]) + + if not mask.has_holes: + # draw polygons for regular masks + for segment in mask.polygons: + area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) + if area < area_threshold: + continue + has_valid_segment = True + segment = segment.reshape(-1, 2) + self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) + else: + rgba = np.zeros(shape2d + (4,), dtype="float32") + rgba[:, :, :3] = color + rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha + has_valid_segment = True + self.output.ax.imshow(rgba) + + if text is not None and has_valid_segment: + # TODO sometimes drawn on wrong objects. the heuristics here can improve. + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) + largest_component_id = np.argmax(stats[1:, -1]) + 1 + + # draw text on the largest component, as well as other very large components. + for cid in range(1, _num_cc): + if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: + # median is more stable than centroid + # center = centroids[largest_component_id] + center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] + self.draw_text(text, center, color=lighter_color) + return self.output + + def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): + """ + Args: + segment: numpy array of shape Nx2, containing all the points in the polygon. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. If not provided, a darker shade + of the polygon color will be used instead. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + + Returns: + output (VisImage): image object with polygon drawn. + """ + if edge_color is None: + # make edge color darker than the polygon color + if alpha > 0.8: + edge_color = self._change_color_brightness(color, brightness_factor=-0.7) + else: + edge_color = color + edge_color = mplc.to_rgb(edge_color) + (1,) + + polygon = mpl.patches.Polygon( + segment, + fill=True, + facecolor=mplc.to_rgb(color) + (alpha,), + edgecolor=edge_color, + linewidth=max(self._default_font_size // 15 * self.output.scale, 1), + ) + self.output.ax.add_patch(polygon) + return self.output + + """ + Internal methods: + """ + + def _jitter(self, color): + """ + Randomly modifies given color to produce a slightly different color than the color given. + + Args: + color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color + picked. The values in the list are in the [0.0, 1.0] range. 
+ + Returns: + jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the + color after being jittered. The values in the list are in the [0.0, 1.0] range. + """ + color = mplc.to_rgb(color) + vec = np.random.rand(3) + # better to do it in another color space + vec = vec / np.linalg.norm(vec) * 0.5 + res = np.clip(vec + color, 0, 1) + return tuple(res) + + def _create_grayscale_image(self, mask=None): + """ + Create a grayscale version of the original image. + The colors in masked area, if given, will be kept. + """ + img_bw = self.img.astype("f4").mean(axis=2) + img_bw = np.stack([img_bw] * 3, axis=2) + if mask is not None: + img_bw[mask] = self.img[mask] + return img_bw + + def _change_color_brightness(self, color, brightness_factor): + """ + Depending on the brightness_factor, gives a lighter or darker color i.e. a color with + less or more saturation than the original color. + + Args: + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of + 0 will correspond to no change, a factor in [-1.0, 0) range will result in + a darker color and a factor in (0, 1.0] range will result in a lighter color. + + Returns: + modified_color (tuple[double]): a tuple containing the RGB values of the + modified color. Each value in the tuple is in the [0.0, 1.0] range. + """ + assert brightness_factor >= -1.0 and brightness_factor <= 1.0 + color = mplc.to_rgb(color) + polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) + modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) + modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness + modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness + modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) + return modified_color + + def _convert_boxes(self, boxes): + """ + Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. + """ + if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): + return boxes.tensor.numpy() + else: + return np.asarray(boxes) + + def _convert_masks(self, masks_or_polygons): + """ + Convert different format of masks or polygons to a tuple of masks and polygons. + + Returns: + list[GenericMask]: + """ + + m = masks_or_polygons + if isinstance(m, PolygonMasks): + m = m.polygons + if isinstance(m, BitMasks): + m = m.tensor.numpy() + if isinstance(m, torch.Tensor): + m = m.numpy() + ret = [] + for x in m: + if isinstance(x, GenericMask): + ret.append(x) + else: + ret.append(GenericMask(x, self.output.height, self.output.width)) + return ret + + def _convert_keypoints(self, keypoints): + if isinstance(keypoints, Keypoints): + keypoints = keypoints.tensor + keypoints = np.asarray(keypoints) + return keypoints + + def get_output(self): + """ + Returns: + output (VisImage): the image output containing the visualizations added + to the image. + """ + return self.output diff --git a/dev/README.md b/dev/README.md new file mode 100644 index 0000000000000000000000000000000000000000..cc0d3297b2d436f279c3546c16c86f296402f6c5 --- /dev/null +++ b/dev/README.md @@ -0,0 +1,7 @@ + +## Some scripts for developers to use, include: + +- `linter.sh`: lint the codebase before commit +- `run_{inference,instant}_tests.sh`: run inference/training for a few iterations. + Note that these tests require 2 GPUs. +- `parse_results.sh`: parse results from a log file. 
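As a quick illustration of the `Visualizer.overlay_instances` API added in `detectron2/utils/visualizer.py` above, here is a minimal, self-contained usage sketch. It is not part of the patch itself; the canvas size, the box coordinates, the labels, and the `coco_2017_val` metadata key are illustrative assumptions:

```python
# Minimal sketch: draw two hypothetical detections with Visualizer.overlay_instances.
import numpy as np

from detectron2.data import MetadataCatalog
from detectron2.utils.visualizer import Visualizer

# A white 480x640 RGB canvas stands in for a real image (Visualizer expects RGB).
img = np.full((480, 640, 3), 255, dtype=np.uint8)

# Builtin COCO metadata supplies class names and colors; any Metadata object works.
metadata = MetadataCatalog.get("coco_2017_val")

vis = Visualizer(img, metadata, scale=1.0)
out = vis.overlay_instances(
    boxes=np.array([[50.0, 60.0, 200.0, 220.0], [300.0, 100.0, 500.0, 400.0]]),  # XYXY_ABS
    labels=["person 97%", "dog 88%"],
    alpha=0.5,
)
out.save("overlay_demo.png")  # or: out.get_image() for the rendered RGB ndarray
```

Boxes follow the XYXY_ABS convention documented above; masks, keypoints, and per-instance colors can be passed through the same call.
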
diff --git a/dev/linter.sh b/dev/linter.sh new file mode 100755 index 0000000000000000000000000000000000000000..fd7081dbc27b85e5323d25085fb79c7ee3b54e4a --- /dev/null +++ b/dev/linter.sh @@ -0,0 +1,46 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# Run this script at project root by "./dev/linter.sh" before you commit + +vergte() { + [ "$2" = "$(echo -e "$1\\n$2" | sort -V | head -n1)" ] +} + +{ + black --version | grep -E "(19.3b0.*6733274)|(19.3b0\\+8)" > /dev/null +} || { + echo "Linter requires 'black @ git+https://github.com/psf/black@673327449f86fce558adde153bb6cbe54bfebad2' !" + exit 1 +} + +ISORT_TARGET_VERSION="4.3.21" +ISORT_VERSION=$(isort -v | grep VERSION | awk '{print $2}') +vergte "$ISORT_VERSION" "$ISORT_TARGET_VERSION" || { + echo "Linter requires isort>=${ISORT_TARGET_VERSION} !" + exit 1 +} + +set -v + +echo "Running isort ..." +isort -y -sp . --atomic + +echo "Running black ..." +black -l 100 . + +echo "Running flake8 ..." +if [ -x "$(command -v flake8-3)" ]; then + flake8-3 . +else + python3 -m flake8 . +fi + +# echo "Running mypy ..." +# Pytorch does not have enough type annotations +# mypy detectron2/solver detectron2/structures detectron2/config + +echo "Running clang-format ..." +find . -regex ".*\.\(cpp\|c\|cc\|cu\|cxx\|h\|hh\|hpp\|hxx\|tcc\|mm\|m\)" -print0 | xargs -0 clang-format -i + +command -v arc > /dev/null && arc lint diff --git a/dev/packaging/README.md b/dev/packaging/README.md new file mode 100644 index 0000000000000000000000000000000000000000..095684fcc1c5593805158c81aa0168263eb57ced --- /dev/null +++ b/dev/packaging/README.md @@ -0,0 +1,17 @@ + +## To build a cu101 wheel for release: + +``` +$ nvidia-docker run -it --storage-opt "size=20GB" --name pt pytorch/manylinux-cuda101 +# inside the container: +# git clone https://github.com/facebookresearch/detectron2/ +# cd detectron2 +# export CU_VERSION=cu101 D2_VERSION_SUFFIX= PYTHON_VERSION=3.7 PYTORCH_VERSION=1.4 +# ./dev/packaging/build_wheel.sh +``` + +## To build all wheels for `CUDA {9.2,10.0,10.1}` x `Python {3.6,3.7,3.8}`: +``` +./dev/packaging/build_all_wheels.sh +./dev/packaging/gen_wheel_index.sh /path/to/wheels +``` diff --git a/dev/packaging/build_all_wheels.sh b/dev/packaging/build_all_wheels.sh new file mode 100755 index 0000000000000000000000000000000000000000..eb64dea70cda26f5d101c414af43645ef7e3a349 --- /dev/null +++ b/dev/packaging/build_all_wheels.sh @@ -0,0 +1,57 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +PYTORCH_VERSION=1.5 + +build_for_one_cuda() { + cu=$1 + + case "$cu" in + cu*) + container_name=manylinux-cuda${cu/cu/} + ;; + cpu) + container_name=manylinux-cuda101 + ;; + *) + echo "Unrecognized cu=$cu" + exit 1 + ;; + esac + + echo "Launching container $container_name ..." + + for py in 3.6 3.7 3.8; do + docker run -itd \ + --name $container_name \ + --mount type=bind,source="$(pwd)",target=/detectron2 \ + pytorch/$container_name + + cat </dev/null 2>&1 && pwd )" +. "$script_dir/pkg_helpers.bash" + +echo "Build Settings:" +echo "CU_VERSION: $CU_VERSION" # e.g. cu101 +echo "D2_VERSION_SUFFIX: $D2_VERSION_SUFFIX" # e.g. +cu101 or "" +echo "PYTHON_VERSION: $PYTHON_VERSION" # e.g. 3.6 +echo "PYTORCH_VERSION: $PYTORCH_VERSION" # e.g. 
1.4 + +setup_cuda +setup_wheel_python +yum install ninja-build -y && ln -sv /usr/bin/ninja-build /usr/bin/ninja + +export TORCH_VERSION_SUFFIX="+$CU_VERSION" +if [[ "$CU_VERSION" == "cu102" ]]; then + export TORCH_VERSION_SUFFIX="" +fi +pip_install pip numpy -U +pip_install "torch==$PYTORCH_VERSION$TORCH_VERSION_SUFFIX" \ + -f https://download.pytorch.org/whl/$CU_VERSION/torch_stable.html + +# use separate directories to allow parallel build +BASE_BUILD_DIR=build/$CU_VERSION/$PYTHON_VERSION +python setup.py \ + build -b $BASE_BUILD_DIR \ + bdist_wheel -b $BASE_BUILD_DIR/build_dist -d wheels/$CU_VERSION diff --git a/dev/packaging/gen_wheel_index.sh b/dev/packaging/gen_wheel_index.sh new file mode 100755 index 0000000000000000000000000000000000000000..44d6041cdf45afdd39a85d413f08373e8516999b --- /dev/null +++ b/dev/packaging/gen_wheel_index.sh @@ -0,0 +1,27 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + + +root=$1 +if [[ -z "$root" ]]; then + echo "Usage: ./gen_wheel_index.sh /path/to/wheels" + exit +fi + +index=$root/index.html + +cd "$root" +for cu in cpu cu92 cu100 cu101 cu102; do + cd $cu + echo "Creating $PWD/index.html ..." + for whl in *.whl; do + echo "$whl
" + done > index.html + cd "$root" +done + +echo "Creating $index ..." +for whl in $(find . -type f -name '*.whl' -printf '%P\n' | sort); do + echo "$whl
" +done > "$index" + diff --git a/dev/packaging/pkg_helpers.bash b/dev/packaging/pkg_helpers.bash new file mode 100755 index 0000000000000000000000000000000000000000..51e6185c7fba6ba0f7a325c467993196f1c9b4ef --- /dev/null +++ b/dev/packaging/pkg_helpers.bash @@ -0,0 +1,57 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# Function to retry functions that sometimes timeout or have flaky failures +retry () { + $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) +} +# Install with pip a bit more robustly than the default +pip_install() { + retry pip install --progress-bar off "$@" +} + + +setup_cuda() { + # Now work out the CUDA settings + # Like other torch domain libraries, we choose common GPU architectures only. + export FORCE_CUDA=1 + case "$CU_VERSION" in + cu102) + export CUDA_HOME=/usr/local/cuda-10.2/ + export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX;7.5+PTX" + ;; + cu101) + export CUDA_HOME=/usr/local/cuda-10.1/ + export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX;7.5+PTX" + ;; + cu100) + export CUDA_HOME=/usr/local/cuda-10.0/ + export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX;7.5+PTX" + ;; + cu92) + export CUDA_HOME=/usr/local/cuda-9.2/ + export TORCH_CUDA_ARCH_LIST="3.5;3.7;5.0;5.2;6.0+PTX;6.1+PTX;7.0+PTX" + ;; + cpu) + unset FORCE_CUDA + export CUDA_VISIBLE_DEVICES= + ;; + *) + echo "Unrecognized CU_VERSION=$CU_VERSION" + exit 1 + ;; + esac +} + +setup_wheel_python() { + case "$PYTHON_VERSION" in + 3.6) python_abi=cp36-cp36m ;; + 3.7) python_abi=cp37-cp37m ;; + 3.8) python_abi=cp38-cp38 ;; + *) + echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION" + exit 1 + ;; + esac + export PATH="/opt/python/$python_abi/bin:$PATH" +} diff --git a/dev/parse_results.sh b/dev/parse_results.sh new file mode 100755 index 0000000000000000000000000000000000000000..7f0243635402c741fa7e8145bfed2593d6a2dc34 --- /dev/null +++ b/dev/parse_results.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# A shell script that parses metrics from the log file. +# Make it easier for developers to track performance of models. + +LOG="$1" + +if [[ -z "$LOG" ]]; then + echo "Usage: $0 /path/to/log/file" + exit 1 +fi + +# [12/15 11:47:32] trainer INFO: Total training time: 12:15:04.446477 (0.4900 s / it) +# [12/15 11:49:03] inference INFO: Total inference time: 0:01:25.326167 (0.13652186737060548 s / img per device, on 8 devices) +# [12/15 11:49:03] inference INFO: Total inference pure compute time: ..... 
+ +# training time +trainspeed=$(grep -o 'Overall training.*' "$LOG" | grep -Eo '\(.*\)' | grep -o '[0-9\.]*') +echo "Training speed: $trainspeed s/it" + +# inference time: there could be multiple inference during training +inferencespeed=$(grep -o 'Total inference pure.*' "$LOG" | tail -n1 | grep -Eo '\(.*\)' | grep -o '[0-9\.]*' | head -n1) +echo "Inference speed: $inferencespeed s/it" + +# [12/15 11:47:18] trainer INFO: eta: 0:00:00 iter: 90000 loss: 0.5407 (0.7256) loss_classifier: 0.1744 (0.2446) loss_box_reg: 0.0838 (0.1160) loss_mask: 0.2159 (0.2722) loss_objectness: 0.0244 (0.0429) loss_rpn_box_reg: 0.0279 (0.0500) time: 0.4487 (0.4899) data: 0.0076 (0.0975) lr: 0.000200 max mem: 4161 +memory=$(grep -o 'max[_ ]mem: [0-9]*' "$LOG" | tail -n1 | grep -o '[0-9]*') +echo "Training memory: $memory MB" + +echo "Easy to copypaste:" +echo "$trainspeed","$inferencespeed","$memory" + +echo "------------------------------" + +# [12/26 17:26:32] engine.coco_evaluation: copypaste: Task: bbox +# [12/26 17:26:32] engine.coco_evaluation: copypaste: AP,AP50,AP75,APs,APm,APl +# [12/26 17:26:32] engine.coco_evaluation: copypaste: 0.0017,0.0024,0.0017,0.0005,0.0019,0.0011 +# [12/26 17:26:32] engine.coco_evaluation: copypaste: Task: segm +# [12/26 17:26:32] engine.coco_evaluation: copypaste: AP,AP50,AP75,APs,APm,APl +# [12/26 17:26:32] engine.coco_evaluation: copypaste: 0.0014,0.0021,0.0016,0.0005,0.0016,0.0011 + +echo "COCO Results:" +num_tasks=$(grep -o 'copypaste:.*Task.*' "$LOG" | sort -u | wc -l) +# each task has 3 lines +grep -o 'copypaste:.*' "$LOG" | cut -d ' ' -f 2- | tail -n $((num_tasks * 3)) diff --git a/dev/run_inference_tests.sh b/dev/run_inference_tests.sh new file mode 100755 index 0000000000000000000000000000000000000000..17e422d576e5fe9efcd85790954c569c962657d6 --- /dev/null +++ b/dev/run_inference_tests.sh @@ -0,0 +1,44 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +BIN="python tools/train_net.py" +OUTPUT="inference_test_output" +NUM_GPUS=2 + +CFG_LIST=( "${@:1}" ) + +if [ ${#CFG_LIST[@]} -eq 0 ]; then + CFG_LIST=( ./configs/quick_schedules/*inference_acc_test.yaml ) +fi + +echo "========================================================================" +echo "Configs to run:" +echo "${CFG_LIST[@]}" +echo "========================================================================" + + +for cfg in "${CFG_LIST[@]}"; do + echo "========================================================================" + echo "Running $cfg ..." + echo "========================================================================" + $BIN \ + --eval-only \ + --num-gpus $NUM_GPUS \ + --config-file "$cfg" \ + OUTPUT_DIR $OUTPUT + rm -rf $OUTPUT +done + + +echo "========================================================================" +echo "Running demo.py ..." +echo "========================================================================" +DEMO_BIN="python demo/demo.py" +COCO_DIR=datasets/coco/val2014 +mkdir -pv $OUTPUT + +set -v + +$DEMO_BIN --config-file ./configs/quick_schedules/panoptic_fpn_R_50_inference_acc_test.yaml \ + --input $COCO_DIR/COCO_val2014_0000001933* --output $OUTPUT +rm -rf $OUTPUT diff --git a/dev/run_instant_tests.sh b/dev/run_instant_tests.sh new file mode 100755 index 0000000000000000000000000000000000000000..2c51de649262e7371fb173210c8edc377e8177e0 --- /dev/null +++ b/dev/run_instant_tests.sh @@ -0,0 +1,27 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +BIN="python tools/train_net.py" +OUTPUT="instant_test_output" +NUM_GPUS=2 + +CFG_LIST=( "${@:1}" ) +if [ ${#CFG_LIST[@]} -eq 0 ]; then + CFG_LIST=( ./configs/quick_schedules/*instant_test.yaml ) +fi + +echo "========================================================================" +echo "Configs to run:" +echo "${CFG_LIST[@]}" +echo "========================================================================" + +for cfg in "${CFG_LIST[@]}"; do + echo "========================================================================" + echo "Running $cfg ..." + echo "========================================================================" + $BIN --num-gpus $NUM_GPUS --config-file "$cfg" \ + SOLVER.IMS_PER_BATCH $(($NUM_GPUS * 2)) \ + OUTPUT_DIR "$OUTPUT" + rm -rf "$OUTPUT" +done + diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..2a8603903e36eafb3a61fac0a086a919cc67fe38 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,49 @@ +FROM nvidia/cuda:10.1-cudnn7-devel + +ENV DEBIAN_FRONTEND noninteractive +RUN apt-get update && apt-get install -y \ + python3-opencv ca-certificates python3-dev git wget sudo \ + cmake ninja-build protobuf-compiler libprotobuf-dev && \ + rm -rf /var/lib/apt/lists/* +RUN ln -sv /usr/bin/python3 /usr/bin/python + +# create a non-root user +ARG USER_ID=1000 +RUN useradd -m --no-log-init --system --uid ${USER_ID} appuser -g sudo +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +USER appuser +WORKDIR /home/appuser + +ENV PATH="/home/appuser/.local/bin:${PATH}" +RUN wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py --user && \ + rm get-pip.py + +# install dependencies +# See https://pytorch.org/ for other options if you use a different version of CUDA +RUN pip install --user tensorboard cython +RUN pip install --user torch==1.5+cu101 torchvision==0.6+cu101 -f https://download.pytorch.org/whl/torch_stable.html +RUN pip install --user 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' + +RUN pip install --user 'git+https://github.com/facebookresearch/fvcore' +# install detectron2 +RUN git clone https://github.com/facebookresearch/detectron2 detectron2_repo +# set FORCE_CUDA because during `docker build` cuda is not accessible +ENV FORCE_CUDA="1" +# This will by default build detectron2 for all common cuda architectures and take a lot more time, +# because inside `docker build`, there is no way to tell which architecture will be used. +ARG TORCH_CUDA_ARCH_LIST="Kepler;Kepler+Tesla;Maxwell;Maxwell+Tegra;Pascal;Volta;Turing" +ENV TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST}" + +RUN pip install --user -e detectron2_repo + +# Set a fixed model cache directory. 
+ENV FVCORE_CACHE="/tmp" +WORKDIR /home/appuser/detectron2_repo + +# run detectron2 under user "appuser": +# wget http://images.cocodataset.org/val2017/000000439715.jpg -O input.jpg +# python3 demo/demo.py \ + #--config-file configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \ + #--input input.jpg --output outputs/ \ + #--opts MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl diff --git a/docker/Dockerfile-circleci b/docker/Dockerfile-circleci new file mode 100644 index 0000000000000000000000000000000000000000..bc0be845adc247eb458d212ae5352c594cd80a72 --- /dev/null +++ b/docker/Dockerfile-circleci @@ -0,0 +1,17 @@ +FROM nvidia/cuda:10.1-cudnn7-devel +# This dockerfile only aims to provide an environment for unittest on CircleCI + +ENV DEBIAN_FRONTEND noninteractive +RUN apt-get update && apt-get install -y \ + python3-opencv ca-certificates python3-dev git wget sudo ninja-build && \ + rm -rf /var/lib/apt/lists/* + +RUN wget -q https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py && \ + rm get-pip.py + +# install dependencies +# See https://pytorch.org/ for other options if you use a different version of CUDA +RUN pip install tensorboard cython +RUN pip install torch==1.5+cu101 torchvision==0.6+cu101 -f https://download.pytorch.org/whl/torch_stable.html +RUN pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI' diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..760c4054d0e4fa56a67ab4b59c14979498e2f94a --- /dev/null +++ b/docker/README.md @@ -0,0 +1,36 @@ + +## Use the container (with docker ≥ 19.03) + +``` +cd docker/ +# Build: +docker build --build-arg USER_ID=$UID -t detectron2:v0 . +# Run: +docker run --gpus all -it \ + --shm-size=8gb --env="DISPLAY" --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \ + --name=detectron2 detectron2:v0 + +# Grant docker access to host X server to show images +xhost +local:`docker inspect --format='{{ .Config.Hostname }}' detectron2` +``` + +## Use the container (with docker < 19.03) + +Install docker-compose and nvidia-docker2, then run: +``` +cd docker && USER_ID=$UID docker-compose run detectron2 +``` + +#### Using a persistent cache directory + +You can prevent models from being re-downloaded on every run, +by storing them in a cache directory. + +To do this, add `--volume=$HOME/.torch/fvcore_cache:/tmp:rw` in the run command. + +## Install new dependencies +Add the following to `Dockerfile` to make persistent changes. +``` +RUN sudo apt-get update && sudo apt-get install -y vim +``` +Or run them in the container to make temporary changes. diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000000000000000000000000000000000000..e660f44645a5cc164cd5a59f2cdcf7e1ded60c2e --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,18 @@ +version: "2.3" +services: + detectron2: + build: + context: . + dockerfile: Dockerfile + args: + USER_ID: ${USER_ID:-1000} + runtime: nvidia # TODO: Exchange with "gpu: all" in the future (see https://github.com/facebookresearch/detectron2/pull/197/commits/00545e1f376918db4a8ce264d427a07c1e896c5a). 
+ shm_size: "8gb" + ulimits: + memlock: -1 + stack: 67108864 + volumes: + - /tmp/.X11-unix:/tmp/.X11-unix:ro + environment: + - DISPLAY=$DISPLAY + - NVIDIA_VISIBLE_DEVICES=all diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..e35d8850c9688b1ce82711694692cc574a799396 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +_build diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d537643dd411736a5f309383cfef52ea7d5e4599 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2c65c3676b488f3654b7e3231e1cfd06df48d4be --- /dev/null +++ b/docs/README.md @@ -0,0 +1,16 @@ +# Read the docs: + +The latest documentation built from this directory is available at [detectron2.readthedocs.io](https://detectron2.readthedocs.io/). +Documents in this directory are not meant to be read on github. + +# Build the docs: + +1. Install detectron2 according to [INSTALL.md](INSTALL.md). +2. Install additional libraries required to build docs: + - docutils==0.16 + - Sphinx==3.0.0 + - recommonmark==0.6.0 + - sphinx_rtd_theme + - mock + +3. Run `make html` from this directory. diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..44e9f2b4db549a3a5ef1420b27d408915e86657c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,335 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +# flake8: noqa + +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import sys +import mock +from sphinx.domains import Domain +from typing import Dict, List, Tuple + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +import sphinx_rtd_theme + + +class GithubURLDomain(Domain): + """ + Resolve certain links in markdown files to github source. 
+ """ + + name = "githuburl" + ROOT = "https://github.com/facebookresearch/detectron2/blob/master/" + LINKED_DOC = ["tutorials/install", "tutorials/getting_started"] + + def resolve_any_xref(self, env, fromdocname, builder, target, node, contnode): + github_url = None + if not target.endswith("html") and target.startswith("../../"): + url = target.replace("../", "") + github_url = url + if fromdocname in self.LINKED_DOC: + # unresolved links in these docs are all github links + github_url = target + + if github_url is not None: + if github_url.endswith("MODEL_ZOO") or github_url.endswith("README"): + # bug of recommonmark. + # https://github.com/readthedocs/recommonmark/blob/ddd56e7717e9745f11300059e4268e204138a6b1/recommonmark/parser.py#L152-L155 + github_url += ".md" + print("Ref {} resolved to github:{}".format(target, github_url)) + contnode["refuri"] = self.ROOT + github_url + return [("githuburl:any", contnode)] + else: + return [] + + +# to support markdown +from recommonmark.parser import CommonMarkParser + +sys.path.insert(0, os.path.abspath("../")) +os.environ["DOC_BUILDING"] = "True" +DEPLOY = os.environ.get("READTHEDOCS") == "True" + + +# -- Project information ----------------------------------------------------- + +# fmt: off +try: + import torch # noqa +except ImportError: + for m in [ + "torch", "torchvision", "torch.nn", "torch.nn.parallel", "torch.distributed", "torch.multiprocessing", "torch.autograd", + "torch.autograd.function", "torch.nn.modules", "torch.nn.modules.utils", "torch.utils", "torch.utils.data", "torch.onnx", + "torchvision", "torchvision.ops", + ]: + sys.modules[m] = mock.Mock(name=m) + sys.modules['torch'].__version__ = "1.5" # fake version + +for m in [ + "cv2", "scipy", "portalocker", "detectron2._C", + "pycocotools", "pycocotools.mask", "pycocotools.coco", "pycocotools.cocoeval", + "google", "google.protobuf", "google.protobuf.internal", "onnx", + "caffe2", "caffe2.proto", "caffe2.python", "caffe2.python.utils", "caffe2.python.onnx", "caffe2.python.onnx.backend", +]: + sys.modules[m] = mock.Mock(name=m) +# fmt: on +sys.modules["cv2"].__version__ = "3.4" + +import detectron2 # isort: skip + + +project = "detectron2" +copyright = "2019-2020, detectron2 contributors" +author = "detectron2 contributors" + +# The short X.Y version +version = detectron2.__version__ +# The full version, including alpha/beta/rc tags +release = version + + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +needs_sphinx = "3.0" + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + "recommonmark", + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.intersphinx", + "sphinx.ext.todo", + "sphinx.ext.coverage", + "sphinx.ext.mathjax", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", +] + +# -- Configurations for plugins ------------ +napoleon_google_docstring = True +napoleon_include_init_with_doc = True +napoleon_include_special_with_doc = True +napoleon_numpy_docstring = False +napoleon_use_rtype = False +autodoc_inherit_docstrings = False +autodoc_member_order = "bysource" + +if DEPLOY: + intersphinx_timeout = 10 +else: + # skip this when building locally + intersphinx_timeout = 0.1 +intersphinx_mapping = { + "python": ("https://docs.python.org/3.6", None), + "numpy": ("https://docs.scipy.org/doc/numpy/", None), + "torch": ("https://pytorch.org/docs/master/", None), +} +# ------------------------- + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +source_suffix = [".rst", ".md"] + +# The master toctree document. +master_doc = "index" + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "build", "README.md", "tutorials/README.md"] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = "sphinx" + + +# -- Options for HTML output ------------------------------------------------- + +html_theme = "sphinx_rtd_theme" +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ["_static"] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = "detectron2doc" + + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). 
+latex_documents = [ + (master_doc, "detectron2.tex", "detectron2 Documentation", "detectron2 contributors", "manual") +] + + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(master_doc, "detectron2", "detectron2 Documentation", [author], 1)] + + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + ( + master_doc, + "detectron2", + "detectron2 Documentation", + author, + "detectron2", + "One line description of project.", + "Miscellaneous", + ) +] + + +# -- Options for todo extension ---------------------------------------------- + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = True + + +_DEPRECATED_NAMES = set() + + +def autodoc_skip_member(app, what, name, obj, skip, options): + # we hide something deliberately + if getattr(obj, "__HIDE_SPHINX_DOC__", False): + return True + # Hide some names that are deprecated or not intended to be used + if name in _DEPRECATED_NAMES: + return True + return None + + +_PAPER_DATA = { + "resnet": ("1512.03385", "Deep Residual Learning for Image Recognition"), + "fpn": ("1612.03144", "Feature Pyramid Networks for Object Detection"), + "mask r-cnn": ("1703.06870", "Mask R-CNN"), + "faster r-cnn": ( + "1506.01497", + "Faster R-CNN: Towards Real-Time Object Detection with Region Proposal Networks", + ), + "deformconv": ("1703.06211", "Deformable Convolutional Networks"), + "deformconv2": ("1811.11168", "Deformable ConvNets v2: More Deformable, Better Results"), + "panopticfpn": ("1901.02446", "Panoptic Feature Pyramid Networks"), + "retinanet": ("1708.02002", "Focal Loss for Dense Object Detection"), + "cascade r-cnn": ("1712.00726", "Cascade R-CNN: Delving into High Quality Object Detection"), + "lvis": ("1908.03195", "LVIS: A Dataset for Large Vocabulary Instance Segmentation"), + "rrpn": ("1703.01086", "Arbitrary-Oriented Scene Text Detection via Rotation Proposals"), + "in1k1h": ("1706.02677", "Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour"), +} + + +def paper_ref_role( + typ: str, + rawtext: str, + text: str, + lineno: int, + inliner, + options: Dict = {}, + content: List[str] = [], +): + """ + Parse :paper:`xxx`. Similar to the "extlinks" sphinx extension. 
+ """ + from docutils import nodes, utils + from sphinx.util.nodes import split_explicit_title + + text = utils.unescape(text) + has_explicit_title, title, link = split_explicit_title(text) + link = link.lower() + if link not in _PAPER_DATA: + inliner.reporter.warning("Cannot find paper " + link) + paper_url, paper_title = "#", link + else: + paper_url, paper_title = _PAPER_DATA[link] + if "/" not in paper_url: + paper_url = "https://arxiv.org/abs/" + paper_url + if not has_explicit_title: + title = paper_title + pnode = nodes.reference(title, title, internal=False, refuri=paper_url) + return [pnode], [] + + +def setup(app): + from recommonmark.transform import AutoStructify + + app.add_domain(GithubURLDomain) + app.connect("autodoc-skip-member", autodoc_skip_member) + app.add_role("paper", paper_ref_role) + app.add_config_value( + "recommonmark_config", + {"enable_math": True, "enable_inline_math": True, "enable_eval_rst": True}, + True, + ) + app.add_transform(AutoStructify) diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..8634b7b12ab906c10a78d6053428029799282ffd --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,14 @@ +.. detectron2 documentation master file, created by + sphinx-quickstart on Sat Sep 21 13:46:45 2019. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to detectron2's documentation! +====================================== + +.. toctree:: + :maxdepth: 2 + + tutorials/index + notes/index + modules/index diff --git a/docs/modules/checkpoint.rst b/docs/modules/checkpoint.rst new file mode 100644 index 0000000000000000000000000000000000000000..616cb186c40212d7a0ca311d21691245b2fce996 --- /dev/null +++ b/docs/modules/checkpoint.rst @@ -0,0 +1,7 @@ +detectron2.checkpoint package +============================= + +.. automodule:: detectron2.checkpoint + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/config.rst b/docs/modules/config.rst new file mode 100644 index 0000000000000000000000000000000000000000..034bd5f5e8a79d9eb2109f86b7aa12eea9c8b786 --- /dev/null +++ b/docs/modules/config.rst @@ -0,0 +1,17 @@ +detectron2.config package +========================= + +.. automodule:: detectron2.config + :members: + :undoc-members: + :show-inheritance: + :inherited-members: + + +Config References +----------------- + +.. literalinclude:: ../../detectron2/config/defaults.py + :language: python + :linenos: + :lines: 4- diff --git a/docs/modules/data.rst b/docs/modules/data.rst new file mode 100644 index 0000000000000000000000000000000000000000..3697f0e22f3351a68ee40e4cadbd3ee6d978af8d --- /dev/null +++ b/docs/modules/data.rst @@ -0,0 +1,40 @@ +detectron2.data package +======================= + +.. automodule:: detectron2.data + :members: + :undoc-members: + :show-inheritance: + +detectron2.data.detection\_utils module +--------------------------------------- + +.. automodule:: detectron2.data.detection_utils + :members: + :undoc-members: + :show-inheritance: + +detectron2.data.datasets module +--------------------------------------- + +.. automodule:: detectron2.data.datasets + :members: + :undoc-members: + :show-inheritance: + +detectron2.data.samplers module +--------------------------------------- + +.. automodule:: detectron2.data.samplers + :members: + :undoc-members: + :show-inheritance: + + +detectron2.data.transforms module +--------------------------------------- + +.. 
automodule:: detectron2.data.transforms + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/engine.rst b/docs/modules/engine.rst new file mode 100644 index 0000000000000000000000000000000000000000..bb8b533aee225b1096fe4353b03533208f92732e --- /dev/null +++ b/docs/modules/engine.rst @@ -0,0 +1,25 @@ +detectron2.engine package +========================= + + +.. automodule:: detectron2.engine + :members: + :undoc-members: + :show-inheritance: + + +detectron2.engine.defaults module +--------------------------------- + +.. automodule:: detectron2.engine.defaults + :members: + :undoc-members: + :show-inheritance: + +detectron2.engine.hooks module +--------------------------------- + +.. automodule:: detectron2.engine.hooks + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/evaluation.rst b/docs/modules/evaluation.rst new file mode 100644 index 0000000000000000000000000000000000000000..d9d34ff1a21c42b33ce2ad8b4415052af194397f --- /dev/null +++ b/docs/modules/evaluation.rst @@ -0,0 +1,7 @@ +detectron2.evaluation package +============================= + +.. automodule:: detectron2.evaluation + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/export.rst b/docs/modules/export.rst new file mode 100644 index 0000000000000000000000000000000000000000..bb7c3c9173cae323e67cb9330b292fefc40ec760 --- /dev/null +++ b/docs/modules/export.rst @@ -0,0 +1,7 @@ +detectron2.export package +========================= + +.. automodule:: detectron2.export + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/index.rst b/docs/modules/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1b246f570070b4f8ef47d00968498d49f0310a6e --- /dev/null +++ b/docs/modules/index.rst @@ -0,0 +1,17 @@ +API Documentation +================== + +.. toctree:: + + checkpoint + config + data + engine + evaluation + layers + model_zoo + modeling + solver + structures + utils + export diff --git a/docs/modules/layers.rst b/docs/modules/layers.rst new file mode 100644 index 0000000000000000000000000000000000000000..6aeb5213a4b27edeb7c0b2bdb816fd1af8d22ce4 --- /dev/null +++ b/docs/modules/layers.rst @@ -0,0 +1,7 @@ +detectron2.layers package +========================= + +.. automodule:: detectron2.layers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/model_zoo.rst b/docs/modules/model_zoo.rst new file mode 100644 index 0000000000000000000000000000000000000000..8b1c7d598f509db2361928aac1be4f25854d9f93 --- /dev/null +++ b/docs/modules/model_zoo.rst @@ -0,0 +1,7 @@ +detectron2.model_zoo package +============================ + +.. automodule:: detectron2.model_zoo + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/modeling.rst b/docs/modules/modeling.rst new file mode 100644 index 0000000000000000000000000000000000000000..58ccd2c591774f3766f71da00b6938a0f4f3f592 --- /dev/null +++ b/docs/modules/modeling.rst @@ -0,0 +1,58 @@ +detectron2.modeling package +=========================== + +.. automodule:: detectron2.modeling + :members: + :undoc-members: + :show-inheritance: + + +detectron2.modeling.poolers module +--------------------------------------- + +.. automodule:: detectron2.modeling.poolers + :members: + :undoc-members: + :show-inheritance: + + +detectron2.modeling.sampling module +------------------------------------ + +.. 
automodule:: detectron2.modeling.sampling + :members: + :undoc-members: + :show-inheritance: + + +detectron2.modeling.box_regression module +------------------------------------------ + +.. automodule:: detectron2.modeling.box_regression + :members: + :undoc-members: + :show-inheritance: + + +Model Registries +----------------- + +These are different registries provided in modeling. +Each registry provide you the ability to replace it with your customized component, +without having to modify detectron2's code. + +Note that it is impossible to allow users to customize any line of code directly. +Even just to add one line at some place, +you'll likely need to find out the smallest registry which contains that line, +and register your component to that registry. + + +.. autodata:: detectron2.modeling.META_ARCH_REGISTRY +.. autodata:: detectron2.modeling.BACKBONE_REGISTRY +.. autodata:: detectron2.modeling.PROPOSAL_GENERATOR_REGISTRY +.. autodata:: detectron2.modeling.RPN_HEAD_REGISTRY +.. autodata:: detectron2.modeling.ANCHOR_GENERATOR_REGISTRY +.. autodata:: detectron2.modeling.ROI_HEADS_REGISTRY +.. autodata:: detectron2.modeling.ROI_BOX_HEAD_REGISTRY +.. autodata:: detectron2.modeling.ROI_MASK_HEAD_REGISTRY +.. autodata:: detectron2.modeling.ROI_KEYPOINT_HEAD_REGISTRY diff --git a/docs/modules/solver.rst b/docs/modules/solver.rst new file mode 100644 index 0000000000000000000000000000000000000000..7f4a49f2ebaef2760b91eb7cecd32dcbff038efb --- /dev/null +++ b/docs/modules/solver.rst @@ -0,0 +1,7 @@ +detectron2.solver package +========================= + +.. automodule:: detectron2.solver + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/structures.rst b/docs/modules/structures.rst new file mode 100644 index 0000000000000000000000000000000000000000..5701c61abf5f74f61807e131f708304a8c9bab82 --- /dev/null +++ b/docs/modules/structures.rst @@ -0,0 +1,7 @@ +detectron2.structures package +============================= + +.. automodule:: detectron2.structures + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/modules/utils.rst b/docs/modules/utils.rst new file mode 100644 index 0000000000000000000000000000000000000000..8b57292ac0e655f40756b19c8eea259bddb62aab --- /dev/null +++ b/docs/modules/utils.rst @@ -0,0 +1,80 @@ +detectron2.utils package +======================== + +detectron2.utils.colormap module +-------------------------------- + +.. automodule:: detectron2.utils.colormap + :members: + :undoc-members: + :show-inheritance: + +detectron2.utils.comm module +---------------------------- + +.. automodule:: detectron2.utils.comm + :members: + :undoc-members: + :show-inheritance: + + +detectron2.utils.events module +------------------------------ + +.. automodule:: detectron2.utils.events + :members: + :undoc-members: + :show-inheritance: + + +detectron2.utils.logger module +------------------------------ + +.. automodule:: detectron2.utils.logger + :members: + :undoc-members: + :show-inheritance: + + +detectron2.utils.registry module +-------------------------------- + +.. automodule:: detectron2.utils.registry + :members: + :undoc-members: + :show-inheritance: + +detectron2.utils.memory module +---------------------------------- + +.. automodule:: detectron2.utils.memory + :members: + :undoc-members: + :show-inheritance: + + +detectron2.utils.analysis module +---------------------------------- + +.. 
automodule:: detectron2.utils.analysis + :members: + :undoc-members: + :show-inheritance: + + +detectron2.utils.visualizer module +---------------------------------- + +.. automodule:: detectron2.utils.visualizer + :members: + :undoc-members: + :show-inheritance: + +detectron2.utils.video\_visualizer module +----------------------------------------- + +.. automodule:: detectron2.utils.video_visualizer + :members: + :undoc-members: + :show-inheritance: + diff --git a/docs/notes/benchmarks.md b/docs/notes/benchmarks.md new file mode 100644 index 0000000000000000000000000000000000000000..963f9210b39ce3ae248541644362631cb325d2b2 --- /dev/null +++ b/docs/notes/benchmarks.md @@ -0,0 +1,196 @@ + +# Benchmarks + +Here we benchmark the training speed of a Mask R-CNN in detectron2, +with some other popular open source Mask R-CNN implementations. + + +### Settings + +* Hardware: 8 NVIDIA V100s with NVLink. +* Software: Python 3.7, CUDA 10.1, cuDNN 7.6.5, PyTorch 1.5, + TensorFlow 1.15.0rc2, Keras 2.2.5, MxNet 1.6.0b20190820. +* Model: an end-to-end R-50-FPN Mask-RCNN model, using the same hyperparameter as the + [Detectron baseline config](https://github.com/facebookresearch/Detectron/blob/master/configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml) + (it does no have scale augmentation). +* Metrics: We use the average throughput in iterations 100-500 to skip GPU warmup time. + Note that for R-CNN-style models, the throughput of a model typically changes during training, because + it depends on the predictions of the model. Therefore this metric is not directly comparable with + "train speed" in model zoo, which is the average speed of the entire training run. + + +### Main Results + +```eval_rst ++-------------------------------+--------------------+ +| Implementation | Throughput (img/s) | ++===============================+====================+ +| |D2| |PT| | 62 | ++-------------------------------+--------------------+ +| mmdetection_ |PT| | 53 | ++-------------------------------+--------------------+ +| maskrcnn-benchmark_ |PT| | 53 | ++-------------------------------+--------------------+ +| tensorpack_ |TF| | 50 | ++-------------------------------+--------------------+ +| simpledet_ |mxnet| | 39 | ++-------------------------------+--------------------+ +| Detectron_ |C2| | 19 | ++-------------------------------+--------------------+ +| `matterport/Mask_RCNN`__ |TF| | 14 | ++-------------------------------+--------------------+ + +.. _maskrcnn-benchmark: https://github.com/facebookresearch/maskrcnn-benchmark/ +.. _tensorpack: https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN +.. _mmdetection: https://github.com/open-mmlab/mmdetection/ +.. _simpledet: https://github.com/TuSimple/simpledet/ +.. _Detectron: https://github.com/facebookresearch/Detectron +__ https://github.com/matterport/Mask_RCNN/ + +.. |D2| image:: https://github.com/facebookresearch/detectron2/raw/master/.github/Detectron2-Logo-Horz.svg?sanitize=true + :height: 15pt + :target: https://github.com/facebookresearch/detectron2/ +.. |PT| image:: https://pytorch.org/assets/images/logo-icon.svg + :width: 15pt + :height: 15pt + :target: https://pytorch.org +.. |TF| image:: https://static.nvidiagrid.net/ngc/containers/tensorflow.png + :width: 15pt + :height: 15pt + :target: https://tensorflow.org +.. |mxnet| image:: https://github.com/dmlc/web-data/raw/master/mxnet/image/mxnet_favicon.png + :width: 15pt + :height: 15pt + :target: https://mxnet.apache.org/ +.. 
|C2| image:: https://caffe2.ai/static/logo.svg + :width: 15pt + :height: 15pt + :target: https://caffe2.ai +``` + + +Details for each implementation: + +* __Detectron2__: with release v0.1.2, run: + ``` + python tools/train_net.py --config-file configs/Detectron1-Comparisons/mask_rcnn_R_50_FPN_noaug_1x.yaml --num-gpus 8 + ``` + +* __mmdetection__: at commit `b0d845f`, run + ``` + ./tools/dist_train.sh configs/mask_rcnn/mask_rcnn_r50_caffe_fpn_1x_coco.py 8 + ``` + +* __maskrcnn-benchmark__: use commit `0ce8f6f` with `sed -i ‘s/torch.uint8/torch.bool/g’ **/*.py; sed -i 's/AT_CHECK/TORCH_CHECK/g' **/*.cu` + to make it compatible with PyTorch 1.5. Then, run training with + ``` + python -m torch.distributed.launch --nproc_per_node=8 tools/train_net.py --config-file configs/e2e_mask_rcnn_R_50_FPN_1x.yaml + ``` + The speed we observed is faster than its model zoo, likely due to different software versions. + +* __tensorpack__: at commit `caafda`, `export TF_CUDNN_USE_AUTOTUNE=0`, then run + ``` + mpirun -np 8 ./train.py --config DATA.BASEDIR=/data/coco TRAINER=horovod BACKBONE.STRIDE_1X1=True TRAIN.STEPS_PER_EPOCH=50 --load ImageNet-R50-AlignPadding.npz + ``` + +* __SimpleDet__: at commit `9187a1`, run + ``` + python detection_train.py --config config/mask_r50v1_fpn_1x.py + ``` + +* __Detectron__: run + ``` + python tools/train_net.py --cfg configs/12_2017_baselines/e2e_mask_rcnn_R-50-FPN_1x.yaml + ``` + Note that many of its ops run on CPUs, therefore the performance is limited. + +* __matterport/Mask_RCNN__: at commit `3deaec`, apply the following diff, `export TF_CUDNN_USE_AUTOTUNE=0`, then run + ``` + python coco.py train --dataset=/data/coco/ --model=imagenet + ``` + Note that many small details in this implementation might be different + from Detectron's standards. + +
+ + (diff to make it use the same hyperparameters - click to expand) + + + ```diff + diff --git i/mrcnn/model.py w/mrcnn/model.py + index 62cb2b0..61d7779 100644 + --- i/mrcnn/model.py + +++ w/mrcnn/model.py + @@ -2367,8 +2367,8 @@ class MaskRCNN(): + epochs=epochs, + steps_per_epoch=self.config.STEPS_PER_EPOCH, + callbacks=callbacks, + - validation_data=val_generator, + - validation_steps=self.config.VALIDATION_STEPS, + + #validation_data=val_generator, + + #validation_steps=self.config.VALIDATION_STEPS, + max_queue_size=100, + workers=workers, + use_multiprocessing=True, + diff --git i/mrcnn/parallel_model.py w/mrcnn/parallel_model.py + index d2bf53b..060172a 100644 + --- i/mrcnn/parallel_model.py + +++ w/mrcnn/parallel_model.py + @@ -32,6 +32,7 @@ class ParallelModel(KM.Model): + keras_model: The Keras model to parallelize + gpu_count: Number of GPUs. Must be > 1 + """ + + super().__init__() + self.inner_model = keras_model + self.gpu_count = gpu_count + merged_outputs = self.make_parallel() + diff --git i/samples/coco/coco.py w/samples/coco/coco.py + index 5d172b5..239ed75 100644 + --- i/samples/coco/coco.py + +++ w/samples/coco/coco.py + @@ -81,7 +81,10 @@ class CocoConfig(Config): + IMAGES_PER_GPU = 2 + + # Uncomment to train on 8 GPUs (default is 1) + - # GPU_COUNT = 8 + + GPU_COUNT = 8 + + BACKBONE = "resnet50" + + STEPS_PER_EPOCH = 50 + + TRAIN_ROIS_PER_IMAGE = 512 + + # Number of classes (including background) + NUM_CLASSES = 1 + 80 # COCO has 80 classes + @@ -496,29 +499,10 @@ if __name__ == '__main__': + # *** This training schedule is an example. Update to your needs *** + + # Training - Stage 1 + - print("Training network heads") + model.train(dataset_train, dataset_val, + learning_rate=config.LEARNING_RATE, + epochs=40, + - layers='heads', + - augmentation=augmentation) + - + - # Training - Stage 2 + - # Finetune layers from ResNet stage 4 and up + - print("Fine tune Resnet stage 4 and up") + - model.train(dataset_train, dataset_val, + - learning_rate=config.LEARNING_RATE, + - epochs=120, + - layers='4+', + - augmentation=augmentation) + - + - # Training - Stage 3 + - # Fine tune all layers + - print("Fine tune all layers") + - model.train(dataset_train, dataset_val, + - learning_rate=config.LEARNING_RATE / 10, + - epochs=160, + - layers='all', + + layers='3+', + augmentation=augmentation) + + elif args.command == "evaluate": + ``` + +
diff --git a/docs/notes/changelog.md b/docs/notes/changelog.md new file mode 100644 index 0000000000000000000000000000000000000000..c0d4f5900bc64dbc4d2ce2d9bd31d32b9ee39f8f --- /dev/null +++ b/docs/notes/changelog.md @@ -0,0 +1,26 @@ +# Change Log + +### Releases +See release log at +[https://github.com/facebookresearch/detectron2/releases](https://github.com/facebookresearch/detectron2/releases). + +### Notable Backward Incompatible Changes: + +* 03/30/2020: Custom box head's `output_size` changed to `output_shape`. +* 02/14/2020,02/18/2020: Mask head and keypoint head now include logic for losses & inference. Custom heads + should overwrite the feature computation by `layers()` method. +* 11/11/2019: `detectron2.data.detection_utils.read_image` transposes images with exif information. + +### Config Version Change Log + +* v1: Rename `RPN_HEAD.NAME` to `RPN.HEAD_NAME`. +* v2: A batch of rename of many configurations before release. + +### Silent Regression in Historical Versions: + +We list a few silent regressions since they may silently produce incorrect results and will be hard to debug. + +* 04/01/2020 - 05/11/2020: Bad accuracy if `TRAIN_ON_PRED_BOXES` is set to True. +* 03/30/2020 - 04/01/2020: ResNets are not correctly built. +* 12/19/2019 - 12/26/2019: Using aspect ratio grouping causes a drop in accuracy. +* release - 11/9/2019: Test time augmentation does not predict the last category. diff --git a/docs/notes/compatibility.md b/docs/notes/compatibility.md new file mode 100644 index 0000000000000000000000000000000000000000..f7b66c2e384b162864fb96a2fed44ba3084b8226 --- /dev/null +++ b/docs/notes/compatibility.md @@ -0,0 +1,83 @@ +# Compatibility with Other Libraries + +## Compatibility with Detectron (and maskrcnn-benchmark) + +Detectron2 addresses some legacy issues left in Detectron. As a result, their models +are not compatible: +running inference with the same model weights will produce different results in the two code bases. + +The major differences regarding inference are: + +- The height and width of a box with corners (x1, y1) and (x2, y2) is now computed more naturally as + width = x2 - x1 and height = y2 - y1; + In Detectron, a "+ 1" was added both height and width. + + Note that the relevant ops in Caffe2 have [adopted this change of convention](https://github.com/pytorch/pytorch/pull/20550) + with an extra option. + So it is still possible to run inference with a Detectron2-trained model in Caffe2. + + The change in height/width calculations most notably changes: + - encoding/decoding in bounding box regression. + - non-maximum suppression. The effect here is very negligible, though. + +- RPN now uses simpler anchors with fewer quantization artifacts. + + In Detectron, the anchors were quantized and + [do not have accurate areas](https://github.com/facebookresearch/Detectron/issues/227). + In Detectron2, the anchors are center-aligned to feature grid points and not quantized. + +- Classification layers have a different ordering of class labels. + + This involves any trainable parameter with shape (..., num_categories + 1, ...). + In Detectron2, integer labels [0, K-1] correspond to the K = num_categories object categories + and the label "K" corresponds to the special "background" category. + In Detectron, label "0" means background, and labels [1, K] correspond to the K categories. + +- ROIAlign is implemented differently. The new implementation is [available in Caffe2](https://github.com/pytorch/pytorch/pull/23706). + + 1. 
All the ROIs are shifted by half a pixel compared to Detectron in order to create better image-feature-map alignment. + See `layers/roi_align.py` for details. + To enable the old behavior, use `ROIAlign(aligned=False)`, or `POOLER_TYPE=ROIAlign` instead of + `ROIAlignV2` (the default). + + 1. The ROIs are not required to have a minimum size of 1. + This will lead to tiny differences in the output, but should be negligible. + +- Mask inference function is different. + + In Detectron2, the "paste_mask" function is different and should be more accurate than in Detectron. This change + can improve mask AP on COCO by ~0.5% absolute. + +There are some other differences in training as well, but they won't affect +model-level compatibility. The major ones are: + +- We fixed a [bug](https://github.com/facebookresearch/Detectron/issues/459) in + Detectron, by making `RPN.POST_NMS_TOPK_TRAIN` per-image, rather than per-batch. + The fix may lead to a small accuracy drop for a few models (e.g. keypoint + detection) and will require some parameter tuning to match the Detectron results. +- For simplicity, we change the default loss in bounding box regression to L1 loss, instead of smooth L1 loss. + We have observed that this tends to slightly decrease box AP50 while improving box AP for higher + overlap thresholds (and leading to a slight overall improvement in box AP). +- We interpret the coordinates in COCO bounding box and segmentation annotations + as coordinates in range `[0, width]` or `[0, height]`. The coordinates in + COCO keypoint annotations are interpreted as pixel indices in range `[0, width - 1]` or `[0, height - 1]`. + Note that this affects how flip augmentation is implemented. + + +We will later share more details and rationale behind the above mentioned issues +about pixels, coordinates, and "+1"s. + + +## Compatibility with Caffe2 + +As mentioned above, despite the incompatibilities with Detectron, the relevant +ops have been implemented in Caffe2. +Therefore, models trained with detectron2 can be converted in Caffe2. +See [Deployment](../tutorials/deployment.md) for the tutorial. + +## Compatibility with TensorFlow + +Most ops are available in TensorFlow, although some tiny differences in +the implementation of resize / ROIAlign / padding need to be addressed. +A working conversion script is provided by [tensorpack FasterRCNN](https://github.com/tensorpack/tensorpack/tree/master/examples/FasterRCNN/convert_d2) +to run a standard detectron2 model in TensorFlow. diff --git a/docs/notes/contributing.md b/docs/notes/contributing.md new file mode 120000 index 0000000000000000000000000000000000000000..95181235eaff1cb5cbb2dc554e8d4991b603d0e5 --- /dev/null +++ b/docs/notes/contributing.md @@ -0,0 +1 @@ +../../.github/CONTRIBUTING.md \ No newline at end of file diff --git a/docs/notes/index.rst b/docs/notes/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..63cf907be7bb15f5316af6d44a46df601755a86b --- /dev/null +++ b/docs/notes/index.rst @@ -0,0 +1,10 @@ +Notes +====================================== + +.. 
toctree:: + :maxdepth: 2 + + benchmarks + compatibility + contributing + changelog diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..62537cd89614c1720df16209ea891658443c67a5 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,20 @@ +termcolor +numpy +tqdm +docutils==0.16 +Sphinx==3.0.0 +recommonmark==0.6.0 +sphinx_rtd_theme +mock +matplotlib +termcolor +yacs +tabulate +cloudpickle +Pillow==6.2.2 +future +requests +six +git+git://github.com/facebookresearch/fvcore.git +https://download.pytorch.org/whl/cpu/torch-1.5.0%2Bcpu-cp37-cp37m-linux_x86_64.whl +https://download.pytorch.org/whl/cpu/torchvision-0.6.0%2Bcpu-cp37-cp37m-linux_x86_64.whl diff --git a/docs/tutorials/README.md b/docs/tutorials/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1ca9c94d042ef838143a45490fe6b4556c19f3c9 --- /dev/null +++ b/docs/tutorials/README.md @@ -0,0 +1,4 @@ +# Read the docs: + +The latest documentation built from this directory is available at [detectron2.readthedocs.io](https://detectron2.readthedocs.io/). +Documents in this directory are not meant to be read on github. diff --git a/docs/tutorials/builtin_datasets.md b/docs/tutorials/builtin_datasets.md new file mode 120000 index 0000000000000000000000000000000000000000..0ba82423ad498bdd86274ada56a201134a590d94 --- /dev/null +++ b/docs/tutorials/builtin_datasets.md @@ -0,0 +1 @@ +../../datasets/README.md \ No newline at end of file diff --git a/docs/tutorials/configs.md b/docs/tutorials/configs.md new file mode 100644 index 0000000000000000000000000000000000000000..ea82583825b51955993ca87d14c17ffb3ab031f4 --- /dev/null +++ b/docs/tutorials/configs.md @@ -0,0 +1,58 @@ +# Configs + +Detectron2 provides a key-value based config system that can be +used to obtain standard, common behaviors. + +Detectron2's config system uses YAML and [yacs](https://github.com/rbgirshick/yacs). +In addition to the [basic operations](../modules/config.html#detectron2.config.CfgNode) +that access and update a config, we provide the following extra functionalities: + +1. The config can have `_BASE_: base.yaml` field, which will load a base config first. + Values in the base config will be overwritten in sub-configs, if there are any conflicts. + We provided several base configs for standard model architectures. +2. We provide config versioning, for backward compatibility. + If your config file is versioned with a config line like `VERSION: 2`, + detectron2 will still recognize it even if we change some keys in the future. + +"Config" is a very limited abstraction. +We do not expect all features in detectron2 to be available through configs. +If you need something that's not available in the config space, +please write code using detectron2's API. + +### Basic Usage + +Some basic usage of the `CfgNode` object is shown here. See more in [documentation](../modules/config.html#detectron2.config.CfgNode). +```python +from detectron2.config import get_cfg +cfg = get_cfg() # obtain detectron2's default config +cfg.xxx = yyy # add new configs for your own custom components +cfg.merge_from_file("my_cfg.yaml") # load values from a file + +cfg.merge_from_list(["MODEL.WEIGHTS", "weights.pth"]) # can also load values from a list of str +print(cfg.dump()) # print formatted configs +``` + +Many builtin tools in detectron2 accepts command line config overwrite: +Key-value pairs provided in the command line will overwrite the existing values in the config file. 
+For example, [demo.py](../../demo/demo.py) can be used with +``` +./demo.py --config-file config.yaml [--other-options] \ + --opts MODEL.WEIGHTS /path/to/weights INPUT.MIN_SIZE_TEST 1000 +``` + +To see a list of available configs in detectron2 and what they mean, +check [Config References](../modules/config.html#config-references) + + +### Best Practice with Configs + +1. Treat the configs you write as "code": avoid copying them or duplicating them; use `_BASE_` + to share common parts between configs. + +2. Keep the configs you write simple: don't include keys that do not affect the experimental setting. + +3. Keep a version number in your configs (or the base config), e.g., `VERSION: 2`, + for backward compatibility. + We print a warning when reading a config without version number. + The official configs do not include version number because they are meant to + be always up-to-date. diff --git a/docs/tutorials/data_loading.md b/docs/tutorials/data_loading.md new file mode 100644 index 0000000000000000000000000000000000000000..bb037ca534ccbb0cf82c456d0cd54544520b3a3f --- /dev/null +++ b/docs/tutorials/data_loading.md @@ -0,0 +1,77 @@ + +# Use Custom Dataloaders + +## How the Existing Dataloader Works + +Detectron2 contains a builtin data loading pipeline. +It's good to understand how it works, in case you need to write a custom one. + +Detectron2 provides two functions +[build_detection_{train,test}_loader](../modules/data.html#detectron2.data.build_detection_train_loader) +that create a default data loader from a given config. +Here is how `build_detection_{train,test}_loader` work: + +1. It takes the name of a registered dataset (e.g., "coco_2017_train") and loads a `list[dict]` representing the dataset items + in a lightweight, canonical format. These dataset items are not yet ready to be used by the model (e.g., images are + not loaded into memory, random augmentations have not been applied, etc.). + Details about the dataset format and dataset registration can be found in + [datasets](./datasets.md). +2. Each dict in this list is mapped by a function ("mapper"): + * Users can customize this mapping function by specifying the "mapper" argument in + `build_detection_{train,test}_loader`. The default mapper is [DatasetMapper](../modules/data.html#detectron2.data.DatasetMapper). + * The output format of such function can be arbitrary, as long as it is accepted by the consumer of this data loader (usually the model). + The outputs of the default mapper, after batching, follow the default model input format documented in + [Use Models](./models.html#model-input-format). + * The role of the mapper is to transform the lightweight, canonical representation of a dataset item into a format + that is ready for the model to consume (including, e.g., read images, perform random data augmentation and convert to torch Tensors). + If you would like to perform custom transformations to data, you often want a custom mapper. +3. The outputs of the mapper are batched (simply into a list). +4. This batched data is the output of the data loader. Typically, it's also the input of + `model.forward()`. + + +## Write a Custom Dataloader + +Using a different "mapper" with `build_detection_{train,test}_loader(mapper=)` works for most use cases +of custom data loading. 
+For example, if you want to resize all images to a fixed size for Mask R-CNN training, write this:
+
+```python
+import copy
+import torch
+
+from detectron2.data import build_detection_train_loader
+from detectron2.data import transforms as T
+from detectron2.data import detection_utils as utils
+
+def mapper(dataset_dict):
+    # Implement a mapper, similar to the default DatasetMapper, but with your own customizations
+    dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+    image = utils.read_image(dataset_dict["file_name"], format="BGR")
+    image, transforms = T.apply_transform_gens([T.Resize((800, 800))], image)
+    dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32"))
+
+    annos = [
+        utils.transform_instance_annotations(obj, transforms, image.shape[:2])
+        for obj in dataset_dict.pop("annotations")
+        if obj.get("iscrowd", 0) == 0
+    ]
+    instances = utils.annotations_to_instances(annos, image.shape[:2])
+    dataset_dict["instances"] = utils.filter_empty_instances(instances)
+    return dataset_dict
+
+data_loader = build_detection_train_loader(cfg, mapper=mapper)
+# use this dataloader instead of the default
+```
+Refer to [API documentation of detectron2.data](../modules/data) for details.
+
+If you want to change not only the mapper (e.g., to write different sampling or batching logic),
+you can write your own data loader. The data loader is simply a
+Python iterator that produces [the format](./models.md) your model accepts.
+You can implement it using any tools you like.
+
+## Use a Custom Dataloader
+
+If you use [DefaultTrainer](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer),
+you can overwrite its `build_{train,test}_loader` method to use your own dataloader.
+See the [densepose dataloader](../../projects/DensePose/train_net.py)
+for an example.
+
+If you write your own training loop, you can plug in your data loader easily.
diff --git a/docs/tutorials/datasets.md b/docs/tutorials/datasets.md
new file mode 100644
index 0000000000000000000000000000000000000000..8dc1c0c55598887e4de73e988567753ebf4538e2
--- /dev/null
+++ b/docs/tutorials/datasets.md
@@ -0,0 +1,221 @@
+# Use Custom Datasets
+
+Datasets that have builtin support in detectron2 are listed in [datasets](../../datasets).
+If you want to use a custom dataset while also reusing detectron2's data loaders,
+you will need to
+
+1. __Register__ your dataset (i.e., tell detectron2 how to obtain your dataset).
+2. Optionally, __register metadata__ for your dataset.
+
+Next, we explain the above two concepts in detail.
+
+The [Colab tutorial](https://colab.research.google.com/drive/16jcaJoc6bCFAQ96jDe2HwtXj7BMD_-m5)
+has a live example of how to register and train on a dataset in a custom format.
+
+### Register a Dataset
+
+To let detectron2 know how to obtain a dataset named "my_dataset", you will implement
+a function that returns the items in your dataset and then tell detectron2 about this
+function:
+```python
+def my_dataset_function():
+    ...
+    # return a list[dict] in the format described below
+
+from detectron2.data import DatasetCatalog
+DatasetCatalog.register("my_dataset", my_dataset_function)
+```
+
+Here, the snippet associates a dataset "my_dataset" with a function that returns the data.
+The registration stays effective until the process exits.
+
+The function can process data from its original format into either one of the following:
+1. Detectron2's standard dataset dict, described below.
This will work with many other builtin + features in detectron2, so it's recommended to use it when it's sufficient for your task. +2. Your custom dataset dict. You can also return arbitrary dicts in your own format, + such as adding extra keys for new tasks. + Then you will need to handle them properly downstream as well. + See below for more details. + +#### Standard Dataset Dicts + +For standard tasks +(instance detection, instance/semantic/panoptic segmentation, keypoint detection), +we load the original dataset into `list[dict]` with a specification similar to COCO's json annotations. +This is our standard representation for a dataset. + +Each dict contains information about one image. +The dict may have the following fields, +and the required fields vary based on what the dataloader or the task needs (see more below). + ++ `file_name`: the full path to the image file. Will apply rotation and flipping if the image has such exif information. ++ `height`, `width`: integer. The shape of image. ++ `image_id` (str or int): a unique id that identifies this image. Used + during evaluation to identify the images, but a dataset may use it for different purposes. ++ `annotations` (list[dict]): each dict corresponds to annotations of one instance + in this image. Required by instance detection/segmentation or keypoint detection tasks. + + Images with empty `annotations` will by default be removed from training, + but can be included using `DATALOADER.FILTER_EMPTY_ANNOTATIONS`. + + Each dict contains the following keys, of which `bbox`,`bbox_mode` and `category_id` are required: + + `bbox` (list[float]): list of 4 numbers representing the bounding box of the instance. + + `bbox_mode` (int): the format of bbox. + It must be a member of + [structures.BoxMode](../modules/structures.html#detectron2.structures.BoxMode). + Currently supports: `BoxMode.XYXY_ABS`, `BoxMode.XYWH_ABS`. + + `category_id` (int): an integer in the range [0, num_categories) representing the category label. + The value num_categories is reserved to represent the "background" category, if applicable. + + `segmentation` (list[list[float]] or dict): the segmentation mask of the instance. + + If `list[list[float]]`, it represents a list of polygons, one for each connected component + of the object. Each `list[float]` is one simple polygon in the format of `[x1, y1, ..., xn, yn]`. + The Xs and Ys are either relative coordinates in [0, 1], or absolute coordinates, + depend on whether "bbox_mode" is relative. + + If `dict`, it represents the per-pixel segmentation mask in COCO's RLE format. The dict should have + keys "size" and "counts". You can convert a uint8 segmentation mask of 0s and 1s into + RLE format by `pycocotools.mask.encode(np.asarray(mask, order="F"))`. + + `keypoints` (list[float]): in the format of [x1, y1, v1,..., xn, yn, vn]. + v[i] means the [visibility](http://cocodataset.org/#format-data) of this keypoint. + `n` must be equal to the number of keypoint categories. + The Xs and Ys are either relative coordinates in [0, 1], or absolute coordinates, + depend on whether "bbox_mode" is relative. + + Note that the coordinate annotations in COCO format are integers in range [0, H-1 or W-1]. + By default, detectron2 adds 0.5 to absolute keypoint coordinates to convert them from discrete + pixel indices to floating point coordinates. + + `iscrowd`: 0 (default) or 1. Whether this instance is labeled as COCO's "crowd + region". Don't include this field if you don't know what it means. 
++ `sem_seg_file_name`: the full path to the ground truth semantic segmentation file. + Required by semantic segmentation task. + It should be an image whose pixel values are integer labels. + + +Fast R-CNN (with precomputed proposals) is rarely used today. +To train a Fast R-CNN, the following extra keys are needed: + ++ `proposal_boxes` (array): 2D numpy array with shape (K, 4) representing K precomputed proposal boxes for this image. ++ `proposal_objectness_logits` (array): numpy array with shape (K, ), which corresponds to the objectness + logits of proposals in 'proposal_boxes'. ++ `proposal_bbox_mode` (int): the format of the precomputed proposal bbox. + It must be a member of + [structures.BoxMode](../modules/structures.html#detectron2.structures.BoxMode). + Default is `BoxMode.XYXY_ABS`. + +#### Custom Dataset Dicts for New Tasks + +In the `list[dict]` that your dataset function returns, the dictionary can also have arbitrary custom data. +This will be useful for a new task that needs extra information not supported +by the standard dataset dicts. In this case, you need to make sure the downstream code can handle your data +correctly. Usually this requires writing a new `mapper` for the dataloader (see [Use Custom Dataloaders](./data_loading.md)). + +When designing a custom format, note that all dicts are stored in memory +(sometimes serialized and with multiple copies). +To save memory, each dict is meant to contain small but sufficient information +about each sample, such as file names and annotations. +Loading full samples typically happens in the data loader. + +For attributes shared among the entire dataset, use `Metadata` (see below). +To avoid extra memory, do not save such information repeatly for each sample. + +### "Metadata" for Datasets + +Each dataset is associated with some metadata, accessible through +`MetadataCatalog.get(dataset_name).some_metadata`. +Metadata is a key-value mapping that contains information that's shared among +the entire dataset, and usually is used to interpret what's in the dataset, e.g., +names of classes, colors of classes, root of files, etc. +This information will be useful for augmentation, evaluation, visualization, logging, etc. +The structure of metadata depends on the what is needed from the corresponding downstream code. + +If you register a new dataset through `DatasetCatalog.register`, +you may also want to add its corresponding metadata through +`MetadataCatalog.get(dataset_name).some_key = some_value`, to enable any features that need the metadata. +You can do it like this (using the metadata key "thing_classes" as an example): + +```python +from detectron2.data import MetadataCatalog +MetadataCatalog.get("my_dataset").thing_classes = ["person", "dog"] +``` + +Here is a list of metadata keys that are used by builtin features in detectron2. +If you add your own dataset without these metadata, some features may be +unavailable to you: + +* `thing_classes` (list[str]): Used by all instance detection/segmentation tasks. + A list of names for each instance/thing category. + If you load a COCO format dataset, it will be automatically set by the function `load_coco_json`. + +* `thing_colors` (list[tuple(r, g, b)]): Pre-defined color (in [0, 255]) for each thing category. + Used for visualization. If not given, random colors are used. + +* `stuff_classes` (list[str]): Used by semantic and panoptic segmentation tasks. + A list of names for each stuff category. 
+ +* `stuff_colors` (list[tuple(r, g, b)]): Pre-defined color (in [0, 255]) for each stuff category. + Used for visualization. If not given, random colors are used. + +* `keypoint_names` (list[str]): Used by keypoint localization. A list of names for each keypoint. + +* `keypoint_flip_map` (list[tuple[str]]): Used by the keypoint localization task. A list of pairs of names, + where each pair are the two keypoints that should be flipped if the image is + flipped horizontally during augmentation. +* `keypoint_connection_rules`: list[tuple(str, str, (r, g, b))]. Each tuple specifies a pair of keypoints + that are connected and the color to use for the line between them when visualized. + +Some additional metadata that are specific to the evaluation of certain datasets (e.g. COCO): + +* `thing_dataset_id_to_contiguous_id` (dict[int->int]): Used by all instance detection/segmentation tasks in the COCO format. + A mapping from instance class ids in the dataset to contiguous ids in range [0, #class). + Will be automatically set by the function `load_coco_json`. + +* `stuff_dataset_id_to_contiguous_id` (dict[int->int]): Used when generating prediction json files for + semantic/panoptic segmentation. + A mapping from semantic segmentation class ids in the dataset + to contiguous ids in [0, num_categories). It is useful for evaluation only. + +* `json_file`: The COCO annotation json file. Used by COCO evaluation for COCO-format datasets. +* `panoptic_root`, `panoptic_json`: Used by panoptic evaluation. +* `evaluator_type`: Used by the builtin main training script to select + evaluator. Don't use it in a new training script. + You can just provide the [DatasetEvaluator](../modules/evaluation.html#detectron2.evaluation.DatasetEvaluator) + for your dataset directly in your main script. + +NOTE: For background on the concept of "thing" and "stuff", see +[On Seeing Stuff: The Perception of Materials by Humans and Machines](http://persci.mit.edu/pub_pdfs/adelson_spie_01.pdf). +In detectron2, the term "thing" is used for instance-level tasks, +and "stuff" is used for semantic segmentation tasks. +Both are used in panoptic segmentation. + +### Register a COCO Format Dataset + +If your dataset is already a json file in the COCO format, +the dataset and its associated metadata can be registered easily with: +```python +from detectron2.data.datasets import register_coco_instances +register_coco_instances("my_dataset", {}, "json_annotation.json", "path/to/image/dir") +``` + +If your dataset is in COCO format but with extra custom per-instance annotations, +the [load_coco_json](../modules/data.html#detectron2.data.datasets.load_coco_json) +function might be useful. + +### Update the Config for New Datasets + +Once you've registered the dataset, you can use the name of the dataset (e.g., "my_dataset" in +example above) in `cfg.DATASETS.{TRAIN,TEST}`. +There are other configs you might want to change to train or evaluate on new datasets: + +* `MODEL.ROI_HEADS.NUM_CLASSES` and `MODEL.RETINANET.NUM_CLASSES` are the number of thing classes + for R-CNN and RetinaNet models, respectively. +* `MODEL.ROI_KEYPOINT_HEAD.NUM_KEYPOINTS` sets the number of keypoints for Keypoint R-CNN. + You'll also need to set [Keypoint OKS](http://cocodataset.org/#keypoints-eval) + with `TEST.KEYPOINT_OKS_SIGMAS` for evaluation. +* `MODEL.SEM_SEG_HEAD.NUM_CLASSES` sets the number of stuff classes for Semantic FPN & Panoptic FPN. 
+* If you're training Fast R-CNN (with precomputed proposals), `DATASETS.PROPOSAL_FILES_{TRAIN,TEST}` + need to match the datasets. The format of proposal files are documented + [here](../modules/data.html#detectron2.data.load_proposals_into_dataset). + +New models +(e.g. [TensorMask](../../projects/TensorMask), +[PointRend](../../projects/PointRend)) +often have similar configs of their own that need to be changed as well. diff --git a/docs/tutorials/deployment.md b/docs/tutorials/deployment.md new file mode 100644 index 0000000000000000000000000000000000000000..a473247abf7df74e35b6de71c018f1aa34eaf435 --- /dev/null +++ b/docs/tutorials/deployment.md @@ -0,0 +1,92 @@ +# Deployment + +## Caffe2 Deployment +We currently support converting a detectron2 model to Caffe2 format through ONNX. +The converted Caffe2 model is able to run without detectron2 dependency in either Python or C++. +It has a runtime optimized for CPU & mobile inference, but not for GPU inference. + +Caffe2 conversion requires PyTorch ≥ 1.4 and ONNX ≥ 1.6. + +### Coverage + +It supports 3 most common meta architectures: `GeneralizedRCNN`, `RetinaNet`, `PanopticFPN`, +and most official models under these 3 meta architectures. + +Users' custom extensions under these architectures (added through registration) are supported +as long as they do not contain control flow or operators not available in Caffe2 (e.g. deformable convolution). +For example, custom backbones and heads are often supported out of the box. + +### Usage + +The conversion APIs are documented at [the API documentation](../modules/export). +We provide a tool, `caffe2_converter.py` as an example that uses +these APIs to convert a standard model. + +To convert an official Mask R-CNN trained on COCO, first +[prepare the COCO dataset](../../datasets/), then pick the model from [Model Zoo](../../MODEL_ZOO.md), and run: +``` +cd tools/deploy/ && ./caffe2_converter.py --config-file ../../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml \ + --output ./caffe2_model --run-eval \ + MODEL.WEIGHTS detectron2://COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x/137849600/model_final_f10217.pkl \ + MODEL.DEVICE cpu +``` + +Note that: +1. The conversion needs valid sample inputs & weights to trace the model. That's why the script requires the dataset. + You can modify the script to obtain sample inputs in other ways. +2. With the `--run-eval` flag, it will evaluate the converted models to verify its accuracy. + The accuracy is typically slightly different (within 0.1 AP) from PyTorch due to + numerical precisions between different implementations. + It's recommended to always verify the accuracy in case your custom model is not supported by the + conversion. + +The converted model is available at the specified `caffe2_model/` directory. Two files `model.pb` +and `model_init.pb` that contain network structure and network parameters are necessary for deployment. +These files can then be loaded in C++ or Python using Caffe2's APIs. + +The script generates `model.svg` file which contains a visualization of the network. +You can also load `model.pb` to tools such as [netron](https://github.com/lutzroeder/netron) to visualize it. + +### Use the model in C++/Python + +The model can be loaded in C++. An example [caffe2_mask_rcnn.cpp](../../tools/deploy/) is given, +which performs CPU/GPU inference using `COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x`. 
+ +The C++ example needs to be built with: +* PyTorch with caffe2 inside +* gflags, glog, opencv +* protobuf headers that match the version of your caffe2 +* MKL headers if caffe2 is built with MKL + +The following can compile the example inside [official detectron2 docker](../../docker/): +``` +sudo apt update && sudo apt install libgflags-dev libgoogle-glog-dev libopencv-dev +pip install mkl-include +wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protobuf-cpp-3.6.1.tar.gz +tar xf protobuf-cpp-3.6.1.tar.gz +export CPATH=$(readlink -f ./protobuf-3.6.1/src/):$HOME/.local/include +export CMAKE_PREFIX_PATH=$HOME/.local/lib/python3.6/site-packages/torch/ +mkdir build && cd build +cmake -DTORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST .. && make + +# To run: +./caffe2_mask_rcnn --predict_net=./model.pb --init_net=./model_init.pb --input=input.jpg +``` + +Note that: + +* All converted models (the .pb files) take two input tensors: + "data" is an NCHW image, and "im_info" is an Nx3 tensor consisting of (height, width, 1.0) for + each image (the shape of "data" might be larger than that in "im_info" due to padding). + +* The converted models do not contain post-processing operations that + transform raw layer outputs into formatted predictions. + The example only produces raw outputs (28x28 masks) from the final + layers that are not post-processed, because in actual deployment, an application often needs + its custom lightweight post-processing (e.g. full-image masks for every detected object is often not necessary). + +We also provide a python wrapper around the converted model, in the +[Caffe2Model.\_\_call\_\_](../modules/export.html#detectron2.export.Caffe2Model.__call__) method. +This method has an interface that's identical to the [pytorch versions of models](./models.md), +and it internally applies pre/post-processing code to match the formats. +They can serve as a reference for pre/post-processing in actual deployment. diff --git a/docs/tutorials/evaluation.md b/docs/tutorials/evaluation.md new file mode 100644 index 0000000000000000000000000000000000000000..c71adb7eb2e554e5ea848f1feb44bbee01a13f8e --- /dev/null +++ b/docs/tutorials/evaluation.md @@ -0,0 +1,43 @@ + +# Evaluation + +Evaluation is a process that takes a number of inputs/outputs pairs and aggregate them. +You can always [use the model](./models.md) directly and just parse its inputs/outputs manually to perform +evaluation. +Alternatively, evaluation is implemented in detectron2 using the [DatasetEvaluator](../modules/evaluation.html#detectron2.evaluation.DatasetEvaluator) +interface. + +Detectron2 includes a few `DatasetEvaluator` that computes metrics using standard dataset-specific +APIs (e.g., COCO, LVIS). +You can also implement your own `DatasetEvaluator` that performs some other jobs +using the inputs/outputs pairs. +For example, to count how many instances are detected on the validation set: + +``` +class Counter(DatasetEvaluator): + def reset(self): + self.count = 0 + def process(self, inputs, outputs): + for output in outputs: + self.count += len(output["instances"]) + def evaluate(self): + # save self.count somewhere, or print it, or return it. + return {"count": self.count} +``` + +Once you have some `DatasetEvaluator`, you can run it with +[inference_on_dataset](../modules/evaluation.html#detectron2.evaluation.inference_on_dataset). 
+For example, + +```python +val_results = inference_on_dataset( + model, + val_data_loader, + DatasetEvaluators([COCOEvaluator(...), Counter()])) +``` +Compared to running the evaluation manually using the model, the benefit of this function is that +you can merge evaluators together using [DatasetEvaluators](../modules/evaluation.html#detectron2.evaluation.DatasetEvaluators). +In this way you can run all evaluations without having to go through the dataset multiple times. + +The `inference_on_dataset` function also provides accurate speed benchmarks for the +given model and dataset. diff --git a/docs/tutorials/extend.md b/docs/tutorials/extend.md new file mode 100644 index 0000000000000000000000000000000000000000..4232185757139e45078bf58c4f0fffb5fa0e4c04 --- /dev/null +++ b/docs/tutorials/extend.md @@ -0,0 +1,53 @@ +# Extend Detectron2's Defaults + +__Research is about doing things in new ways__. +This brings a tension in how to create abstractions in code, +which is a challenge for any research engineering project of a significant size: + +1. On one hand, it needs to have very thin abstractions to allow for the possibility of doing + everything in new ways. It should be reasonably easy to break existing + abstractions and replace them with new ones. + +2. On the other hand, such a project also needs reasonably high-level + abstractions, so that users can easily do things in standard ways, + without worrying too much about the details that only certain researchers care about. + +In detectron2, there are two types of interfaces that address this tension together: + +1. Functions and classes that take a config (`cfg`) argument + (sometimes with only a few extra arguments). + + Such functions and classes implement + the "standard default" behavior: it will read what it needs from the + config and do the "standard" thing. + Users only need to load a given config and pass it around, without having to worry about + which arguments are used and what they all mean. + +2. Functions and classes that have well-defined explicit arguments. + + Each of these is a small building block of the entire system. + They require users' expertise to understand what each argument should be, + and require more effort to stitch together to a larger system. + But they can be stitched together in more flexible ways. + + When you need to implement something not supported by the "standard defaults" + included in detectron2, these well-defined components can be reused. + +3. (experimental) A few classes are implemented with the + [@configurable](../../modules/config.html#detectron2.config.configurable) + decorator - they can be called with either a config, or with explicit arguments. + Their explicit argument interfaces are currently __experimental__ and subject to change. + + +If you only need the standard behavior, the [Beginner's Tutorial](./getting_started.md) +should suffice. If you need to extend detectron2 to your own needs, +see the following tutorials for more details: + +* Detectron2 includes a few standard datasets. To use custom ones, see + [Use Custom Datasets](./datasets.md). +* Detectron2 contains the standard logic that creates a data loader for training/testing from a + dataset, but you can write your own as well. See [Use Custom Data Loaders](./data_loading.md). +* Detectron2 implements many standard detection models, and provide ways for you + to overwrite their behaviors. See [Use Models](./models.md) and [Write Models](./write-models.md). 
+* Detectron2 provides a default training loop that is good for common training tasks. + You can customize it with hooks, or write your own loop instead. See [training](./training.md). diff --git a/docs/tutorials/getting_started.md b/docs/tutorials/getting_started.md new file mode 120000 index 0000000000000000000000000000000000000000..e90bde77a3197b77f4cfdce86ca8f96491650acd --- /dev/null +++ b/docs/tutorials/getting_started.md @@ -0,0 +1 @@ +../../GETTING_STARTED.md \ No newline at end of file diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..896e71e64139a35a566bbdd76e4b57006af35e2d --- /dev/null +++ b/docs/tutorials/index.rst @@ -0,0 +1,18 @@ +Tutorials +====================================== + +.. toctree:: + :maxdepth: 2 + + install + getting_started + builtin_datasets + extend + datasets + data_loading + models + write-models + training + evaluation + configs + deployment diff --git a/docs/tutorials/install.md b/docs/tutorials/install.md new file mode 120000 index 0000000000000000000000000000000000000000..5f52b2be3c9650cfc3e16ffb8fa374d3fcbad371 --- /dev/null +++ b/docs/tutorials/install.md @@ -0,0 +1 @@ +../../INSTALL.md \ No newline at end of file diff --git a/docs/tutorials/models.md b/docs/tutorials/models.md new file mode 100644 index 0000000000000000000000000000000000000000..456f36d1c03f657ba0b63eb6f26506c4b1b0d60f --- /dev/null +++ b/docs/tutorials/models.md @@ -0,0 +1,151 @@ +# Use Models + +Models (and their sub-models) in detectron2 are built by +functions such as `build_model`, `build_backbone`, `build_roi_heads`: +```python +from detectron2.modeling import build_model +model = build_model(cfg) # returns a torch.nn.Module +``` + +`build_model` only builds the model structure, and fill it with random parameters. +See below for how to load an existing checkpoint to the model, +and how to use the `model` object. + +### Load/Save a Checkpoint +```python +from detectron2.checkpoint import DetectionCheckpointer +DetectionCheckpointer(model).load(file_path) # load a file to model + +checkpointer = DetectionCheckpointer(model, save_dir="output") +checkpointer.save("model_999") # save to output/model_999.pth +``` + +Detectron2's checkpointer recognizes models in pytorch's `.pth` format, as well as the `.pkl` files +in our model zoo. +See [API doc](../modules/checkpoint.html#detectron2.checkpoint.DetectionCheckpointer) +for more details about its usage. + +The model files can be arbitrarily manipulated using `torch.{load,save}` for `.pth` files or +`pickle.{dump,load}` for `.pkl` files. + +### Use a Model + +A model can be called by `outputs = model(inputs)`, where `inputs` is a `list[dict]`. +Each dict corresponds to one image and the required keys +depend on the type of model, and whether the model is in training or evaluation mode. +For example, in order to do inference, +all existing models expect the "image" key, and optionally "height" and "width". +The detailed format of inputs and outputs of existing models are explained below. + +When in training mode, all models are required to be used under an `EventStorage`. 
+The training statistics will be put into the storage: +```python +from detectron2.utils.events import EventStorage +with EventStorage() as storage: + losses = model(inputs) +``` + +If you only want to do simple inference using an existing model, +[DefaultPredictor](../modules/engine.html#detectron2.engine.defaults.DefaultPredictor) +is a wrapper around model that provides such basic functionality. +It includes default behavior including model loading, preprocessing, +and operates on single image rather than batches. + +### Model Input Format + +Users can implement custom models that support any arbitrary input format. +Here we describe the standard input format that all builtin models support in detectron2. +They all take a `list[dict]` as the inputs. Each dict +corresponds to information about one image. + +The dict may contain the following keys: + +* "image": `Tensor` in (C, H, W) format. The meaning of channels are defined by `cfg.INPUT.FORMAT`. + Image normalization, if any, will be performed inside the model using + `cfg.MODEL.PIXEL_{MEAN,STD}`. +* "instances": an [Instances](../modules/structures.html#detectron2.structures.Instances) + object, with the following fields: + + "gt_boxes": a [Boxes](../modules/structures.html#detectron2.structures.Boxes) object storing N boxes, one for each instance. + + "gt_classes": `Tensor` of long type, a vector of N labels, in range [0, num_categories). + + "gt_masks": a [PolygonMasks](../modules/structures.html#detectron2.structures.PolygonMasks) + or [BitMasks](../modules/structures.html#detectron2.structures.BitMasks) object storing N masks, one for each instance. + + "gt_keypoints": a [Keypoints](../modules/structures.html#detectron2.structures.Keypoints) + object storing N keypoint sets, one for each instance. +* "proposals": an [Instances](../modules/structures.html#detectron2.structures.Instances) + object used only in Fast R-CNN style models, with the following fields: + + "proposal_boxes": a [Boxes](../modules/structures.html#detectron2.structures.Boxes) object storing P proposal boxes. + + "objectness_logits": `Tensor`, a vector of P scores, one for each proposal. +* "height", "width": the **desired** output height and width, which is not necessarily the same + as the height or width of the `image` input field. + For example, the `image` input field might be a resized image, + but you may want the outputs to be in **original** resolution. + + If provided, the model will produce output in this resolution, + rather than in the resolution of the `image` as input into the model. This is more efficient and accurate. +* "sem_seg": `Tensor[int]` in (H, W) format. The semantic segmentation ground truth. + Values represent category labels starting from 0. + + +#### How it connects to data loader: + +The output of the default [DatasetMapper]( ../modules/data.html#detectron2.data.DatasetMapper) is a dict +that follows the above format. +After the data loader performs batching, it becomes `list[dict]` which the builtin models support. + + +### Model Output Format + +When in training mode, the builtin models output a `dict[str->ScalarTensor]` with all the losses. + +When in inference mode, the builtin models output a `list[dict]`, one dict for each image. 
+Based on the tasks the model is doing, each dict may contain the following fields: + +* "instances": [Instances](../modules/structures.html#detectron2.structures.Instances) + object with the following fields: + * "pred_boxes": [Boxes](../modules/structures.html#detectron2.structures.Boxes) object storing N boxes, one for each detected instance. + * "scores": `Tensor`, a vector of N scores. + * "pred_classes": `Tensor`, a vector of N labels in range [0, num_categories). + + "pred_masks": a `Tensor` of shape (N, H, W), masks for each detected instance. + + "pred_keypoints": a `Tensor` of shape (N, num_keypoint, 3). + Each row in the last dimension is (x, y, score). Scores are larger than 0. +* "sem_seg": `Tensor` of (num_categories, H, W), the semantic segmentation prediction. +* "proposals": [Instances](../modules/structures.html#detectron2.structures.Instances) + object with the following fields: + * "proposal_boxes": [Boxes](../modules/structures.html#detectron2.structures.Boxes) + object storing N boxes. + * "objectness_logits": a torch vector of N scores. +* "panoptic_seg": A tuple of `(Tensor, list[dict])`. The tensor has shape (H, W), where each element + represent the segment id of the pixel. Each dict describes one segment id and has the following fields: + * "id": the segment id + * "isthing": whether the segment is a thing or stuff + * "category_id": the category id of this segment. It represents the thing + class id when `isthing==True`, and the stuff class id otherwise. + + +### Partially execute a model: + +Sometimes you may want to obtain an intermediate tensor inside a model. +Since there are typically hundreds of intermediate tensors, there isn't an API that provides you +the intermediate result you need. +You have the following options: + +1. Write a (sub)model. Following the [tutorial](./write-models.md), you can + rewrite a model component (e.g. a head of a model), such that it + does the same thing as the existing component, but returns the output + you need. +2. Partially execute a model. You can create the model as usual, + but use custom code to execute it instead of its `forward()`. For example, + the following code obtains mask features before mask head. + +```python +images = ImageList.from_tensors(...) # preprocessed input tensor +model = build_model(cfg) +features = model.backbone(images.tensor) +proposals, _ = model.proposal_generator(images, features) +instances = model.roi_heads._forward_box(features, proposals) +mask_features = [features[f] for f in model.roi_heads.in_features] +mask_features = model.roi_heads.mask_pooler(mask_features, [x.pred_boxes for x in instances]) +``` + +Note that both options require you to read the existing forward code to understand +how to write code to obtain the outputs you need. diff --git a/docs/tutorials/training.md b/docs/tutorials/training.md new file mode 100644 index 0000000000000000000000000000000000000000..dc7d537254c398252e3b91c25e33489aa91709c4 --- /dev/null +++ b/docs/tutorials/training.md @@ -0,0 +1,50 @@ +# Training + +From the previous tutorials, you may now have a custom model and data loader. + +You are free to create your own optimizer, and write the training logic: it's +usually easy with PyTorch, and allow researchers to see the entire training +logic more clearly and have full control. +One such example is provided in [tools/plain_train_net.py](../../tools/plain_train_net.py). 
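+
+For reference, a bare-bones loop that trains a detectron2 model directly might look like the
+sketch below. This is only an illustration, not the official script: `my_cfg.yaml` and `max_iter`
+are placeholders, and [tools/plain_train_net.py](../../tools/plain_train_net.py) remains the
+complete, maintained example.
+```python
+from detectron2.config import get_cfg
+from detectron2.data import build_detection_train_loader
+from detectron2.modeling import build_model
+from detectron2.solver import build_lr_scheduler, build_optimizer
+from detectron2.utils.events import EventStorage
+
+cfg = get_cfg()
+cfg.merge_from_file("my_cfg.yaml")  # placeholder config file
+
+model = build_model(cfg)
+model.train()
+optimizer = build_optimizer(cfg, model)
+scheduler = build_lr_scheduler(cfg, optimizer)
+data_loader = build_detection_train_loader(cfg)
+
+max_iter = 90000  # placeholder; usually taken from cfg.SOLVER.MAX_ITER
+with EventStorage(start_iter=0) as storage:  # models must run under an EventStorage in training mode
+    for data, iteration in zip(data_loader, range(max_iter)):
+        loss_dict = model(data)          # dict of scalar loss tensors
+        losses = sum(loss_dict.values())
+        optimizer.zero_grad()
+        losses.backward()
+        optimizer.step()
+        scheduler.step()
+        storage.step()                   # advance the storage to the next iteration
+```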
+
+We also provide a standardized "trainer" abstraction with a
+[minimal hook system](../modules/engine.html#detectron2.engine.HookBase)
+that helps simplify the standard types of training.
+
+You can use
+[SimpleTrainer().train()](../modules/engine.html#detectron2.engine.SimpleTrainer),
+which provides a minimal abstraction for single-cost, single-optimizer, single-data-source training.
+The builtin `train_net.py` script uses
+[DefaultTrainer().train()](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer),
+which includes more standard default behavior that one might want to opt into,
+including default configurations for learning rate schedule,
+logging, evaluation, checkpointing, etc.
+This also means that it's less likely to support some non-standard behavior
+you might want during research.
+
+To customize the training loop, you can:
+
+1. If your customization is similar to what `DefaultTrainer` is already doing,
+you can change the behavior of `DefaultTrainer` by overwriting [its methods](../modules/engine.html#detectron2.engine.defaults.DefaultTrainer)
+in a subclass, like what [tools/train_net.py](../../tools/train_net.py) does.
+2. If you need something very novel, you can start from [tools/plain_train_net.py](../../tools/plain_train_net.py) to implement it yourself.
+
+### Logging of Metrics
+
+During training, metrics are saved to a centralized [EventStorage](../modules/utils.html#detectron2.utils.events.EventStorage).
+You can use the following code to access it and log metrics to it:
+```
+from detectron2.utils.events import get_event_storage
+
+# inside the model:
+if self.training:
+    value = ...  # compute the value from inputs
+    storage = get_event_storage()
+    storage.put_scalar("some_accuracy", value)
+```
+
+Refer to its documentation for more details.
+
+Metrics are then saved to various destinations with [EventWriter](../modules/utils.html#module-detectron2.utils.events).
+DefaultTrainer enables a few `EventWriter` with default configurations.
+See above for how to customize them.
diff --git a/docs/tutorials/write-models.md b/docs/tutorials/write-models.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb87d586d609ca94240f32f2eaab7eadb0d07b93
--- /dev/null
+++ b/docs/tutorials/write-models.md
@@ -0,0 +1,39 @@
+# Write Models
+
+If you are trying to do something completely new, you may wish to implement
+a model entirely from scratch within detectron2. However, in many situations you may
+be interested in modifying or extending some components of an existing model.
+Therefore, we also provide a registration mechanism that lets you override the
+behavior of certain internal components of standard models.
+
+For example, to add a new backbone, import this code in your own code:
+```python
+from torch import nn
+
+from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec
+
+@BACKBONE_REGISTRY.register()
+class ToyBackBone(Backbone):
+    def __init__(self, cfg, input_shape):
+        super().__init__()
+        # create your own backbone
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=16, padding=3)
+
+    def forward(self, image):
+        return {"conv1": self.conv1(image)}
+
+    def output_shape(self):
+        return {"conv1": ShapeSpec(channels=64, stride=16)}
+```
+Then, you can use `cfg.MODEL.BACKBONE.NAME = 'ToyBackBone'` in your config object.
+`build_model(cfg)` will then call your `ToyBackBone` instead.
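+
+As a quick sanity check, you can build just the backbone through the registry. This is a minimal
+sketch (the input tensor and comments are only illustrative), assuming the module defining
+`ToyBackBone` has been imported so that the registration decorator has run:
+```python
+import torch
+
+from detectron2.config import get_cfg
+from detectron2.modeling import build_backbone
+
+cfg = get_cfg()
+cfg.MODEL.BACKBONE.NAME = "ToyBackBone"   # the name registered above
+backbone = build_backbone(cfg)            # looked up in BACKBONE_REGISTRY
+features = backbone(torch.zeros(1, 3, 224, 224))
+print(features["conv1"].shape)            # a stride-16 feature map with 64 channels
+```
+Note that when `ToyBackBone` is used inside a full model, the other parts of the config (for
+example the heads' `IN_FEATURES`) must refer to feature names that the backbone actually
+produces ("conv1" here).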
+ +As another example, to add new abilities to the ROI heads in the Generalized R-CNN meta-architecture, +you can implement a new +[ROIHeads](../modules/modeling.html#detectron2.modeling.ROIHeads) subclass and put it in the `ROI_HEADS_REGISTRY`. +See [densepose in detectron2](../../projects/DensePose) +and [meshrcnn](https://github.com/facebookresearch/meshrcnn) +for examples that implement new ROIHeads to perform new tasks. +And [projects/](../../projects/) +contains more examples that implement different architectures. + +A complete list of registries can be found in [API documentation](../modules/modeling.html#model-registries). +You can register components in these registries to customize different parts of a model, or the +entire model. diff --git a/projects/DensePose/README.md b/projects/DensePose/README.md new file mode 100644 index 0000000000000000000000000000000000000000..fd2f1ee3382365ab53ae44471c90266dff42d883 --- /dev/null +++ b/projects/DensePose/README.md @@ -0,0 +1,54 @@ +# DensePose in Detectron2 +**Dense Human Pose Estimation In The Wild** + +_Rıza Alp Güler, Natalia Neverova, Iasonas Kokkinos_ + +[[`densepose.org`](https://densepose.org)] [[`arXiv`](https://arxiv.org/abs/1802.00434)] [[`BibTeX`](#CitingDensePose)] + +Dense human pose estimation aims at mapping all human pixels of an RGB image to the 3D surface of the human body. + +
+
+ +In this repository, we provide the code to train and evaluate DensePose-RCNN. We also provide tools to visualize +DensePose annotation and results. + +# Quick Start + +See [ Getting Started ](doc/GETTING_STARTED.md) + +# Model Zoo and Baselines + +We provide a number of baseline results and trained models available for download. See [Model Zoo](doc/MODEL_ZOO.md) for details. + +# License + +Detectron2 is released under the [Apache 2.0 license](../../LICENSE) + +## Citing DensePose + +If you use DensePose, please take the references from the following BibTeX entries: + +For DensePose with estimated confidences: + +``` +@InProceedings{Neverova2019DensePoseConfidences, + title = {Correlated Uncertainty for Learning Dense Correspondences from Noisy Labels}, + author = {Neverova, Natalia and Novotny, David and Vedaldi, Andrea}, + journal = {Advances in Neural Information Processing Systems}, + year = {2019}, +} +``` + +For the original DensePose: + +``` +@InProceedings{Guler2018DensePose, + title={DensePose: Dense Human Pose Estimation In The Wild}, + author={R\{i}za Alp G\"uler, Natalia Neverova, Iasonas Kokkinos}, + journal={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2018} +} +``` + diff --git a/projects/DensePose/apply_net.py b/projects/DensePose/apply_net.py new file mode 100755 index 0000000000000000000000000000000000000000..7262f7c059b42225b809429654d34f29dbd2801f --- /dev/null +++ b/projects/DensePose/apply_net.py @@ -0,0 +1,318 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import argparse +import glob +import logging +import os +import pickle +import sys +from typing import Any, ClassVar, Dict, List +import torch + +from detectron2.config import get_cfg +from detectron2.data.detection_utils import read_image +from detectron2.engine.defaults import DefaultPredictor +from detectron2.structures.boxes import BoxMode +from detectron2.structures.instances import Instances +from detectron2.utils.logger import setup_logger + +from densepose import add_densepose_config +from densepose.utils.logger import verbosity_to_level +from densepose.vis.base import CompoundVisualizer +from densepose.vis.bounding_box import ScoredBoundingBoxVisualizer +from densepose.vis.densepose import ( + DensePoseResultsContourVisualizer, + DensePoseResultsFineSegmentationVisualizer, + DensePoseResultsUVisualizer, + DensePoseResultsVVisualizer, +) +from densepose.vis.extractor import CompoundExtractor, create_extractor + +DOC = """Apply Net - a tool to print / visualize DensePose results +""" + +LOGGER_NAME = "apply_net" +logger = logging.getLogger(LOGGER_NAME) + +_ACTION_REGISTRY: Dict[str, "Action"] = {} + + +class Action(object): + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + parser.add_argument( + "-v", + "--verbosity", + action="count", + help="Verbose mode. 
Multiple -v options increase the verbosity.", + ) + + +def register_action(cls: type): + """ + Decorator for action classes to automate action registration + """ + global _ACTION_REGISTRY + _ACTION_REGISTRY[cls.COMMAND] = cls + return cls + + +class InferenceAction(Action): + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + super(InferenceAction, cls).add_arguments(parser) + parser.add_argument("cfg", metavar="", help="Config file") + parser.add_argument("model", metavar="", help="Model file") + parser.add_argument("input", metavar="", help="Input data") + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + + @classmethod + def execute(cls: type, args: argparse.Namespace): + logger.info(f"Loading config from {args.cfg}") + opts = [] + cfg = cls.setup_config(args.cfg, args.model, args, opts) + logger.info(f"Loading model from {args.model}") + predictor = DefaultPredictor(cfg) + logger.info(f"Loading data from {args.input}") + file_list = cls._get_input_file_list(args.input) + if len(file_list) == 0: + logger.warning(f"No input images for {args.input}") + return + context = cls.create_context(args) + for file_name in file_list: + img = read_image(file_name, format="BGR") # predictor expects BGR image. + with torch.no_grad(): + outputs = predictor(img)["instances"] + cls.execute_on_outputs(context, {"file_name": file_name, "image": img}, outputs) + cls.postexecute(context) + + @classmethod + def setup_config( + cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str] + ): + cfg = get_cfg() + add_densepose_config(cfg) + cfg.merge_from_file(config_fpath) + cfg.merge_from_list(args.opts) + if opts: + cfg.merge_from_list(opts) + cfg.MODEL.WEIGHTS = model_fpath + cfg.freeze() + return cfg + + @classmethod + def _get_input_file_list(cls: type, input_spec: str): + if os.path.isdir(input_spec): + file_list = [ + os.path.join(input_spec, fname) + for fname in os.listdir(input_spec) + if os.path.isfile(os.path.join(input_spec, fname)) + ] + elif os.path.isfile(input_spec): + file_list = [input_spec] + else: + file_list = glob.glob(input_spec) + return file_list + + +@register_action +class DumpAction(InferenceAction): + """ + Dump action that outputs results to a pickle file + """ + + COMMAND: ClassVar[str] = "dump" + + @classmethod + def add_parser(cls: type, subparsers: argparse._SubParsersAction): + parser = subparsers.add_parser(cls.COMMAND, help="Dump model outputs to a file.") + cls.add_arguments(parser) + parser.set_defaults(func=cls.execute) + + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + super(DumpAction, cls).add_arguments(parser) + parser.add_argument( + "--output", + metavar="", + default="results.pkl", + help="File name to save dump to", + ) + + @classmethod + def execute_on_outputs( + cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances + ): + image_fpath = entry["file_name"] + logger.info(f"Processing {image_fpath}") + result = {"file_name": image_fpath} + if outputs.has("scores"): + result["scores"] = outputs.get("scores").cpu() + if outputs.has("pred_boxes"): + result["pred_boxes_XYXY"] = outputs.get("pred_boxes").tensor.cpu() + if outputs.has("pred_densepose"): + boxes_XYWH = BoxMode.convert( + result["pred_boxes_XYXY"], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS + ) + result["pred_densepose"] = outputs.get("pred_densepose").to_result(boxes_XYWH) + 
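+        # Accumulate per-image results here; postexecute() writes them out as a single pickle file.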
context["results"].append(result) + + @classmethod + def create_context(cls: type, args: argparse.Namespace): + context = {"results": [], "out_fname": args.output} + return context + + @classmethod + def postexecute(cls: type, context: Dict[str, Any]): + out_fname = context["out_fname"] + out_dir = os.path.dirname(out_fname) + if len(out_dir) > 0 and not os.path.exists(out_dir): + os.makedirs(out_dir) + with open(out_fname, "wb") as hFile: + pickle.dump(context["results"], hFile) + logger.info(f"Output saved to {out_fname}") + + +@register_action +class ShowAction(InferenceAction): + """ + Show action that visualizes selected entries on an image + """ + + COMMAND: ClassVar[str] = "show" + VISUALIZERS: ClassVar[Dict[str, object]] = { + "dp_contour": DensePoseResultsContourVisualizer, + "dp_segm": DensePoseResultsFineSegmentationVisualizer, + "dp_u": DensePoseResultsUVisualizer, + "dp_v": DensePoseResultsVVisualizer, + "bbox": ScoredBoundingBoxVisualizer, + } + + @classmethod + def add_parser(cls: type, subparsers: argparse._SubParsersAction): + parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries") + cls.add_arguments(parser) + parser.set_defaults(func=cls.execute) + + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + super(ShowAction, cls).add_arguments(parser) + parser.add_argument( + "visualizations", + metavar="", + help="Comma separated list of visualizations, possible values: " + "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))), + ) + parser.add_argument( + "--min_score", + metavar="", + default=0.8, + type=float, + help="Minimum detection score to visualize", + ) + parser.add_argument( + "--nms_thresh", metavar="", default=None, type=float, help="NMS threshold" + ) + parser.add_argument( + "--output", + metavar="", + default="outputres.png", + help="File name to save output to", + ) + + @classmethod + def setup_config( + cls: type, config_fpath: str, model_fpath: str, args: argparse.Namespace, opts: List[str] + ): + opts.append("MODEL.ROI_HEADS.SCORE_THRESH_TEST") + opts.append(str(args.min_score)) + if args.nms_thresh is not None: + opts.append("MODEL.ROI_HEADS.NMS_THRESH_TEST") + opts.append(str(args.nms_thresh)) + cfg = super(ShowAction, cls).setup_config(config_fpath, model_fpath, args, opts) + return cfg + + @classmethod + def execute_on_outputs( + cls: type, context: Dict[str, Any], entry: Dict[str, Any], outputs: Instances + ): + import cv2 + import numpy as np + + visualizer = context["visualizer"] + extractor = context["extractor"] + image_fpath = entry["file_name"] + logger.info(f"Processing {image_fpath}") + image = cv2.cvtColor(entry["image"], cv2.COLOR_BGR2GRAY) + image = np.tile(image[:, :, np.newaxis], [1, 1, 3]) + data = extractor(outputs) + image_vis = visualizer.visualize(image, data) + entry_idx = context["entry_idx"] + 1 + out_fname = cls._get_out_fname(entry_idx, context["out_fname"]) + out_dir = os.path.dirname(out_fname) + if len(out_dir) > 0 and not os.path.exists(out_dir): + os.makedirs(out_dir) + cv2.imwrite(out_fname, image_vis) + logger.info(f"Output saved to {out_fname}") + context["entry_idx"] += 1 + + @classmethod + def postexecute(cls: type, context: Dict[str, Any]): + pass + + @classmethod + def _get_out_fname(cls: type, entry_idx: int, fname_base: str): + base, ext = os.path.splitext(fname_base) + return base + ".{0:04d}".format(entry_idx) + ext + + @classmethod + def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]: + vis_specs = args.visualizations.split(",") + 
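+        # Build one visualizer and one matching extractor per requested visualization mode;
+        # they are then combined into a single CompoundVisualizer / CompoundExtractor pair.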
visualizers = [] + extractors = [] + for vis_spec in vis_specs: + vis = cls.VISUALIZERS[vis_spec]() + visualizers.append(vis) + extractor = create_extractor(vis) + extractors.append(extractor) + visualizer = CompoundVisualizer(visualizers) + extractor = CompoundExtractor(extractors) + context = { + "extractor": extractor, + "visualizer": visualizer, + "out_fname": args.output, + "entry_idx": 0, + } + return context + + +def create_argument_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=DOC, + formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120), + ) + parser.set_defaults(func=lambda _: parser.print_help(sys.stdout)) + subparsers = parser.add_subparsers(title="Actions") + for _, action in _ACTION_REGISTRY.items(): + action.add_parser(subparsers) + return parser + + +def main(): + parser = create_argument_parser() + args = parser.parse_args() + verbosity = args.verbosity if hasattr(args, "verbosity") else None + global logger + logger = setup_logger(name=LOGGER_NAME) + logger.setLevel(verbosity_to_level(verbosity)) + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml b/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3ed1bcd68744a22472cc8b391993e4175013dc42 --- /dev/null +++ b/projects/DensePose/configs/Base-DensePose-RCNN-FPN.yaml @@ -0,0 +1,47 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + FPN: + IN_FEATURES: ["res2", "res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + # Detectron1 uses 2000 proposals per-batch, + # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) + # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. 
+ POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + + DENSEPOSE_ON: True + ROI_HEADS: + NAME: "DensePoseROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + NUM_CLASSES: 1 + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + POOLER_SAMPLING_RATIO: 2 + POOLER_TYPE: "ROIAlign" + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseV1ConvXHead" + POOLER_TYPE: "ROIAlign" + NUM_COARSE_SEGM_CHANNELS: 2 +DATASETS: + TRAIN: ("densepose_coco_2014_train", "densepose_coco_2014_valminusminival") + TEST: ("densepose_coco_2014_minival",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.01 + STEPS: (60000, 80000) + MAX_ITER: 90000 + WARMUP_FACTOR: 0.1 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..15475b1ac3bb7272a7ebc0061a55119ffd2591b9 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC1_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseDeepLabHead" + UV_CONFIDENCE: + ENABLED: True + TYPE: "iid_iso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7546b967ab89129c9a276f19b1cf2d6b59f1a462 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_WC2_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseDeepLabHead" + UV_CONFIDENCE: + ENABLED: True + TYPE: "indep_aniso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..045f7f02f1b4eb0c0ef1733c3ac65e3aa70168de --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_DL_s1x.yaml @@ -0,0 +1,10 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseDeepLabHead" +SOLVER: + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ace62094fbc4ce2024810333c11c7a955d8eeb22 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC1_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 + ROI_DENSEPOSE_HEAD: + UV_CONFIDENCE: + ENABLED: True + TYPE: "iid_iso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) + WARMUP_FACTOR: 0.025 diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml 
b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..766c098f6dcdd1fb3f67957d7d1d982b37747b96 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_WC2_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 + ROI_DENSEPOSE_HEAD: + UV_CONFIDENCE: + ENABLED: True + TYPE: "indep_aniso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) + WARMUP_FACTOR: 0.025 diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af44fb767edf9bf093463e62f93e070d0d019c5a --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x.yaml @@ -0,0 +1,8 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 +SOLVER: + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8e79a1b9549cf19ed4a43cf9caf3dc88f6133310 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_101_FPN_s1x_legacy.yaml @@ -0,0 +1,17 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + RESNETS: + DEPTH: 101 + ROI_DENSEPOSE_HEAD: + NUM_COARSE_SEGM_CHANNELS: 15 + POOLER_RESOLUTION: 14 + HEATMAP_SIZE: 56 + INDEX_WEIGHTS: 2.0 + PART_WEIGHTS: 0.3 + POINT_REGRESSION_WEIGHTS: 0.1 + DECODER_ON: False +SOLVER: + BASE_LR: 0.002 + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f3720eff56ce042a68da6c99f484b963cae2c7d9 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC1_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseDeepLabHead" + UV_CONFIDENCE: + ENABLED: True + TYPE: "iid_iso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a47cc05e6e9dc882778c6b502d93cbcec88fb88 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_WC2_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseDeepLabHead" + UV_CONFIDENCE: + ENABLED: True + TYPE: "indep_aniso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..52a170b4a28289ad943314f77256e34800d23121 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_DL_s1x.yaml @@ -0,0 +1,10 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseDeepLabHead" +SOLVER: + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d36e54256ac22f1b01604e54430da24972f06eeb --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC1_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + UV_CONFIDENCE: + ENABLED: True + TYPE: "iid_iso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) + WARMUP_FACTOR: 0.025 diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e880d469564a3757ba3f4d708054074cefda49b6 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_WC2_s1x.yaml @@ -0,0 +1,16 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + UV_CONFIDENCE: + ENABLED: True + TYPE: "indep_aniso" + POINT_REGRESSION_WEIGHTS: 0.0005 +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 130000 + STEPS: (100000, 120000) + WARMUP_FACTOR: 0.025 diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d2dd14c6f92f3850b99e6f1c828c0fcee52120e1 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x.yaml @@ -0,0 +1,8 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +SOLVER: + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6c5391f3b3c3d437312a290d29b0656cb3804b25 --- /dev/null +++ b/projects/DensePose/configs/densepose_rcnn_R_50_FPN_s1x_legacy.yaml @@ -0,0 +1,17 @@ +_BASE_: "Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + NUM_COARSE_SEGM_CHANNELS: 15 + POOLER_RESOLUTION: 14 + HEATMAP_SIZE: 56 + INDEX_WEIGHTS: 2.0 + PART_WEIGHTS: 0.3 + POINT_REGRESSION_WEIGHTS: 0.1 + DECODER_ON: False +SOLVER: + BASE_LR: 0.002 + MAX_ITER: 130000 + STEPS: (100000, 120000) diff --git a/projects/DensePose/configs/evolution/Base-RCNN-FPN-MC.yaml b/projects/DensePose/configs/evolution/Base-RCNN-FPN-MC.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a20882a9fd275bac3e3cf49c128684c73085ca1 --- /dev/null +++ b/projects/DensePose/configs/evolution/Base-RCNN-FPN-MC.yaml @@ -0,0 +1,91 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_resnet_fpn_backbone" + RESNETS: + 
OUT_FEATURES: ["res2", "res3", "res4", "res5"] + FPN: + IN_FEATURES: ["res2", "res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + # Detectron1 uses 2000 proposals per-batch, + # (See "modeling/rpn/rpn_outputs.py" for details of this legacy issue) + # which is approximately 1000 proposals per-image since the default batch size for FPN is 2. + POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + ROI_HEADS: + NAME: "StandardROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + NUM_CLASSES: 1 + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_MASK_HEAD: + NAME: "MaskRCNNConvUpsampleHead" + NUM_CONV: 4 + POOLER_RESOLUTION: 14 +DATASETS: + TRAIN: ("base_coco_2017_train",) + TEST: ("base_coco_2017_val", "densepose_chimps") + CATEGORY_MAPS: + "base_coco_2017_train": + "16": 1 # bird -> person + "17": 1 # cat -> person + "18": 1 # dog -> person + "19": 1 # horse -> person + "20": 1 # sheep -> person + "21": 1 # cow -> person + "22": 1 # elephant -> person + "23": 1 # bear -> person + "24": 1 # zebra -> person + "25": 1 # girafe -> person + "base_coco_2017_val": + "16": 1 # bird -> person + "17": 1 # cat -> person + "18": 1 # dog -> person + "19": 1 # horse -> person + "20": 1 # sheep -> person + "21": 1 # cow -> person + "22": 1 # elephant -> person + "23": 1 # bear -> person + "24": 1 # zebra -> person + "25": 1 # girafe -> person + WHITELISTED_CATEGORIES: + "base_coco_2017_train": + - 1 # person + - 16 # bird + - 17 # cat + - 18 # dog + - 19 # horse + - 20 # sheep + - 21 # cow + - 22 # elephant + - 23 # bear + - 24 # zebra + - 25 # girafe + "base_coco_2017_val": + - 1 # person + - 16 # bird + - 17 # cat + - 18 # dog + - 19 # horse + - 20 # sheep + - 21 # cow + - 22 # elephant + - 23 # bear + - 24 # zebra + - 25 # girafe +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +VERSION: 2 diff --git a/projects/DensePose/configs/evolution/faster_rcnn_R_50_FPN_1x_MC.yaml b/projects/DensePose/configs/evolution/faster_rcnn_R_50_FPN_1x_MC.yaml new file mode 100644 index 0000000000000000000000000000000000000000..80139ad9e40c09fdd862cdac80aa18c5cabc0a1e --- /dev/null +++ b/projects/DensePose/configs/evolution/faster_rcnn_R_50_FPN_1x_MC.yaml @@ -0,0 +1,7 @@ +_BASE_: "Base-RCNN-FPN-MC.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + DENSEPOSE_ON: False + RESNETS: + DEPTH: 50 diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b90989eef81e27d23119d2cd4627e8cea211ac51 --- /dev/null +++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_DL_instant_test.yaml @@ -0,0 +1,11 @@ +_BASE_: "../Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + ROI_DENSEPOSE_HEAD: + NAME: "DensePoseDeepLabHead" +DATASETS: + TRAIN: ("densepose_coco_2014_minival_100",) + TEST: ("densepose_coco_2014_minival_100",) +SOLVER: + MAX_ITER: 40 + STEPS: (30,) diff --git 
a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d412740340d924bacc3baa57f32bfea0b871511 --- /dev/null +++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_TTA_inference_acc_test.yaml @@ -0,0 +1,13 @@ +_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml" +MODEL: + WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl" +DATASETS: + TRAIN: () + TEST: ("densepose_coco_2014_minival_100",) +TEST: + AUG: + ENABLED: True + MIN_SIZES: (400, 500, 600, 700, 800, 900, 1000, 1100, 1200) + MAX_SIZE: 4000 + FLIP: True + EXPECTED_RESULTS: [["bbox_TTA", "AP", 61.74, 0.03], ["densepose_gps_TTA", "AP", 60.22, 0.03], ["densepose_gpsm_TTA", "AP", 63.85, 0.03]] diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0fe61151adf255baba717f3e65ff6fab52829a6 --- /dev/null +++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC1_instant_test.yaml @@ -0,0 +1,19 @@ +_BASE_: "../Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + UV_CONFIDENCE: + ENABLED: True + TYPE: "iid_iso" + POINT_REGRESSION_WEIGHTS: 0.0005 +DATASETS: + TRAIN: ("densepose_coco_2014_minival_100",) + TEST: ("densepose_coco_2014_minival_100",) +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 40 + STEPS: (30,) + WARMUP_FACTOR: 0.025 diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f0d9358c8846452314697a19b5e2ea9e075ddaeb --- /dev/null +++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_WC2_instant_test.yaml @@ -0,0 +1,19 @@ +_BASE_: "../Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 + ROI_DENSEPOSE_HEAD: + UV_CONFIDENCE: + ENABLED: True + TYPE: "indep_aniso" + POINT_REGRESSION_WEIGHTS: 0.0005 +DATASETS: + TRAIN: ("densepose_coco_2014_minival_100",) + TEST: ("densepose_coco_2014_minival_100",) +SOLVER: + CLIP_GRADIENTS: + ENABLED: True + MAX_ITER: 40 + STEPS: (30,) + WARMUP_FACTOR: 0.025 diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3c5a7d20989e774cbba2b443e3026a2361201d0f --- /dev/null +++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_inference_acc_test.yaml @@ -0,0 +1,8 @@ +_BASE_: "../densepose_rcnn_R_50_FPN_s1x.yaml" +MODEL: + WEIGHTS: "https://dl.fbaipublicfiles.com/densepose/densepose_rcnn_R_50_FPN_s1x/165712039/model_final_162be9.pkl" +DATASETS: + TRAIN: () + TEST: ("densepose_coco_2014_minival_100",) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 59.27, 0.025], ["densepose_gps", "AP", 60.11, 0.02], ["densepose_gpsm", "AP", 64.20, 0.02]] diff --git 
a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..057c8768186e8a818228aa2f028ba3007374c571 --- /dev/null +++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_instant_test.yaml @@ -0,0 +1,9 @@ +_BASE_: "../Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" +DATASETS: + TRAIN: ("densepose_coco_2014_minival_100",) + TEST: ("densepose_coco_2014_minival_100",) +SOLVER: + MAX_ITER: 40 + STEPS: (30,) diff --git a/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b991160c79e5a95feac22be30deea10d200178d4 --- /dev/null +++ b/projects/DensePose/configs/quick_schedules/densepose_rcnn_R_50_FPN_training_acc_test.yaml @@ -0,0 +1,14 @@ +_BASE_: "../Base-DensePose-RCNN-FPN.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + ROI_HEADS: + NUM_CLASSES: 1 +DATASETS: + TRAIN: ("densepose_coco_2014_minival",) + TEST: ("densepose_coco_2014_minival",) +SOLVER: + MAX_ITER: 6000 + STEPS: (5500, 5800) +TEST: + EXPECTED_RESULTS: [["bbox", "AP", 58.27, 1.0], ["densepose_gps", "AP", 42.47, 1.5], ["densepose_gpsm", "AP", 49.20, 1.5]] + diff --git a/projects/DensePose/densepose/__init__.py b/projects/DensePose/densepose/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aea5a1a9c3e63ce168a41545322599ccc4adbbb8 --- /dev/null +++ b/projects/DensePose/densepose/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .data.datasets import builtin # just to register data +from .config import add_densepose_config, add_dataset_category_config +from .densepose_head import ROI_DENSEPOSE_HEAD_REGISTRY +from .evaluator import DensePoseCOCOEvaluator +from .roi_head import DensePoseROIHeads +from .data.structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData +from .modeling.test_time_augmentation import DensePoseGeneralizedRCNNWithTTA +from .utils.transform import load_from_cfg diff --git a/projects/DensePose/densepose/config.py b/projects/DensePose/densepose/config.py new file mode 100644 index 0000000000000000000000000000000000000000..2d76056b362beb7c0832e775b9e3415dd42767a5 --- /dev/null +++ b/projects/DensePose/densepose/config.py @@ -0,0 +1,68 @@ +# -*- coding = utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from detectron2.config import CfgNode as CN + + +def add_dataset_category_config(cfg: CN): + """ + Add config for additional category-related dataset options + - category whitelisting + - category mapping + """ + _C = cfg + _C.DATASETS.CATEGORY_MAPS = CN(new_allowed=True) + _C.DATASETS.WHITELISTED_CATEGORIES = CN(new_allowed=True) + + +def add_densepose_config(cfg: CN): + """ + Add config for densepose head. 
+ """ + _C = cfg + + _C.MODEL.DENSEPOSE_ON = True + + _C.MODEL.ROI_DENSEPOSE_HEAD = CN() + _C.MODEL.ROI_DENSEPOSE_HEAD.NAME = "" + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS = 8 + # Number of parts used for point labels + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES = 24 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL = 4 + _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM = 512 + _C.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL = 3 + _C.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE = 2 + _C.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE = 112 + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE = "ROIAlignV2" + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION = 28 + _C.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO = 2 + _C.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS = 2 # 15 or 2 + # Overlap threshold for an RoI to be considered foreground (if >= FG_IOU_THRESHOLD) + _C.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD = 0.7 + # Loss weights for annotation masks.(14 Parts) + _C.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS = 5.0 + # Loss weights for surface parts. (24 Parts) + _C.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS = 1.0 + # Loss weights for UV regression. + _C.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS = 0.01 + # For Decoder + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON = True + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES = 256 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS = 256 + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM = "" + _C.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE = 4 + # For DeepLab head + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB = CN() + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM = "GN" + _C.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON = 0 + # Confidences + # Enable learning confidences (variances) along with the actual values + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE = CN({"ENABLED": False}) + # UV confidence lower bound + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON = 0.01 + # Statistical model type for confidence learning, possible values: + # - "iid_iso": statistically independent identically distributed residuals + # with isotropic covariance + # - "indep_aniso": statistically independent residuals with anisotropic + # covariances + _C.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE = "iid_iso" diff --git a/projects/DensePose/densepose/data/__init__.py b/projects/DensePose/densepose/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b5e8d3b0be90f14580722f540b161efdb18c0c08 --- /dev/null +++ b/projects/DensePose/densepose/data/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from .build import build_detection_test_loader, build_detection_train_loader +from .dataset_mapper import DatasetMapper + +# ensure the builtin datasets are registered +from . import datasets + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/projects/DensePose/densepose/data/build.py b/projects/DensePose/densepose/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..cb32e3e8178e3198403963e7a641c576a5d51a04 --- /dev/null +++ b/projects/DensePose/densepose/data/build.py @@ -0,0 +1,405 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import itertools +import logging +import numpy as np +import operator +from typing import Any, Callable, Collection, Dict, Iterable, List, Optional +import torch + +from detectron2.config import CfgNode +from detectron2.data import samplers +from detectron2.data.build import ( + load_proposals_into_dataset, + print_instances_class_histogram, + trivial_batch_collator, + worker_init_reset_seed, +) +from detectron2.data.catalog import DatasetCatalog, MetadataCatalog +from detectron2.data.common import AspectRatioGroupedDataset, DatasetFromList, MapDataset +from detectron2.utils.comm import get_world_size + +from .dataset_mapper import DatasetMapper +from .datasets.coco import DENSEPOSE_KEYS_WITHOUT_MASK as DENSEPOSE_COCO_KEYS_WITHOUT_MASK +from .datasets.coco import DENSEPOSE_MASK_KEY as DENSEPOSE_COCO_MASK_KEY + +__all__ = ["build_detection_train_loader", "build_detection_test_loader"] + + +Instance = Dict[str, Any] +InstancePredicate = Callable[[Instance], bool] + + +def _compute_num_images_per_worker(cfg: CfgNode): + num_workers = get_world_size() + images_per_batch = cfg.SOLVER.IMS_PER_BATCH + assert ( + images_per_batch % num_workers == 0 + ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number of workers ({}).".format( + images_per_batch, num_workers + ) + assert ( + images_per_batch >= num_workers + ), "SOLVER.IMS_PER_BATCH ({}) must be larger than the number of workers ({}).".format( + images_per_batch, num_workers + ) + images_per_worker = images_per_batch // num_workers + return images_per_worker + + +def _map_category_id_to_contiguous_id(dataset_name: str, dataset_dicts: Iterable[Instance]): + meta = MetadataCatalog.get(dataset_name) + for dataset_dict in dataset_dicts: + for ann in dataset_dict["annotations"]: + ann["category_id"] = meta.thing_dataset_id_to_contiguous_id[ann["category_id"]] + + +def _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names: Iterable[str]): + # merge categories for all datasets + merged_categories = {} + for dataset_name in dataset_names: + meta = MetadataCatalog.get(dataset_name) + for cat_id, cat_name in meta.categories.items(): + if cat_id not in merged_categories: + merged_categories[cat_id] = (cat_name, dataset_name) + continue + cat_name_other, dataset_name_other = merged_categories[cat_id] + if cat_name_other != cat_name: + raise ValueError( + f"Incompatible categories for category ID {cat_id}: " + f'dataset {dataset_name} value "{cat_name}", ' + f'dataset {dataset_name_other} value "{cat_name_other}"' + ) + + merged_cat_id_to_cont_id = {} + for i, cat_id in enumerate(sorted(merged_categories.keys())): + merged_cat_id_to_cont_id[cat_id] = i + + # add category maps to metadata + for dataset_name in dataset_names: + meta = MetadataCatalog.get(dataset_name) + categories = meta.get("categories") + meta.thing_classes = [categories[cat_id] for cat_id in sorted(categories.keys())] + meta.thing_dataset_id_to_contiguous_id = { + cat_id: merged_cat_id_to_cont_id[cat_id] for cat_id in sorted(categories.keys()) + } + meta.thing_contiguous_id_to_dataset_id = { + merged_cat_id_to_cont_id[cat_id]: cat_id for cat_id in sorted(categories.keys()) + } + + +def _maybe_create_general_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]: + def has_annotations(instance: Instance) -> bool: + return "annotations" in instance + + def has_only_crowd_anotations(instance: Instance) -> bool: + for ann in instance["annotations"]: + if ann.get("is_crowd", 0) == 0: + return False + return True + + def 
general_keep_instance_predicate(instance: Instance) -> bool: + return has_annotations(instance) and not has_only_crowd_anotations(instance) + + if not cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS: + return None + return general_keep_instance_predicate + + +def _maybe_create_keypoints_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]: + + min_num_keypoints = cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + + def has_sufficient_num_keypoints(instance: Instance) -> bool: + num_kpts = sum( + (np.array(ann["keypoints"][2::3]) > 0).sum() + for ann in instance["annotations"] + if "keypoints" in ann + ) + return num_kpts >= min_num_keypoints + + if cfg.MODEL.KEYPOINT_ON and (min_num_keypoints > 0): + return has_sufficient_num_keypoints + return None + + +def _maybe_create_mask_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]: + if not cfg.MODEL.MASK_ON: + return None + + def has_mask_annotations(instance: Instance) -> bool: + return any("segmentation" in ann for ann in instance["annotations"]) + + return has_mask_annotations + + +def _maybe_create_densepose_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]: + if not cfg.MODEL.DENSEPOSE_ON: + return None + + def has_densepose_annotations(instance: Instance) -> bool: + for ann in instance["annotations"]: + if all(key in ann for key in DENSEPOSE_COCO_KEYS_WITHOUT_MASK) and ( + (DENSEPOSE_COCO_MASK_KEY in ann) or ("segmentation" in ann) + ): + return True + return False + + return has_densepose_annotations + + +def _maybe_create_specific_keep_instance_predicate(cfg: CfgNode) -> Optional[InstancePredicate]: + specific_predicate_creators = [ + _maybe_create_keypoints_keep_instance_predicate, + _maybe_create_mask_keep_instance_predicate, + _maybe_create_densepose_keep_instance_predicate, + ] + predicates = [creator(cfg) for creator in specific_predicate_creators] + predicates = [p for p in predicates if p is not None] + if not predicates: + return None + + def combined_predicate(instance: Instance) -> bool: + return any(p(instance) for p in predicates) + + return combined_predicate + + +def _get_train_keep_instance_predicate(cfg: CfgNode): + general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg) + combined_specific_keep_predicate = _maybe_create_specific_keep_instance_predicate(cfg) + + def combined_general_specific_keep_predicate(instance: Instance) -> bool: + return general_keep_predicate(instance) and combined_specific_keep_predicate(instance) + + if (general_keep_predicate is None) and (combined_specific_keep_predicate is None): + return None + if general_keep_predicate is None: + return combined_specific_keep_predicate + if combined_specific_keep_predicate is None: + return general_keep_predicate + return combined_general_specific_keep_predicate + + +def _get_test_keep_instance_predicate(cfg: CfgNode): + general_keep_predicate = _maybe_create_general_keep_instance_predicate(cfg) + return general_keep_predicate + + +def _maybe_filter_and_map_categories( + dataset_name: str, dataset_dicts: List[Instance] +) -> List[Instance]: + meta = MetadataCatalog.get(dataset_name) + whitelisted_categories = meta.get("whitelisted_categories") + category_map = meta.get("category_map", {}) + if whitelisted_categories is None and not category_map: + return dataset_dicts + filtered_dataset_dicts = [] + for dataset_dict in dataset_dicts: + anns = [] + for ann in dataset_dict["annotations"]: + cat_id = ann["category_id"] + if whitelisted_categories is not None and cat_id not in 
whitelisted_categories: + continue + ann["category_id"] = category_map.get(cat_id, cat_id) + anns.append(ann) + dataset_dict["annotations"] = anns + filtered_dataset_dicts.append(dataset_dict) + return filtered_dataset_dicts + + +def _add_category_whitelists_to_metadata(cfg: CfgNode): + for dataset_name, whitelisted_cat_ids in cfg.DATASETS.WHITELISTED_CATEGORIES.items(): + meta = MetadataCatalog.get(dataset_name) + meta.whitelisted_categories = whitelisted_cat_ids + logger = logging.getLogger(__name__) + logger.info( + "Whitelisted categories for dataset {}: {}".format( + dataset_name, meta.whitelisted_categories + ) + ) + + +def _add_category_maps_to_metadata(cfg: CfgNode): + for dataset_name, category_map in cfg.DATASETS.CATEGORY_MAPS.items(): + category_map = { + int(cat_id_src): int(cat_id_dst) for cat_id_src, cat_id_dst in category_map.items() + } + meta = MetadataCatalog.get(dataset_name) + meta.category_map = category_map + logger = logging.getLogger(__name__) + logger.info("Category maps for dataset {}: {}".format(dataset_name, meta.category_map)) + + +def combine_detection_dataset_dicts( + dataset_names: Collection[str], + keep_instance_predicate: Optional[InstancePredicate] = None, + proposal_files: Optional[Collection[str]] = None, +) -> List[Instance]: + """ + Load and prepare dataset dicts for training / testing + + Args: + dataset_names (Collection[str]): a list of dataset names + keep_instance_predicate (Callable: Dict[str, Any] -> bool): predicate + applied to instance dicts which defines whether to keep the instance + proposal_files (Collection[str]): if given, a list of object proposal files + that match each dataset in `dataset_names`. + """ + assert len(dataset_names) + if proposal_files is None: + proposal_files = [None] * len(dataset_names) + assert len(dataset_names) == len(proposal_files) + # load annotations and dataset metadata + dataset_map = {} + for dataset_name in dataset_names: + dataset_dicts = DatasetCatalog.get(dataset_name) + dataset_map[dataset_name] = dataset_dicts + # initialize category maps + _add_category_id_to_contiguous_id_maps_to_metadata(dataset_names) + # apply category maps + all_datasets_dicts = [] + for dataset_name, proposal_file in zip(dataset_names, proposal_files): + dataset_dicts = dataset_map[dataset_name] + assert len(dataset_dicts), f"Dataset '{dataset_name}' is empty!" + if proposal_file is not None: + dataset_dicts = load_proposals_into_dataset(dataset_dicts, proposal_file) + dataset_dicts = _maybe_filter_and_map_categories(dataset_name, dataset_dicts) + _map_category_id_to_contiguous_id(dataset_name, dataset_dicts) + print_instances_class_histogram( + dataset_dicts, MetadataCatalog.get(dataset_name).thing_classes + ) + all_datasets_dicts.append(dataset_dicts) + + if keep_instance_predicate is not None: + all_datasets_dicts_plain = [ + d + for d in itertools.chain.from_iterable(all_datasets_dicts) + if keep_instance_predicate(d) + ] + else: + all_datasets_dicts_plain = list(itertools.chain.from_iterable(all_datasets_dicts)) + return all_datasets_dicts_plain + + +def build_detection_train_loader(cfg: CfgNode, mapper=None): + """ + A data loader is created in a way similar to that of Detectron2. + The main differences are: + - it allows to combine datasets with different but compatible object category sets + + The data loader is created by the following steps: + 1. Use the dataset names in config to query :class:`DatasetCatalog`, and obtain a list of dicts. + 2. Start workers to work on the dicts. 
Each worker will: + * Map each metadata dict into another format to be consumed by the model. + * Batch them by simply putting dicts into a list. + The batched ``list[mapped_dict]`` is what this dataloader will return. + + Args: + cfg (CfgNode): the config + mapper (callable): a callable which takes a sample (dict) from dataset and + returns the format to be consumed by the model. + By default it will be `DatasetMapper(cfg, True)`. + + Returns: + an infinite iterator of training data + """ + images_per_worker = _compute_num_images_per_worker(cfg) + + _add_category_whitelists_to_metadata(cfg) + _add_category_maps_to_metadata(cfg) + dataset_dicts = combine_detection_dataset_dicts( + cfg.DATASETS.TRAIN, + keep_instance_predicate=_get_train_keep_instance_predicate(cfg), + proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, + ) + dataset = DatasetFromList(dataset_dicts, copy=False) + + if mapper is None: + mapper = DatasetMapper(cfg, True) + dataset = MapDataset(dataset, mapper) + + sampler_name = cfg.DATALOADER.SAMPLER_TRAIN + logger = logging.getLogger(__name__) + logger.info("Using training sampler {}".format(sampler_name)) + if sampler_name == "TrainingSampler": + sampler = samplers.TrainingSampler(len(dataset)) + elif sampler_name == "RepeatFactorTrainingSampler": + sampler = samplers.RepeatFactorTrainingSampler( + dataset_dicts, cfg.DATALOADER.REPEAT_THRESHOLD + ) + else: + raise ValueError("Unknown training sampler: {}".format(sampler_name)) + + if cfg.DATALOADER.ASPECT_RATIO_GROUPING: + data_loader = torch.utils.data.DataLoader( + dataset, + sampler=sampler, + num_workers=cfg.DATALOADER.NUM_WORKERS, + batch_sampler=None, + collate_fn=operator.itemgetter(0), # don't batch, but yield individual elements + worker_init_fn=worker_init_reset_seed, + ) # yield individual mapped dict + data_loader = AspectRatioGroupedDataset(data_loader, images_per_worker) + else: + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, images_per_worker, drop_last=True + ) + # drop_last so the batch always have the same size + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.DATALOADER.NUM_WORKERS, + batch_sampler=batch_sampler, + collate_fn=trivial_batch_collator, + worker_init_fn=worker_init_reset_seed, + ) + + return data_loader + + +def build_detection_test_loader(cfg, dataset_name, mapper=None): + """ + Similar to `build_detection_train_loader`. + But this function uses the given `dataset_name` argument (instead of the names in cfg), + and uses batch size 1. + + Args: + cfg: a detectron2 CfgNode + dataset_name (str): a name of the dataset that's available in the DatasetCatalog + mapper (callable): a callable which takes a sample (dict) from dataset + and returns the format to be consumed by the model. + By default it will be `DatasetMapper(cfg, False)`. + + Returns: + DataLoader: a torch DataLoader, that loads the given detection + dataset, with test-time transformation and batching. 
+ """ + _add_category_whitelists_to_metadata(cfg) + _add_category_maps_to_metadata(cfg) + dataset_dicts = combine_detection_dataset_dicts( + [dataset_name], + keep_instance_predicate=_get_test_keep_instance_predicate(cfg), + proposal_files=[ + cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(dataset_name)] + ] + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + + dataset = DatasetFromList(dataset_dicts) + if mapper is None: + mapper = DatasetMapper(cfg, False) + dataset = MapDataset(dataset, mapper) + + sampler = samplers.InferenceSampler(len(dataset)) + # Always use 1 image per worker during inference since this is the + # standard when reporting inference time in papers. + batch_sampler = torch.utils.data.sampler.BatchSampler(sampler, 1, drop_last=False) + + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=cfg.DATALOADER.NUM_WORKERS, + batch_sampler=batch_sampler, + collate_fn=trivial_batch_collator, + ) + return data_loader diff --git a/projects/DensePose/densepose/data/dataset_mapper.py b/projects/DensePose/densepose/data/dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..3eadbe15dd1da6566bc51b32630b7e9b4909576b --- /dev/null +++ b/projects/DensePose/densepose/data/dataset_mapper.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import copy +import torch +from fvcore.common.file_io import PathManager + +from detectron2.data import MetadataCatalog +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T + +from .structures import DensePoseDataRelative, DensePoseList, DensePoseTransformData + + +class DatasetMapper: + """ + A customized version of `detectron2.data.DatasetMapper` + """ + + def __init__(self, cfg, is_train=True): + self.tfm_gens = utils.build_transform_gen(cfg, is_train) + + # fmt: off + self.img_format = cfg.INPUT.FORMAT + self.mask_on = cfg.MODEL.MASK_ON + self.keypoint_on = cfg.MODEL.KEYPOINT_ON + self.densepose_on = cfg.MODEL.DENSEPOSE_ON + assert not cfg.MODEL.LOAD_PROPOSALS, "not supported yet" + # fmt: on + if self.keypoint_on and is_train: + # Flip only makes sense in training + self.keypoint_hflip_indices = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) + else: + self.keypoint_hflip_indices = None + + if self.densepose_on: + densepose_transform_srcs = [ + MetadataCatalog.get(ds).densepose_transform_src + for ds in cfg.DATASETS.TRAIN + cfg.DATASETS.TEST + ] + assert len(densepose_transform_srcs) > 0 + # TODO: check that DensePose transformation data is the same for + # all the datasets. Otherwise one would have to pass DB ID with + # each entry to select proper transformation data. For now, since + # all DensePose annotated data uses the same data semantics, we + # omit this check. + densepose_transform_data_fpath = PathManager.get_local_path(densepose_transform_srcs[0]) + self.densepose_transform_data = DensePoseTransformData.load( + densepose_transform_data_fpath + ) + + self.is_train = is_train + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
+ + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + image, transforms = T.apply_transform_gens(self.tfm_gens, image) + image_shape = image.shape[:2] # h, w + dataset_dict["image"] = torch.as_tensor(image.transpose(2, 0, 1).astype("float32")) + + if not self.is_train: + dataset_dict.pop("annotations", None) + return dataset_dict + + for anno in dataset_dict["annotations"]: + if not self.mask_on: + anno.pop("segmentation", None) + if not self.keypoint_on: + anno.pop("keypoints", None) + + # USER: Implement additional transformations if you have other types of data + # USER: Don't call transpose_densepose if you don't need + annos = [ + self._transform_densepose( + utils.transform_instance_annotations( + obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices + ), + transforms, + ) + for obj in dataset_dict.pop("annotations") + if obj.get("iscrowd", 0) == 0 + ] + instances = utils.annotations_to_instances(annos, image_shape) + + if len(annos) and "densepose" in annos[0]: + gt_densepose = [obj["densepose"] for obj in annos] + instances.gt_densepose = DensePoseList(gt_densepose, instances.gt_boxes, image_shape) + + dataset_dict["instances"] = instances[instances.gt_boxes.nonempty()] + return dataset_dict + + def _transform_densepose(self, annotation, transforms): + if not self.densepose_on: + return annotation + + # Handle densepose annotations + is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation) + if is_valid: + densepose_data = DensePoseDataRelative(annotation, cleanup=True) + densepose_data.apply_transform(transforms, self.densepose_transform_data) + annotation["densepose"] = densepose_data + else: + # logger = logging.getLogger(__name__) + # logger.debug("Could not load DensePose annotation: {}".format(reason_not_valid)) + DensePoseDataRelative.cleanup_annotation(annotation) + # NOTE: annotations for certain instances may be unavailable. + # 'None' is accepted by the DensePostList data structure. + annotation["densepose"] = None + return annotation diff --git a/projects/DensePose/densepose/data/datasets/__init__.py b/projects/DensePose/densepose/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ea9c2f33220298ec63496e909a6ed92f7beb23b --- /dev/null +++ b/projects/DensePose/densepose/data/datasets/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from . import builtin # ensure the builtin datasets are registered + +__all__ = [k for k in globals().keys() if "builtin" not in k and not k.startswith("_")] diff --git a/projects/DensePose/densepose/data/datasets/builtin.py b/projects/DensePose/densepose/data/datasets/builtin.py new file mode 100644 index 0000000000000000000000000000000000000000..d05cf9f62f6d0ca9898b38e5a8a9e0341cccf12a --- /dev/null +++ b/projects/DensePose/densepose/data/datasets/builtin.py @@ -0,0 +1,10 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from .coco import BASE_DATASETS as BASE_COCO_DATASETS +from .coco import DATASETS as COCO_DATASETS +from .coco import register_datasets as register_coco_datasets + +DEFAULT_DATASETS_ROOT = "datasets" + + +register_coco_datasets(COCO_DATASETS, DEFAULT_DATASETS_ROOT) +register_coco_datasets(BASE_COCO_DATASETS, DEFAULT_DATASETS_ROOT) diff --git a/projects/DensePose/densepose/data/datasets/coco.py b/projects/DensePose/densepose/data/datasets/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..658c5a8a8a0ec6960cd80a33f21135b12cf1bd91 --- /dev/null +++ b/projects/DensePose/densepose/data/datasets/coco.py @@ -0,0 +1,314 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import contextlib +import io +import logging +import os +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional +from fvcore.common.file_io import PathManager +from fvcore.common.timer import Timer + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures import BoxMode + +DENSEPOSE_MASK_KEY = "dp_masks" +DENSEPOSE_KEYS_WITHOUT_MASK = ["dp_x", "dp_y", "dp_I", "dp_U", "dp_V"] +DENSEPOSE_KEYS = DENSEPOSE_KEYS_WITHOUT_MASK + [DENSEPOSE_MASK_KEY] +DENSEPOSE_METADATA_URL_PREFIX = "https://dl.fbaipublicfiles.com/densepose/data/" + + +@dataclass +class CocoDatasetInfo: + name: str + images_root: str + annotations_fpath: str + + +DATASETS = [ + CocoDatasetInfo( + name="densepose_coco_2014_train", + images_root="coco/train2014", + annotations_fpath="coco/annotations/densepose_train2014.json", + ), + CocoDatasetInfo( + name="densepose_coco_2014_minival", + images_root="coco/val2014", + annotations_fpath="coco/annotations/densepose_minival2014.json", + ), + CocoDatasetInfo( + name="densepose_coco_2014_minival_100", + images_root="coco/val2014", + annotations_fpath="coco/annotations/densepose_minival2014_100.json", + ), + CocoDatasetInfo( + name="densepose_coco_2014_valminusminival", + images_root="coco/val2014", + annotations_fpath="coco/annotations/densepose_valminusminival2014.json", + ), + CocoDatasetInfo( + name="densepose_chimps", + images_root="densepose_evolution/densepose_chimps", + annotations_fpath="densepose_evolution/annotations/densepose_chimps_densepose.json", + ), +] + + +BASE_DATASETS = [ + CocoDatasetInfo( + name="base_coco_2017_train", + images_root="coco/train2017", + annotations_fpath="coco/annotations/instances_train2017.json", + ), + CocoDatasetInfo( + name="base_coco_2017_val", + images_root="coco/val2017", + annotations_fpath="coco/annotations/instances_val2017.json", + ), + CocoDatasetInfo( + name="base_coco_2017_val_100", + images_root="coco/val2017", + annotations_fpath="coco/annotations/instances_val2017_100.json", + ), +] + + +def _is_relative_local_path(path: os.PathLike): + path_str = os.fsdecode(path) + return ("://" not in path_str) and not os.path.isabs(path) + + +def _maybe_prepend_base_path(base_path: Optional[os.PathLike], path: os.PathLike): + """ + Prepends the provided path with a base path prefix if: + 1) base path is not None; + 2) path is a local path + """ + if base_path is None: + return path + if _is_relative_local_path(path): + return os.path.join(base_path, path) + return path + + +def get_metadata(base_path: Optional[os.PathLike]) -> Dict[str, Any]: + """ + Returns metadata associated with COCO DensePose datasets + + Args: + base_path: Optional[os.PathLike] + Base path used to load metadata from + + Returns: + Dict[str, Any] + Metadata in the form of a 
dictionary + """ + meta = { + "densepose_transform_src": _maybe_prepend_base_path( + base_path, "UV_symmetry_transforms.mat" + ), + "densepose_smpl_subdiv": _maybe_prepend_base_path(base_path, "SMPL_subdiv.mat"), + "densepose_smpl_subdiv_transform": _maybe_prepend_base_path( + base_path, "SMPL_SUBDIV_TRANSFORM.mat" + ), + } + return meta + + +def _load_coco_annotations(json_file: str): + """ + Load COCO annotations from a JSON file + + Args: + json_file: str + Path to the file to load annotations from + Returns: + Instance of `pycocotools.coco.COCO` that provides access to annotations + data + """ + from pycocotools.coco import COCO + + logger = logging.getLogger(__name__) + timer = Timer() + with contextlib.redirect_stdout(io.StringIO()): + coco_api = COCO(json_file) + if timer.seconds() > 1: + logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) + return coco_api + + +def _add_categories_metadata(dataset_name: str, categories: Dict[str, Any]): + meta = MetadataCatalog.get(dataset_name) + meta.categories = {c["id"]: c["name"] for c in categories} + logger = logging.getLogger(__name__) + logger.info("Dataset {} categories: {}".format(dataset_name, categories)) + + +def _verify_annotations_have_unique_ids(json_file: str, anns: List[List[Dict[str, Any]]]): + if "minival" in json_file: + # Skip validation on COCO2014 valminusminival and minival annotations + # The ratio of buggy annotations there is tiny and does not affect accuracy + # Therefore we explicitly white-list them + return + ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] + assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( + json_file + ) + + +def _maybe_add_bbox(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + if "bbox" not in ann_dict: + return + obj["bbox"] = ann_dict["bbox"] + obj["bbox_mode"] = BoxMode.XYWH_ABS + + +def _maybe_add_segm(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + if "segmentation" not in ann_dict: + return + segm = ann_dict["segmentation"] + if not isinstance(segm, dict): + # filter out invalid polygons (< 3 points) + segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] + if len(segm) == 0: + return + obj["segmentation"] = segm + + +def _maybe_add_keypoints(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + if "keypoints" not in ann_dict: + return + keypts = ann_dict["keypoints"] # list[int] + for idx, v in enumerate(keypts): + if idx % 3 != 2: + # COCO's segmentation coordinates are floating points in [0, H or W], + # but keypoint coordinates are integers in [0, H-1 or W-1] + # Therefore we assume the coordinates are "pixel indices" and + # add 0.5 to convert to floating point coordinates. 
+ keypts[idx] = v + 0.5 + obj["keypoints"] = keypts + + +def _maybe_add_densepose(obj: Dict[str, Any], ann_dict: Dict[str, Any]): + for key in DENSEPOSE_KEYS: + if key in ann_dict: + obj[key] = ann_dict[key] + + +def _combine_images_with_annotations( + dataset_name: str, + image_root: str, + img_datas: Iterable[Dict[str, Any]], + ann_datas: Iterable[Iterable[Dict[str, Any]]], +): + + ann_keys = ["iscrowd", "category_id"] + dataset_dicts = [] + + for img_dict, ann_dicts in zip(img_datas, ann_datas): + record = {} + record["file_name"] = os.path.join(image_root, img_dict["file_name"]) + record["height"] = img_dict["height"] + record["width"] = img_dict["width"] + record["image_id"] = img_dict["id"] + record["dataset"] = dataset_name + objs = [] + for ann_dict in ann_dicts: + assert ann_dict["image_id"] == record["image_id"] + assert ann_dict.get("ignore", 0) == 0 + obj = {key: ann_dict[key] for key in ann_keys if key in ann_dict} + _maybe_add_bbox(obj, ann_dict) + _maybe_add_segm(obj, ann_dict) + _maybe_add_keypoints(obj, ann_dict) + _maybe_add_densepose(obj, ann_dict) + objs.append(obj) + record["annotations"] = objs + dataset_dicts.append(record) + return dataset_dicts + + +def load_coco_json(annotations_json_file: str, image_root: str, dataset_name: str): + """ + Loads a JSON file with annotations in COCO instances format. + Replaces `detectron2.data.datasets.coco.load_coco_json` to handle metadata + in a more flexible way. Postpones category mapping to a later stage to be + able to combine several datasets with different (but coherent) sets of + categories. + + Args: + + annotations_json_file: str + Path to the JSON file with annotations in COCO instances format. + image_root: str + directory that contains all the images + dataset_name: str + the name that identifies a dataset, e.g. "densepose_coco_2014_train" + extra_annotation_keys: Optional[List[str]] + If provided, these keys are used to extract additional data from + the annotations. + """ + coco_api = _load_coco_annotations(PathManager.get_local_path(annotations_json_file)) + _add_categories_metadata(dataset_name, coco_api.loadCats(coco_api.getCatIds())) + # sort indices for reproducible results + img_ids = sorted(coco_api.imgs.keys()) + # imgs is a list of dicts, each looks something like: + # {'license': 4, + # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', + # 'file_name': 'COCO_val2014_000000001268.jpg', + # 'height': 427, + # 'width': 640, + # 'date_captured': '2013-11-17 05:57:24', + # 'id': 1268} + imgs = coco_api.loadImgs(img_ids) + logger = logging.getLogger(__name__) + logger.info("Loaded {} images in COCO format from {}".format(len(imgs), annotations_json_file)) + # anns is a list[list[dict]], where each dict is an annotation + # record for an object. The inner list enumerates the objects in an image + # and the outer list enumerates over images. 
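+    # For DensePose datasets, each annotation dict may additionally carry the
+    # DensePose-specific keys listed in DENSEPOSE_KEYS (dp_x, dp_y, dp_I, dp_U,
+    # dp_V, dp_masks); these are copied into the output records unchanged.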
+ anns = [coco_api.imgToAnns[img_id] for img_id in img_ids] + _verify_annotations_have_unique_ids(annotations_json_file, anns) + dataset_records = _combine_images_with_annotations(dataset_name, image_root, imgs, anns) + return dataset_records + + +def register_dataset(dataset_data: CocoDatasetInfo, datasets_root: Optional[os.PathLike] = None): + """ + Registers provided COCO DensePose dataset + + Args: + dataset_data: CocoDatasetInfo + Dataset data + datasets_root: Optional[os.PathLike] + Datasets root folder (default: None) + """ + annotations_fpath = _maybe_prepend_base_path(datasets_root, dataset_data.annotations_fpath) + images_root = _maybe_prepend_base_path(datasets_root, dataset_data.images_root) + + def load_annotations(): + return load_coco_json( + annotations_json_file=annotations_fpath, + image_root=images_root, + dataset_name=dataset_data.name, + ) + + DatasetCatalog.register(dataset_data.name, load_annotations) + MetadataCatalog.get(dataset_data.name).set( + json_file=annotations_fpath, + image_root=images_root, + **get_metadata(DENSEPOSE_METADATA_URL_PREFIX) + ) + + +def register_datasets( + datasets_data: Iterable[CocoDatasetInfo], datasets_root: Optional[os.PathLike] = None +): + """ + Registers provided COCO DensePose datasets + + Args: + datasets_data: Iterable[CocoDatasetInfo] + An iterable of dataset datas + datasets_root: Optional[os.PathLike] + Datasets root folder (default: None) + """ + for dataset_data in datasets_data: + register_dataset(dataset_data, datasets_root) diff --git a/projects/DensePose/densepose/data/structures.py b/projects/DensePose/densepose/data/structures.py new file mode 100644 index 0000000000000000000000000000000000000000..bbb950ba09b1302b72f36d143e092d2ade6dc11e --- /dev/null +++ b/projects/DensePose/densepose/data/structures.py @@ -0,0 +1,579 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import base64 +import numpy as np +from io import BytesIO +import torch +from PIL import Image +from torch.nn import functional as F + + +class DensePoseTransformData(object): + + # Horizontal symmetry label transforms used for horizontal flip + MASK_LABEL_SYMMETRIES = [0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14] + # fmt: off + POINT_LABEL_SYMMETRIES = [ 0, 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23] # noqa + # fmt: on + + def __init__(self, uv_symmetries): + self.mask_label_symmetries = DensePoseTransformData.MASK_LABEL_SYMMETRIES + self.point_label_symmetries = DensePoseTransformData.POINT_LABEL_SYMMETRIES + self.uv_symmetries = uv_symmetries + + @staticmethod + def load(fpath): + import scipy.io + + uv_symmetry_map = scipy.io.loadmat(fpath) + uv_symmetry_map_torch = {} + for key in ["U_transforms", "V_transforms"]: + uv_symmetry_map_torch[key] = [] + map_src = uv_symmetry_map[key] + map_dst = uv_symmetry_map_torch[key] + for i in range(map_src.shape[1]): + map_dst.append(torch.from_numpy(map_src[0, i]).to(dtype=torch.float)) + uv_symmetry_map_torch[key] = torch.stack(map_dst, dim=0).to( + device=torch.cuda.current_device() + ) + transform_data = DensePoseTransformData(uv_symmetry_map_torch) + return transform_data + + +class DensePoseDataRelative(object): + """ + Dense pose relative annotations that can be applied to any bounding box: + x - normalized X coordinates [0, 255] of annotated points + y - normalized Y coordinates [0, 255] of annotated points + i - body part labels 0,...,24 for annotated points + u - body part U coordinates [0, 1] for annotated points + v - body part V coordinates [0, 1] for annotated points + segm - 256x256 segmentation mask with values 0,...,14 + To obtain absolute x and y data wrt some bounding box one needs to first + divide the data by 256, multiply by the respective bounding box size + and add bounding box offset: + x_img = x0 + x_norm * w / 256.0 + y_img = y0 + y_norm * h / 256.0 + Segmentation masks are typically sampled to get image-based masks. 
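+    For example, an annotated point with x = 128 inside a bounding box with
+    x0 = 10 and w = 64 maps to x_img = 10 + 128 * 64 / 256.0 = 42.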
+ """ + + # Key for normalized X coordinates in annotation dict + X_KEY = "dp_x" + # Key for normalized Y coordinates in annotation dict + Y_KEY = "dp_y" + # Key for U part coordinates in annotation dict + U_KEY = "dp_U" + # Key for V part coordinates in annotation dict + V_KEY = "dp_V" + # Key for I point labels in annotation dict + I_KEY = "dp_I" + # Key for segmentation mask in annotation dict + S_KEY = "dp_masks" + # Number of body parts in segmentation masks + N_BODY_PARTS = 14 + # Number of parts in point labels + N_PART_LABELS = 24 + MASK_SIZE = 256 + + def __init__(self, annotation, cleanup=False): + is_valid, reason_not_valid = DensePoseDataRelative.validate_annotation(annotation) + assert is_valid, "Invalid DensePose annotations: {}".format(reason_not_valid) + self.x = torch.as_tensor(annotation[DensePoseDataRelative.X_KEY]) + self.y = torch.as_tensor(annotation[DensePoseDataRelative.Y_KEY]) + self.i = torch.as_tensor(annotation[DensePoseDataRelative.I_KEY]) + self.u = torch.as_tensor(annotation[DensePoseDataRelative.U_KEY]) + self.v = torch.as_tensor(annotation[DensePoseDataRelative.V_KEY]) + self.segm = DensePoseDataRelative.extract_segmentation_mask(annotation) + self.device = torch.device("cpu") + if cleanup: + DensePoseDataRelative.cleanup_annotation(annotation) + + def to(self, device): + if self.device == device: + return self + new_data = DensePoseDataRelative.__new__(DensePoseDataRelative) + new_data.x = self.x + new_data.x = self.x.to(device) + new_data.y = self.y.to(device) + new_data.i = self.i.to(device) + new_data.u = self.u.to(device) + new_data.v = self.v.to(device) + new_data.segm = self.segm.to(device) + new_data.device = device + return new_data + + @staticmethod + def extract_segmentation_mask(annotation): + import pycocotools.mask as mask_utils + + poly_specs = annotation[DensePoseDataRelative.S_KEY] + segm = torch.zeros((DensePoseDataRelative.MASK_SIZE,) * 2, dtype=torch.float32) + for i in range(DensePoseDataRelative.N_BODY_PARTS): + poly_i = poly_specs[i] + if poly_i: + mask_i = mask_utils.decode(poly_i) + segm[mask_i > 0] = i + 1 + return segm + + @staticmethod + def validate_annotation(annotation): + for key in [ + DensePoseDataRelative.X_KEY, + DensePoseDataRelative.Y_KEY, + DensePoseDataRelative.I_KEY, + DensePoseDataRelative.U_KEY, + DensePoseDataRelative.V_KEY, + DensePoseDataRelative.S_KEY, + ]: + if key not in annotation: + return False, "no {key} data in the annotation".format(key=key) + return True, None + + @staticmethod + def cleanup_annotation(annotation): + for key in [ + DensePoseDataRelative.X_KEY, + DensePoseDataRelative.Y_KEY, + DensePoseDataRelative.I_KEY, + DensePoseDataRelative.U_KEY, + DensePoseDataRelative.V_KEY, + DensePoseDataRelative.S_KEY, + ]: + if key in annotation: + del annotation[key] + + def apply_transform(self, transforms, densepose_transform_data): + self._transform_pts(transforms, densepose_transform_data) + self._transform_segm(transforms, densepose_transform_data) + + def _transform_pts(self, transforms, dp_transform_data): + import detectron2.data.transforms as T + + # NOTE: This assumes that HorizFlipTransform is the only one that does flip + do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1 + if do_hflip: + self.x = self.segm.size(1) - self.x + self._flip_iuv_semantics(dp_transform_data) + + def _flip_iuv_semantics(self, dp_transform_data: DensePoseTransformData) -> None: + i_old = self.i.clone() + uv_symmetries = dp_transform_data.uv_symmetries + pt_label_symmetries = 
dp_transform_data.point_label_symmetries + for i in range(self.N_PART_LABELS): + if i + 1 in i_old: + annot_indices_i = i_old == i + 1 + if pt_label_symmetries[i + 1] != i + 1: + self.i[annot_indices_i] = pt_label_symmetries[i + 1] + u_loc = (self.u[annot_indices_i] * 255).long() + v_loc = (self.v[annot_indices_i] * 255).long() + self.u[annot_indices_i] = uv_symmetries["U_transforms"][i][v_loc, u_loc].to( + device=self.u.device + ) + self.v[annot_indices_i] = uv_symmetries["V_transforms"][i][v_loc, u_loc].to( + device=self.v.device + ) + + def _transform_segm(self, transforms, dp_transform_data): + import detectron2.data.transforms as T + + # NOTE: This assumes that HorizFlipTransform is the only one that does flip + do_hflip = sum(isinstance(t, T.HFlipTransform) for t in transforms.transforms) % 2 == 1 + if do_hflip: + self.segm = torch.flip(self.segm, [1]) + self._flip_segm_semantics(dp_transform_data) + + def _flip_segm_semantics(self, dp_transform_data): + old_segm = self.segm.clone() + mask_label_symmetries = dp_transform_data.mask_label_symmetries + for i in range(self.N_BODY_PARTS): + if mask_label_symmetries[i + 1] != i + 1: + self.segm[old_segm == i + 1] = mask_label_symmetries[i + 1] + + +def normalized_coords_transform(x0, y0, w, h): + """ + Coordinates transform that maps top left corner to (-1, -1) and bottom + right corner to (1, 1). Used for torch.grid_sample to initialize the + grid + """ + + def f(p): + return (2 * (p[0] - x0) / w - 1, 2 * (p[1] - y0) / h - 1) + + return f + + +class DensePoseOutput(object): + def __init__(self, S, I, U, V, confidences): + """ + Args: + S (`torch.Tensor`): coarse segmentation tensor of size (N, A, H, W) + I (`torch.Tensor`): fine segmentation tensor of size (N, C, H, W) + U (`torch.Tensor`): U coordinates for each fine segmentation label of size (N, C, H, W) + V (`torch.Tensor`): V coordinates for each fine segmentation label of size (N, C, H, W) + confidences (dict of str -> `torch.Tensor`) estimated confidence model parameters + """ + self.S = S + self.I = I # noqa: E741 + self.U = U + self.V = V + self.confidences = confidences + self._check_output_dims(S, I, U, V) + + def _check_output_dims(self, S, I, U, V): + assert ( + len(S.size()) == 4 + ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format( + S.size() + ) + assert ( + len(I.size()) == 4 + ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format( + S.size() + ) + assert ( + len(U.size()) == 4 + ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format( + S.size() + ) + assert ( + len(V.size()) == 4 + ), "Segmentation output should have 4 " "dimensions (NCHW), but has size {}".format( + S.size() + ) + assert len(S) == len(I), ( + "Number of output segmentation planes {} " + "should be equal to the number of output part index " + "planes {}".format(len(S), len(I)) + ) + assert S.size()[2:] == I.size()[2:], ( + "Output segmentation plane size {} " + "should be equal to the output part index " + "plane size {}".format(S.size()[2:], I.size()[2:]) + ) + assert I.size() == U.size(), ( + "Part index output shape {} " + "should be the same as U coordinates output shape {}".format(I.size(), U.size()) + ) + assert I.size() == V.size(), ( + "Part index output shape {} " + "should be the same as V coordinates output shape {}".format(I.size(), V.size()) + ) + + def resize(self, image_size_hw): + # do nothing - outputs are invariant to resize + pass + + def _crop(self, S, I, U, V, bbox_old_xywh, bbox_new_xywh): + 
""" + Resample S, I, U, V from bbox_old to the cropped bbox_new + """ + x0old, y0old, wold, hold = bbox_old_xywh + x0new, y0new, wnew, hnew = bbox_new_xywh + tr_coords = normalized_coords_transform(x0old, y0old, wold, hold) + topleft = (x0new, y0new) + bottomright = (x0new + wnew, y0new + hnew) + topleft_norm = tr_coords(topleft) + bottomright_norm = tr_coords(bottomright) + hsize = S.size(1) + wsize = S.size(2) + grid = torch.meshgrid( + torch.arange( + topleft_norm[1], + bottomright_norm[1], + (bottomright_norm[1] - topleft_norm[1]) / hsize, + )[:hsize], + torch.arange( + topleft_norm[0], + bottomright_norm[0], + (bottomright_norm[0] - topleft_norm[0]) / wsize, + )[:wsize], + ) + grid = torch.stack(grid, dim=2).to(S.device) + assert ( + grid.size(0) == hsize + ), "Resampled grid expected " "height={}, actual height={}".format(hsize, grid.size(0)) + assert grid.size(1) == wsize, "Resampled grid expected " "width={}, actual width={}".format( + wsize, grid.size(1) + ) + S_new = F.grid_sample( + S.unsqueeze(0), + torch.unsqueeze(grid, 0), + mode="bilinear", + padding_mode="border", + align_corners=True, + ).squeeze(0) + I_new = F.grid_sample( + I.unsqueeze(0), + torch.unsqueeze(grid, 0), + mode="bilinear", + padding_mode="border", + align_corners=True, + ).squeeze(0) + U_new = F.grid_sample( + U.unsqueeze(0), + torch.unsqueeze(grid, 0), + mode="bilinear", + padding_mode="border", + align_corners=True, + ).squeeze(0) + V_new = F.grid_sample( + V.unsqueeze(0), + torch.unsqueeze(grid, 0), + mode="bilinear", + padding_mode="border", + align_corners=True, + ).squeeze(0) + return S_new, I_new, U_new, V_new + + def crop(self, indices_cropped, bboxes_old, bboxes_new): + """ + Crop outputs for selected bounding boxes to the new bounding boxes. + """ + # VK: cropping is ignored for now + # for i, ic in enumerate(indices_cropped): + # self.S[ic], self.I[ic], self.U[ic], self.V[ic] = \ + # self._crop(self.S[ic], self.I[ic], self.U[ic], self.V[ic], + # bboxes_old[i], bboxes_new[i]) + pass + + def hflip(self, transform_data: DensePoseTransformData) -> None: + """ + Change S, I, U and V to take into account a Horizontal flip. + """ + if self.I.shape[0] > 0: + for el in "SIUV": + self.__dict__[el] = torch.flip(self.__dict__[el], [3]) + self._flip_iuv_semantics_tensor(transform_data) + self._flip_segm_semantics_tensor(transform_data) + + def _flip_iuv_semantics_tensor(self, dp_transform_data: DensePoseTransformData) -> None: + point_label_symmetries = dp_transform_data.point_label_symmetries + uv_symmetries = dp_transform_data.uv_symmetries + + N, C, H, W = self.U.shape + u_loc = (self.U[:, 1:, :, :].clamp(0, 1) * 255).long() + v_loc = (self.V[:, 1:, :, :].clamp(0, 1) * 255).long() + Iindex = torch.arange(C - 1, device=self.U.device)[None, :, None, None].expand( + N, C - 1, H, W + ) + self.U[:, 1:, :, :] = uv_symmetries["U_transforms"][Iindex, v_loc, u_loc].to( + device=self.U.device + ) + self.V[:, 1:, :, :] = uv_symmetries["V_transforms"][Iindex, v_loc, u_loc].to( + device=self.V.device + ) + + for el in "IUV": + self.__dict__[el] = self.__dict__[el][:, point_label_symmetries, :, :] + + def _flip_segm_semantics_tensor(self, dp_transform_data): + if self.S.shape[1] == DensePoseDataRelative.N_BODY_PARTS + 1: + self.S = self.S[:, dp_transform_data.mask_label_symmetries, :, :] + + def to_result(self, boxes_xywh): + """ + Convert DensePose outputs to results format. 
Results are more compact, + but cannot be resampled any more + """ + result = DensePoseResult(boxes_xywh, self.S, self.I, self.U, self.V) + return result + + def __getitem__(self, item): + if isinstance(item, int): + S_selected = self.S[item].unsqueeze(0) + I_selected = self.I[item].unsqueeze(0) + U_selected = self.U[item].unsqueeze(0) + V_selected = self.V[item].unsqueeze(0) + conf_selected = {} + for key in self.confidences: + conf_selected[key] = self.confidences[key][item].unsqueeze(0) + else: + S_selected = self.S[item] + I_selected = self.I[item] + U_selected = self.U[item] + V_selected = self.V[item] + conf_selected = {} + for key in self.confidences: + conf_selected[key] = self.confidences[key][item] + return DensePoseOutput(S_selected, I_selected, U_selected, V_selected, conf_selected) + + def __str__(self): + s = "DensePoseOutput S {}, I {}, U {}, V {}".format( + list(self.S.size()), list(self.I.size()), list(self.U.size()), list(self.V.size()) + ) + s_conf = "confidences: [{}]".format( + ", ".join([f"{key} {list(self.confidences[key].size())}" for key in self.confidences]) + ) + return ", ".join([s, s_conf]) + + def __len__(self): + return self.S.size(0) + + +class DensePoseResult(object): + def __init__(self, boxes_xywh, S, I, U, V): + self.results = [] + self.boxes_xywh = boxes_xywh.cpu().tolist() + assert len(boxes_xywh.size()) == 2 + assert boxes_xywh.size(1) == 4 + for i, box_xywh in enumerate(boxes_xywh): + result_i = self._output_to_result(box_xywh, S[[i]], I[[i]], U[[i]], V[[i]]) + result_numpy_i = result_i.cpu().numpy() + result_encoded_i = DensePoseResult.encode_png_data(result_numpy_i) + result_encoded_with_shape_i = (result_numpy_i.shape, result_encoded_i) + self.results.append(result_encoded_with_shape_i) + + def __str__(self): + s = "DensePoseResult: N={} [{}]".format( + len(self.results), ", ".join([str(list(r[0])) for r in self.results]) + ) + return s + + def _output_to_result(self, box_xywh, S, I, U, V): + x, y, w, h = box_xywh + w = max(int(w), 1) + h = max(int(h), 1) + result = torch.zeros([3, h, w], dtype=torch.uint8, device=U.device) + assert ( + len(S.size()) == 4 + ), "AnnIndex tensor size should have {} " "dimensions but has {}".format(4, len(S.size())) + s_bbox = F.interpolate(S, (h, w), mode="bilinear", align_corners=False).argmax(dim=1) + assert ( + len(I.size()) == 4 + ), "IndexUV tensor size should have {} " "dimensions but has {}".format(4, len(S.size())) + i_bbox = ( + F.interpolate(I, (h, w), mode="bilinear", align_corners=False).argmax(dim=1) + * (s_bbox > 0).long() + ).squeeze(0) + assert len(U.size()) == 4, "U tensor size should have {} " "dimensions but has {}".format( + 4, len(U.size()) + ) + u_bbox = F.interpolate(U, (h, w), mode="bilinear", align_corners=False) + assert len(V.size()) == 4, "V tensor size should have {} " "dimensions but has {}".format( + 4, len(V.size()) + ) + v_bbox = F.interpolate(V, (h, w), mode="bilinear", align_corners=False) + result[0] = i_bbox + for part_id in range(1, u_bbox.size(1)): + result[1][i_bbox == part_id] = ( + (u_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8) + ) + result[2][i_bbox == part_id] = ( + (v_bbox[0, part_id][i_bbox == part_id] * 255).clamp(0, 255).to(torch.uint8) + ) + assert ( + result.size(1) == h + ), "Results height {} should be equal" "to bounding box height {}".format(result.size(1), h) + assert ( + result.size(2) == w + ), "Results width {} should be equal" "to bounding box width {}".format(result.size(2), w) + return result + + @staticmethod + def 
encode_png_data(arr): + """ + Encode array data as a PNG image using the highest compression rate + @param arr [in] Data stored in an array of size (3, M, N) of type uint8 + @return Base64-encoded string containing PNG-compressed data + """ + assert len(arr.shape) == 3, "Expected a 3D array as an input," " got a {0}D array".format( + len(arr.shape) + ) + assert arr.shape[0] == 3, "Expected first array dimension of size 3," " got {0}".format( + arr.shape[0] + ) + assert arr.dtype == np.uint8, "Expected an array of type np.uint8, " " got {0}".format( + arr.dtype + ) + data = np.moveaxis(arr, 0, -1) + im = Image.fromarray(data) + fstream = BytesIO() + im.save(fstream, format="png", optimize=True) + s = base64.encodebytes(fstream.getvalue()).decode() + return s + + @staticmethod + def decode_png_data(shape, s): + """ + Decode array data from a string that contains PNG-compressed data + @param Base64-encoded string containing PNG-compressed data + @return Data stored in an array of size (3, M, N) of type uint8 + """ + fstream = BytesIO(base64.decodebytes(s.encode())) + im = Image.open(fstream) + data = np.moveaxis(np.array(im.getdata(), dtype=np.uint8), -1, 0) + return data.reshape(shape) + + def __len__(self): + return len(self.results) + + def __getitem__(self, item): + result_encoded = self.results[item] + bbox_xywh = self.boxes_xywh[item] + return result_encoded, bbox_xywh + + +class DensePoseList(object): + + _TORCH_DEVICE_CPU = torch.device("cpu") + + def __init__(self, densepose_datas, boxes_xyxy_abs, image_size_hw, device=_TORCH_DEVICE_CPU): + assert len(densepose_datas) == len( + boxes_xyxy_abs + ), "Attempt to initialize DensePoseList with {} DensePose datas " "and {} boxes".format( + len(densepose_datas), len(boxes_xyxy_abs) + ) + self.densepose_datas = [] + for densepose_data in densepose_datas: + assert isinstance(densepose_data, DensePoseDataRelative) or densepose_data is None, ( + "Attempt to initialize DensePoseList with DensePose datas " + "of type {}, expected DensePoseDataRelative".format(type(densepose_data)) + ) + densepose_data_ondevice = ( + densepose_data.to(device) if densepose_data is not None else None + ) + self.densepose_datas.append(densepose_data_ondevice) + self.boxes_xyxy_abs = boxes_xyxy_abs.to(device) + self.image_size_hw = image_size_hw + self.device = device + + def to(self, device): + if self.device == device: + return self + return DensePoseList(self.densepose_datas, self.boxes_xyxy_abs, self.image_size_hw, device) + + def __iter__(self): + return iter(self.densepose_datas) + + def __len__(self): + return len(self.densepose_datas) + + def __repr__(self): + s = self.__class__.__name__ + "(" + s += "num_instances={}, ".format(len(self.densepose_datas)) + s += "image_width={}, ".format(self.image_size_hw[1]) + s += "image_height={})".format(self.image_size_hw[0]) + return s + + def __getitem__(self, item): + if isinstance(item, int): + densepose_data_rel = self.densepose_datas[item] + return densepose_data_rel + elif isinstance(item, slice): + densepose_datas_rel = self.densepose_datas[item] + boxes_xyxy_abs = self.boxes_xyxy_abs[item] + return DensePoseList( + densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device + ) + elif isinstance(item, torch.Tensor) and (item.dtype == torch.bool): + densepose_datas_rel = [self.densepose_datas[i] for i, x in enumerate(item) if x > 0] + boxes_xyxy_abs = self.boxes_xyxy_abs[item] + return DensePoseList( + densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device + ) + else: + densepose_datas_rel = 
[self.densepose_datas[i] for i in item] + boxes_xyxy_abs = self.boxes_xyxy_abs[item] + return DensePoseList( + densepose_datas_rel, boxes_xyxy_abs, self.image_size_hw, self.device + ) diff --git a/projects/DensePose/densepose/densepose_coco_evaluation.py b/projects/DensePose/densepose/densepose_coco_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..22471d1e54c3124cacf8b1b23fa88af65ccb486f --- /dev/null +++ b/projects/DensePose/densepose/densepose_coco_evaluation.py @@ -0,0 +1,1138 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# This is a modified version of cocoeval.py where we also have the densepose evaluation. + +__author__ = "tsungyi" + +import copy +import datetime +import itertools +import logging +import numpy as np +import pickle +import time +from collections import defaultdict +from enum import Enum +from typing import Any, Dict, Tuple +import scipy.spatial.distance as ssd +from fvcore.common.file_io import PathManager +from pycocotools import mask as maskUtils +from scipy.io import loadmat +from scipy.ndimage import zoom as spzoom + +from .data.structures import DensePoseDataRelative, DensePoseResult + +logger = logging.getLogger(__name__) + + +class DensePoseEvalMode(str, Enum): + # use both masks and geodesic distances (GPS * IOU) to compute scores + GPSM = "gpsm" + # use only geodesic distances (GPS) to compute scores + GPS = "gps" + # use only masks (IOU) to compute scores + IOU = "iou" + + +class DensePoseDataMode(str, Enum): + # use estimated IUV data (default mode) + IUV_DT = "iuvdt" + # use ground truth IUV data + IUV_GT = "iuvgt" + # use ground truth labels I and set UV to 0 + I_GT_UV_0 = "igtuv0" + # use ground truth labels I and estimated UV coordinates + I_GT_UV_DT = "igtuvdt" + # use estimated labels I and set UV to 0 + I_DT_UV_0 = "idtuv0" + + +class DensePoseCocoEval(object): + # Interface for evaluating detection on the Microsoft COCO dataset. + # + # The usage for CocoEval is as follows: + # cocoGt=..., cocoDt=... # load dataset and results + # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object + # E.params.recThrs = ...; # set parameters as desired + # E.evaluate(); # run per image evaluation + # E.accumulate(); # accumulate per image results + # E.summarize(); # display summary metrics of results + # For example usage see evalDemo.m and http://mscoco.org/. + # + # The evaluation parameters are as follows (defaults in brackets): + # imgIds - [all] N img ids to use for evaluation + # catIds - [all] K cat ids to use for evaluation + # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation + # recThrs - [0:.01:1] R=101 recall thresholds for evaluation + # areaRng - [...] A=4 object area ranges for evaluation + # maxDets - [1 10 100] M=3 thresholds on max detections per image + # iouType - ['segm'] set iouType to 'segm', 'bbox', 'keypoints' or 'densepose' + # iouType replaced the now DEPRECATED useSegm parameter. + # useCats - [1] if true use category labels for evaluation + # Note: if useCats=0 category labels are ignored as in proposal scoring. + # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. 
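+    #  In addition to the standard COCO parameters, this DensePose variant takes two
+    #  constructor arguments: dpEvalMode (DensePoseEvalMode: GPS, GPSM = GPS * mask IoU,
+    #  or IOU) and dpDataMode (DensePoseDataMode; estimated IUV data by default).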
+ # + # evaluate(): evaluates detections on every image and every category and + # concats the results into the "evalImgs" with fields: + # dtIds - [1xD] id for each of the D detections (dt) + # gtIds - [1xG] id for each of the G ground truths (gt) + # dtMatches - [TxD] matching gt id at each IoU or 0 + # gtMatches - [TxG] matching dt id at each IoU or 0 + # dtScores - [1xD] confidence of each dt + # gtIgnore - [1xG] ignore flag for each gt + # dtIgnore - [TxD] ignore flag for each dt at each IoU + # + # accumulate(): accumulates the per-image, per-category evaluation + # results in "evalImgs" into the dictionary "eval" with fields: + # params - parameters used for evaluation + # date - date evaluation was performed + # counts - [T,R,K,A,M] parameter dimensions (see above) + # precision - [TxRxKxAxM] precision for every evaluation setting + # recall - [TxKxAxM] max recall for every evaluation setting + # Note: precision and recall==-1 for settings with no gt objects. + # + # See also coco, mask, pycocoDemo, pycocoEvalDemo + # + # Microsoft COCO Toolbox. version 2.0 + # Data, paper, and tutorials available at: http://mscoco.org/ + # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. + # Licensed under the Simplified BSD License [see coco/license.txt] + def __init__( + self, + cocoGt=None, + cocoDt=None, + iouType: str = "densepose", + dpEvalMode: DensePoseEvalMode = DensePoseEvalMode.GPS, + dpDataMode: DensePoseDataMode = DensePoseDataMode.IUV_DT, + ): + """ + Initialize CocoEval using coco APIs for gt and dt + :param cocoGt: coco object with ground truth annotations + :param cocoDt: coco object with detection results + :return: None + """ + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + self._dpEvalMode = dpEvalMode + self._dpDataMode = dpDataMode + self.params = {} # evaluation parameters + self.evalImgs = defaultdict(list) # per-image per-category eval results [KxAxI] + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Params(iouType=iouType) # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + if cocoGt is not None: + self.params.imgIds = sorted(cocoGt.getImgIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + self.ignoreThrBB = 0.7 + self.ignoreThrUV = 0.9 + + def _loadGEval(self): + smpl_subdiv_fpath = PathManager.get_local_path( + "https://dl.fbaipublicfiles.com/densepose/data/SMPL_subdiv.mat" + ) + pdist_transform_fpath = PathManager.get_local_path( + "https://dl.fbaipublicfiles.com/densepose/data/SMPL_SUBDIV_TRANSFORM.mat" + ) + pdist_matrix_fpath = PathManager.get_local_path( + "https://dl.fbaipublicfiles.com/densepose/data/Pdist_matrix.pkl", timeout_sec=120 + ) + SMPL_subdiv = loadmat(smpl_subdiv_fpath) + self.PDIST_transform = loadmat(pdist_transform_fpath) + self.PDIST_transform = self.PDIST_transform["index"].squeeze() + UV = np.array([SMPL_subdiv["U_subdiv"], SMPL_subdiv["V_subdiv"]]).squeeze() + ClosestVertInds = np.arange(UV.shape[1]) + 1 + self.Part_UVs = [] + self.Part_ClosestVertInds = [] + for i in np.arange(24): + self.Part_UVs.append(UV[:, SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)]) + self.Part_ClosestVertInds.append( + ClosestVertInds[SMPL_subdiv["Part_ID_subdiv"].squeeze() == (i + 1)] + ) + + with open(pdist_matrix_fpath, "rb") as hFile: + arrays = pickle.load(hFile, encoding="latin1") + 
self.Pdist_matrix = arrays["Pdist_matrix"] + self.Part_ids = np.array(SMPL_subdiv["Part_ID_subdiv"].squeeze()) + # Mean geodesic distances for parts. + self.Mean_Distances = np.array([0, 0.351, 0.107, 0.126, 0.237, 0.173, 0.142, 0.128, 0.150]) + # Coarse Part labels. + self.CoarseParts = np.array( + [0, 1, 1, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8] + ) + + def _prepare(self): + """ + Prepare ._gts and ._dts for evaluation based on params + :return: None + """ + + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + rle = coco.annToRLE(ann) + ann["segmentation"] = rle + + def _getIgnoreRegion(iid, coco): + img = coco.imgs[iid] + + if "ignore_regions_x" not in img.keys(): + return None + + if len(img["ignore_regions_x"]) == 0: + return None + + rgns_merged = [] + for region_x, region_y in zip(img["ignore_regions_x"], img["ignore_regions_y"]): + rgns = [iter(region_x), iter(region_y)] + rgns_merged.append([next(it) for it in itertools.cycle(rgns)]) + rles = maskUtils.frPyObjects(rgns_merged, img["height"], img["width"]) + rle = maskUtils.merge(rles) + return maskUtils.decode(rle) + + def _checkIgnore(dt, iregion): + if iregion is None: + return True + + bb = np.array(dt["bbox"]).astype(np.int) + x1, y1, x2, y2 = bb[0], bb[1], bb[0] + bb[2], bb[1] + bb[3] + x2 = min([x2, iregion.shape[1]]) + y2 = min([y2, iregion.shape[0]]) + + if bb[2] * bb[3] == 0: + return False + + crop_iregion = iregion[y1:y2, x1:x2] + + if crop_iregion.sum() == 0: + return True + + if "densepose" not in dt.keys(): # filtering boxes + return crop_iregion.sum() / bb[2] / bb[3] < self.ignoreThrBB + + # filtering UVs + ignoremask = np.require(crop_iregion, requirements=["F"]) + mask = self._extract_mask(dt) + uvmask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"]) + uvmask_ = maskUtils.encode(uvmask) + ignoremask_ = maskUtils.encode(ignoremask) + uviou = maskUtils.iou([uvmask_], [ignoremask_], [1])[0] + return uviou < self.ignoreThrUV + + p = self.params + + if p.useCats: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + imns = self.cocoGt.loadImgs(p.imgIds) + self.size_mapping = {} + for im in imns: + self.size_mapping[im["id"]] = [im["height"], im["width"]] + + # if iouType == 'uv', add point gt annotations + if p.iouType == "densepose": + self._loadGEval() + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == "segm": + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + + # set ignore flag + for gt in gts: + gt["ignore"] = gt["ignore"] if "ignore" in gt else 0 + gt["ignore"] = "iscrowd" in gt and gt["iscrowd"] + if p.iouType == "keypoints": + gt["ignore"] = (gt["num_keypoints"] == 0) or gt["ignore"] + if p.iouType == "densepose": + gt["ignore"] = ("dp_x" in gt) == 0 + + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self._igrgns = defaultdict(list) + + for gt in gts: + iid = gt["image_id"] + if iid not in self._igrgns.keys(): + self._igrgns[iid] = _getIgnoreRegion(iid, self.cocoGt) + if _checkIgnore(gt, self._igrgns[iid]): + self._gts[iid, gt["category_id"]].append(gt) + for dt in dts: + iid = dt["image_id"] + if (iid not in self._igrgns) or _checkIgnore(dt, self._igrgns[iid]): + 
self._dts[iid, dt["category_id"]].append(dt) + + self.evalImgs = defaultdict(list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + """ + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + """ + tic = time.time() + logger.info("Running per image DensePose evaluation... {}".format(self.params.iouType)) + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + p.iouType = "segm" if p.useSegm == 1 else "bbox" + logger.info("useSegm (deprecated) is not None. Running DensePose evaluation") + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType in ["segm", "bbox"]: + computeIoU = self.computeIoU + elif p.iouType == "keypoints": + computeIoU = self.computeOks + elif p.iouType == "densepose": + computeIoU = self.computeOgps + if self._dpEvalMode == DensePoseEvalMode.GPSM: + self.real_ious = { + (imgId, catId): self.computeDPIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds + } + + self.ious = { + (imgId, catId): computeIoU(imgId, catId) for imgId in p.imgIds for catId in catIds + } + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + self.evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + logger.info("DensePose evaluation DONE (t={:0.2f}s).".format(toc - tic)) + + def getDensePoseMask(self, polys): + maskGen = np.zeros([256, 256]) + for i in range(1, 15): + if polys[i - 1]: + currentMask = maskUtils.decode(polys[i - 1]) + maskGen[currentMask > 0] = i + return maskGen + + def _generate_rlemask_on_image(self, mask, imgId, data): + bbox_xywh = np.array(data["bbox"]) + x, y, w, h = bbox_xywh + im_h, im_w = self.size_mapping[imgId] + im_mask = np.zeros((im_h, im_w), dtype=np.uint8) + if mask is not None: + x0 = max(int(x), 0) + x1 = min(int(x + w), im_w, int(x) + mask.shape[1]) + y0 = max(int(y), 0) + y1 = min(int(y + h), im_h, int(y) + mask.shape[0]) + y = int(y) + x = int(x) + im_mask[y0:y1, x0:x1] = mask[y0 - y : y1 - y, x0 - x : x1 - x] + im_mask = np.require(np.asarray(im_mask > 0), dtype=np.uint8, requirements=["F"]) + rle_mask = maskUtils.encode(np.array(im_mask[:, :, np.newaxis], order="F"))[0] + return rle_mask + + def computeDPIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0 : p.maxDets[-1]] + + gtmasks = [] + for g in gt: + if DensePoseDataRelative.S_KEY in g: + mask = self.getDensePoseMask(g[DensePoseDataRelative.S_KEY]) + _, _, w, h = g["bbox"] + scale_x = float(max(w, 1)) / mask.shape[1] + scale_y = float(max(h, 1)) / mask.shape[0] + mask = spzoom(mask, (scale_y, scale_x), order=1, prefilter=False) + mask = np.array(mask > 0.5, dtype=np.uint8) + rle_mask = self._generate_rlemask_on_image(mask, imgId, g) + elif "segmentation" in 
g: + segmentation = g["segmentation"] + if isinstance(segmentation, list) and segmentation: + # polygons + im_h, im_w = self.size_mapping[imgId] + rles = maskUtils.frPyObjects(segmentation, im_h, im_w) + rle_mask = maskUtils.merge(rles) + elif isinstance(segmentation, dict): + if isinstance(segmentation["counts"], list): + # uncompressed RLE + im_h, im_w = self.size_mapping[imgId] + rle_mask = maskUtils.frPyObjects(segmentation, im_h, im_w) + else: + # compressed RLE + rle_mask = segmentation + else: + rle_mask = self._generate_rlemask_on_image(None, imgId, g) + else: + rle_mask = self._generate_rlemask_on_image(None, imgId, g) + gtmasks.append(rle_mask) + + dtmasks = [] + for d in dt: + mask = self._extract_mask(d) + mask = np.require(np.asarray(mask > 0), dtype=np.uint8, requirements=["F"]) + rle_mask = self._generate_rlemask_on_image(mask, imgId, d) + dtmasks.append(rle_mask) + + # compute iou between each dt and gt region + iscrowd = [int(o["iscrowd"]) for o in gt] + iousDP = maskUtils.iou(dtmasks, gtmasks, iscrowd) + return iousDP + + def computeIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return [] + inds = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0 : p.maxDets[-1]] + + if p.iouType == "segm": + g = [g["segmentation"] for g in gt] + d = [d["segmentation"] for d in dt] + elif p.iouType == "bbox": + g = [g["bbox"] for g in gt] + d = [d["bbox"] for d in dt] + else: + raise Exception("unknown iouType for iou computation") + + # compute iou between each dt and gt region + iscrowd = [int(o["iscrowd"]) for o in gt] + ious = maskUtils.iou(d, g, iscrowd) + return ious + + def computeOks(self, imgId, catId): + p = self.params + # dimension here should be Nxm + gts = self._gts[imgId, catId] + dts = self._dts[imgId, catId] + inds = np.argsort([-d["score"] for d in dts], kind="mergesort") + dts = [dts[i] for i in inds] + if len(dts) > p.maxDets[-1]: + dts = dts[0 : p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(gts) == 0 or len(dts) == 0: + return [] + ious = np.zeros((len(dts), len(gts))) + sigmas = ( + np.array( + [ + 0.26, + 0.25, + 0.25, + 0.35, + 0.35, + 0.79, + 0.79, + 0.72, + 0.72, + 0.62, + 0.62, + 1.07, + 1.07, + 0.87, + 0.87, + 0.89, + 0.89, + ] + ) + / 10.0 + ) + vars = (sigmas * 2) ** 2 + k = len(sigmas) + # compute oks between each detection and ground truth object + for j, gt in enumerate(gts): + # create bounds for ignore regions(double the gt bbox) + g = np.array(gt["keypoints"]) + xg = g[0::3] + yg = g[1::3] + vg = g[2::3] + k1 = np.count_nonzero(vg > 0) + bb = gt["bbox"] + x0 = bb[0] - bb[2] + x1 = bb[0] + bb[2] * 2 + y0 = bb[1] - bb[3] + y1 = bb[1] + bb[3] * 2 + for i, dt in enumerate(dts): + d = np.array(dt["keypoints"]) + xd = d[0::3] + yd = d[1::3] + if k1 > 0: + # measure the per-keypoint distance if keypoints visible + dx = xd - xg + dy = yd - yg + else: + # measure minimum distance to keypoints in (x0,y0) & (x1,y1) + z = np.zeros(k) + dx = np.max((z, x0 - xd), axis=0) + np.max((z, xd - x1), axis=0) + dy = np.max((z, y0 - yd), axis=0) + np.max((z, yd - y1), axis=0) + e = (dx ** 2 + dy ** 2) / vars / (gt["area"] + np.spacing(1)) / 2 + if k1 > 0: + e = e[vg > 0] + ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] + return ious + + def 
_extract_mask(self, dt: Dict[str, Any]) -> np.ndarray: + (densepose_shape, densepose_data_encoded), densepose_bbox_xywh = dt["densepose"] + densepose_data = DensePoseResult.decode_png_data(densepose_shape, densepose_data_encoded) + return densepose_data[0] + + def _extract_iuv( + self, densepose_data: np.ndarray, py: np.ndarray, px: np.ndarray, gt: Dict[str, Any] + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Extract arrays of I, U and V values at given points as numpy arrays + given the data mode stored in self._dpDataMode + """ + if self._dpDataMode == DensePoseDataMode.IUV_DT: + # estimated labels and UV (default) + ipoints = densepose_data[0, py, px] + upoints = densepose_data[1, py, px] / 255.0 # convert from uint8 by /255. + vpoints = densepose_data[2, py, px] / 255.0 + elif self._dpDataMode == DensePoseDataMode.IUV_GT: + # ground truth + ipoints = np.array(gt["dp_I"]) + upoints = np.array(gt["dp_U"]) + vpoints = np.array(gt["dp_V"]) + elif self._dpDataMode == DensePoseDataMode.I_GT_UV_0: + # ground truth labels, UV = 0 + ipoints = np.array(gt["dp_I"]) + upoints = upoints * 0.0 + vpoints = vpoints * 0.0 + elif self._dpDataMode == DensePoseDataMode.I_GT_UV_DT: + # ground truth labels, estimated UV + ipoints = np.array(gt["dp_I"]) + upoints = densepose_data[1, py, px] / 255.0 # convert from uint8 by /255. + vpoints = densepose_data[2, py, px] / 255.0 + elif self._dpDataMode == DensePoseDataMode.I_DT_UV_0: + # estimated labels, UV = 0 + ipoints = densepose_data[0, py, px] + upoints = upoints * 0.0 + vpoints = vpoints * 0.0 + else: + raise ValueError(f"Unknown data mode: {self._dpDataMode}") + return ipoints, upoints, vpoints + + def computeOgps(self, imgId, catId): + p = self.params + # dimension here should be Nxm + g = self._gts[imgId, catId] + d = self._dts[imgId, catId] + inds = np.argsort([-d_["score"] for d_ in d], kind="mergesort") + d = [d[i] for i in inds] + if len(d) > p.maxDets[-1]: + d = d[0 : p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(g) == 0 or len(d) == 0: + return [] + ious = np.zeros((len(d), len(g))) + # compute opgs between each detection and ground truth object + # sigma = self.sigma #0.255 # dist = 0.3m corresponds to ogps = 0.5 + # 1 # dist = 0.3m corresponds to ogps = 0.96 + # 1.45 # dist = 1.7m (person height) corresponds to ogps = 0.5) + for j, gt in enumerate(g): + if not gt["ignore"]: + g_ = gt["bbox"] + for i, dt in enumerate(d): + # + dy = int(dt["bbox"][3]) + dx = int(dt["bbox"][2]) + dp_x = np.array(gt["dp_x"]) * g_[2] / 255.0 + dp_y = np.array(gt["dp_y"]) * g_[3] / 255.0 + py = (dp_y + g_[1] - dt["bbox"][1]).astype(np.int) + px = (dp_x + g_[0] - dt["bbox"][0]).astype(np.int) + # + pts = np.zeros(len(px)) + pts[px >= dx] = -1 + pts[py >= dy] = -1 + pts[px < 0] = -1 + pts[py < 0] = -1 + if len(pts) < 1: + ogps = 0.0 + elif np.max(pts) == -1: + ogps = 0.0 + else: + px[pts == -1] = 0 + py[pts == -1] = 0 + (densepose_shape, densepose_data_encoded), densepose_bbox_xywh = dt[ + "densepose" + ] + densepose_data = DensePoseResult.decode_png_data( + densepose_shape, densepose_data_encoded + ) + assert densepose_data.shape[2] == dx, ( + "DensePoseData width {} should be equal to " + "detection bounding box width {}".format(densepose_data.shape[2], dx) + ) + assert densepose_data.shape[1] == dy, ( + "DensePoseData height {} should be equal to " + "detection bounding box height {}".format(densepose_data.shape[1], dy) + ) + ipoints, upoints, vpoints = self._extract_iuv(densepose_data, py, px, gt) + ipoints[pts == -1] = 0 + # Find closest 
vertices in subsampled mesh. + cVerts, cVertsGT = self.findAllClosestVerts(gt, upoints, vpoints, ipoints) + # Get pairwise geodesic distances between gt and estimated mesh points. + dist = self.getDistances(cVertsGT, cVerts) + # Compute the Ogps measure. + # Find the mean geodesic normalization distance for + # each GT point, based on which part it is on. + Current_Mean_Distances = self.Mean_Distances[ + self.CoarseParts[self.Part_ids[cVertsGT[cVertsGT > 0].astype(int) - 1]] + ] + # Compute gps + ogps_values = np.exp(-(dist ** 2) / (2 * (Current_Mean_Distances ** 2))) + # + if len(dist) > 0: + ogps = np.sum(ogps_values) / len(dist) + ious[i, j] = ogps + + gbb = [gt["bbox"] for gt in g] + dbb = [dt["bbox"] for dt in d] + + # compute iou between each dt and gt region + iscrowd = [int(o["iscrowd"]) for o in g] + ious_bb = maskUtils.iou(dbb, gbb, iscrowd) + return ious, ious_bb + + def evaluateImg(self, imgId, catId, aRng, maxDet): + """ + perform evaluation for single category and image + :return: dict (single image results) + """ + + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + if len(gt) == 0 and len(dt) == 0: + return None + + for g in gt: + # g['_ignore'] = g['ignore'] + if g["ignore"] or (g["area"] < aRng[0] or g["area"] > aRng[1]): + g["_ignore"] = True + else: + g["_ignore"] = False + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort") + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in dtind[0:maxDet]] + iscrowd = [int(o["iscrowd"]) for o in gt] + # load computed ious + if p.iouType == "densepose": + # print('Checking the length', len(self.ious[imgId, catId])) + # if len(self.ious[imgId, catId]) == 0: + # print(self.ious[imgId, catId]) + ious = ( + self.ious[imgId, catId][0][:, gtind] + if len(self.ious[imgId, catId]) > 0 + else self.ious[imgId, catId] + ) + ioubs = ( + self.ious[imgId, catId][1][:, gtind] + if len(self.ious[imgId, catId]) > 0 + else self.ious[imgId, catId] + ) + if self._dpEvalMode == DensePoseEvalMode.GPSM: + iousM = ( + self.real_ious[imgId, catId][:, gtind] + if len(self.real_ious[imgId, catId]) > 0 + else self.real_ious[imgId, catId] + ) + else: + ious = ( + self.ious[imgId, catId][:, gtind] + if len(self.ious[imgId, catId]) > 0 + else self.ious[imgId, catId] + ) + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T, G)) + dtm = np.zeros((T, D)) + gtIg = np.array([g["_ignore"] for g in gt]) + dtIg = np.zeros((T, D)) + if np.all(gtIg) and p.iouType == "densepose": + dtIg = np.logical_or(dtIg, True) + + if len(ious) > 0: # and not p.iouType == 'densepose': + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + for gind, _g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + if p.iouType == "densepose": + if self._dpEvalMode == DensePoseEvalMode.GPSM: + new_iou = np.sqrt(iousM[dind, gind] * ious[dind, gind]) + elif self._dpEvalMode == DensePoseEvalMode.IOU: + new_iou = iousM[dind, gind] + elif self._dpEvalMode == 
DensePoseEvalMode.GPS: + new_iou = ious[dind, gind] + else: + new_iou = ious[dind, gind] + if new_iou < iou: + continue + if new_iou == 0.0: + continue + # if match successful and best so far, store appropriately + iou = new_iou + m = gind + # if match made store id of match for both dt and gt + if m == -1: + continue + dtIg[tind, dind] = gtIg[m] + dtm[tind, dind] = gt[m]["id"] + gtm[tind, m] = d["id"] + + if p.iouType == "densepose": + if not len(ioubs) == 0: + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + if dtm[tind, dind] == 0: + ioub = 0.8 + m = -1 + for gind, _g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind, gind] > 0 and not iscrowd[gind]: + continue + # continue to next gt unless better match made + if ioubs[dind, gind] < ioub: + continue + # if match successful and best so far, store appropriately + ioub = ioubs[dind, gind] + m = gind + # if match made store id of match for both dt and gt + if m > -1: + dtIg[:, dind] = gtIg[m] + if gtIg[m]: + dtm[tind, dind] = gt[m]["id"] + gtm[tind, m] = d["id"] + # set unmatched detections outside of area range to ignore + a = np.array([d["area"] < aRng[0] or d["area"] > aRng[1] for d in dt]).reshape((1, len(dt))) + dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0))) + # store results for given image and category + # print('Done with the function', len(self.ious[imgId, catId])) + return { + "image_id": imgId, + "category_id": catId, + "aRng": aRng, + "maxDet": maxDet, + "dtIds": [d["id"] for d in dt], + "gtIds": [g["id"] for g in gt], + "dtMatches": dtm, + "gtMatches": gtm, + "dtScores": [d["score"] for d in dt], + "gtIgnore": gtIg, + "dtIgnore": dtIg, + } + + def accumulate(self, p=None): + """ + Accumulate per image evaluation results and store the result in self.eval + :param p: input params for evaluation + :return: None + """ + logger.info("Accumulating evaluation results...") + tic = time.time() + if not self.evalImgs: + logger.info("Please run evaluate() first") + # allows input customized parameters + if p is None: + p = self.params + p.catIds = p.catIds if p.useCats == 1 else [-1] + T = len(p.iouThrs) + R = len(p.recThrs) + K = len(p.catIds) if p.useCats else 1 + A = len(p.areaRng) + M = len(p.maxDets) + precision = -(np.ones((T, R, K, A, M))) # -1 for the precision of absent categories + recall = -(np.ones((T, K, A, M))) + + # create dictionary for future indexing + logger.info("Categories: {}".format(p.catIds)) + _pe = self._paramsEval + catIds = _pe.catIds if _pe.useCats else [-1] + setK = set(catIds) + setA = set(map(tuple, _pe.areaRng)) + setM = set(_pe.maxDets) + setI = set(_pe.imgIds) + # get inds to evaluate + k_list = [n for n, k in enumerate(p.catIds) if k in setK] + m_list = [m for n, m in enumerate(p.maxDets) if m in setM] + a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] + i_list = [n for n, i in enumerate(p.imgIds) if i in setI] + I0 = len(_pe.imgIds) + A0 = len(_pe.areaRng) + # retrieve E at each category, area range, and max number of detections + for k, k0 in enumerate(k_list): + Nk = k0 * A0 * I0 + for a, a0 in enumerate(a_list): + Na = a0 * I0 + for m, maxDet in enumerate(m_list): + E = [self.evalImgs[Nk + Na + i] for i in i_list] + E = [e for e in E if e is not None] + if len(E) == 0: + continue + dtScores = np.concatenate([e["dtScores"][0:maxDet] for e in E]) + + # different sorting method generates slightly different results. 
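+                    # (np.argsort is not stable by default, hence the explicit kind below.)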
+ # mergesort is used to be consistent as Matlab implementation. + inds = np.argsort(-dtScores, kind="mergesort") + + dtm = np.concatenate([e["dtMatches"][:, 0:maxDet] for e in E], axis=1)[:, inds] + dtIg = np.concatenate([e["dtIgnore"][:, 0:maxDet] for e in E], axis=1)[:, inds] + gtIg = np.concatenate([e["gtIgnore"] for e in E]) + npig = np.count_nonzero(gtIg == 0) + if npig == 0: + continue + tps = np.logical_and(dtm, np.logical_not(dtIg)) + fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg)) + tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) + for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): + tp = np.array(tp) + fp = np.array(fp) + nd = len(tp) + rc = tp / npig + pr = tp / (fp + tp + np.spacing(1)) + q = np.zeros((R,)) + + if nd: + recall[t, k, a, m] = rc[-1] + else: + recall[t, k, a, m] = 0 + + # numpy is slow without cython optimization for accessing elements + # use python array gets significant speed improvement + pr = pr.tolist() + q = q.tolist() + + for i in range(nd - 1, 0, -1): + if pr[i] > pr[i - 1]: + pr[i - 1] = pr[i] + + inds = np.searchsorted(rc, p.recThrs, side="left") + try: + for ri, pi in enumerate(inds): + q[ri] = pr[pi] + except Exception: + pass + precision[t, :, k, a, m] = np.array(q) + logger.info( + "Final: max precision {}, min precision {}".format(np.max(precision), np.min(precision)) + ) + self.eval = { + "params": p, + "counts": [T, R, K, A, M], + "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + "precision": precision, + "recall": recall, + } + toc = time.time() + logger.info("DONE (t={:0.2f}s).".format(toc - tic)) + + def summarize(self): + """ + Compute and display summary metrics for evaluation results. + Note this function can *only* be applied on the default parameter setting + """ + + def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): + p = self.params + iStr = " {:<18} {} @[ {}={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" + titleStr = "Average Precision" if ap == 1 else "Average Recall" + typeStr = "(AP)" if ap == 1 else "(AR)" + measure = "IoU" + if self.params.iouType == "keypoints": + measure = "OKS" + elif self.params.iouType == "densepose": + measure = "OGPS" + iouStr = ( + "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) + if iouThr is None + else "{:0.2f}".format(iouThr) + ) + + aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval["precision"] + # IoU + if iouThr is not None: + t = np.where(np.abs(iouThr - p.iouThrs) < 0.001)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval["recall"] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + logger.info(iStr.format(titleStr, typeStr, measure, iouStr, areaRng, maxDets, mean_s)) + return mean_s + + def _summarizeDets(): + stats = np.zeros((12,)) + stats[0] = _summarize(1) + stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) + stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) + stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) + stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) + stats[6] = 
_summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) + stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) + stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) + return stats + + def _summarizeKps(): + stats = np.zeros((10,)) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=0.5) + stats[2] = _summarize(1, maxDets=20, iouThr=0.75) + stats[3] = _summarize(1, maxDets=20, areaRng="medium") + stats[4] = _summarize(1, maxDets=20, areaRng="large") + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=0.5) + stats[7] = _summarize(0, maxDets=20, iouThr=0.75) + stats[8] = _summarize(0, maxDets=20, areaRng="medium") + stats[9] = _summarize(0, maxDets=20, areaRng="large") + return stats + + def _summarizeUvs(): + stats = np.zeros((10,)) + stats[0] = _summarize(1, maxDets=self.params.maxDets[0]) + stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5) + stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75) + stats[3] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium") + stats[4] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large") + stats[5] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5) + stats[7] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75) + stats[8] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium") + stats[9] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large") + return stats + + def _summarizeUvsOld(): + stats = np.zeros((18,)) + stats[0] = _summarize(1, maxDets=self.params.maxDets[0]) + stats[1] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.5) + stats[2] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.55) + stats[3] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.60) + stats[4] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.65) + stats[5] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.70) + stats[6] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.75) + stats[7] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.80) + stats[8] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.85) + stats[9] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.90) + stats[10] = _summarize(1, maxDets=self.params.maxDets[0], iouThr=0.95) + stats[11] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="medium") + stats[12] = _summarize(1, maxDets=self.params.maxDets[0], areaRng="large") + stats[13] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[14] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.5) + stats[15] = _summarize(0, maxDets=self.params.maxDets[0], iouThr=0.75) + stats[16] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="medium") + stats[17] = _summarize(0, maxDets=self.params.maxDets[0], areaRng="large") + return stats + + if not self.eval: + raise Exception("Please run accumulate() first") + iouType = self.params.iouType + if iouType in ["segm", "bbox"]: + summarize = _summarizeDets + elif iouType in ["keypoints"]: + summarize = _summarizeKps + elif iouType in ["densepose"]: + summarize = _summarizeUvs + self.stats = summarize() + + def __str__(self): + self.summarize() + + # ================ functions for dense pose 
============================== + def findAllClosestVerts(self, gt, U_points, V_points, Index_points): + # + I_gt = np.array(gt["dp_I"]) + U_gt = np.array(gt["dp_U"]) + V_gt = np.array(gt["dp_V"]) + # + # print(I_gt) + # + ClosestVerts = np.ones(Index_points.shape) * -1 + for i in np.arange(24): + # + if sum(Index_points == (i + 1)) > 0: + UVs = np.array( + [U_points[Index_points == (i + 1)], V_points[Index_points == (i + 1)]] + ) + Current_Part_UVs = self.Part_UVs[i] + Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i] + D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze() + ClosestVerts[Index_points == (i + 1)] = Current_Part_ClosestVertInds[ + np.argmin(D, axis=0) + ] + # + ClosestVertsGT = np.ones(Index_points.shape) * -1 + for i in np.arange(24): + if sum(I_gt == (i + 1)) > 0: + UVs = np.array([U_gt[I_gt == (i + 1)], V_gt[I_gt == (i + 1)]]) + Current_Part_UVs = self.Part_UVs[i] + Current_Part_ClosestVertInds = self.Part_ClosestVertInds[i] + D = ssd.cdist(Current_Part_UVs.transpose(), UVs.transpose()).squeeze() + ClosestVertsGT[I_gt == (i + 1)] = Current_Part_ClosestVertInds[np.argmin(D, axis=0)] + # + return ClosestVerts, ClosestVertsGT + + def getDistances(self, cVertsGT, cVerts): + + ClosestVertsTransformed = self.PDIST_transform[cVerts.astype(int) - 1] + ClosestVertsGTTransformed = self.PDIST_transform[cVertsGT.astype(int) - 1] + # + ClosestVertsTransformed[cVerts < 0] = 0 + ClosestVertsGTTransformed[cVertsGT < 0] = 0 + # + cVertsGT = ClosestVertsGTTransformed + cVerts = ClosestVertsTransformed + # + n = 27554 + dists = [] + for d in range(len(cVertsGT)): + if cVertsGT[d] > 0: + if cVerts[d] > 0: + i = cVertsGT[d] - 1 + j = cVerts[d] - 1 + if j == i: + dists.append(0) + elif j > i: + ccc = i + i = j + j = ccc + i = n - i - 1 + j = n - j - 1 + k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1 + k = (n * n - n) / 2 - k - 1 + dists.append(self.Pdist_matrix[int(k)][0]) + else: + i = n - i - 1 + j = n - j - 1 + k = (n * (n - 1) / 2) - (n - i) * ((n - i) - 1) / 2 + j - i - 1 + k = (n * n - n) / 2 - k - 1 + dists.append(self.Pdist_matrix[int(k)][0]) + else: + dists.append(np.inf) + return np.atleast_1d(np.array(dists).squeeze()) + + +class Params: + """ + Params for coco evaluation api + """ + + def setDetParams(self): + self.imgIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True) + self.recThrs = np.linspace(0.0, 1.00, np.round((1.00 - 0.0) / 0.01) + 1, endpoint=True) + self.maxDets = [1, 10, 100] + self.areaRng = [ + [0 ** 2, 1e5 ** 2], + [0 ** 2, 32 ** 2], + [32 ** 2, 96 ** 2], + [96 ** 2, 1e5 ** 2], + ] + self.areaRngLbl = ["all", "small", "medium", "large"] + self.useCats = 1 + + def setKpParams(self): + self.imgIds = [] + self.catIds = [] + # np.arange causes trouble. 
the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace(0.5, 0.95, np.round((0.95 - 0.5) / 0.05) + 1, endpoint=True) + self.recThrs = np.linspace(0.0, 1.00, np.round((1.00 - 0.0) / 0.01) + 1, endpoint=True) + self.maxDets = [20] + self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] + self.areaRngLbl = ["all", "medium", "large"] + self.useCats = 1 + + def setUvParams(self): + self.imgIds = [] + self.catIds = [] + self.iouThrs = np.linspace(0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True) + self.recThrs = np.linspace(0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True) + self.maxDets = [20] + self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] + self.areaRngLbl = ["all", "medium", "large"] + self.useCats = 1 + + def __init__(self, iouType="segm"): + if iouType == "segm" or iouType == "bbox": + self.setDetParams() + elif iouType == "keypoints": + self.setKpParams() + elif iouType == "densepose": + self.setUvParams() + else: + raise Exception("iouType not supported") + self.iouType = iouType + # useSegm is deprecated + self.useSegm = None diff --git a/projects/DensePose/densepose/densepose_head.py b/projects/DensePose/densepose/densepose_head.py new file mode 100644 index 0000000000000000000000000000000000000000..363970681db36a41d5bc5b1960960a2a8bf23855 --- /dev/null +++ b/projects/DensePose/densepose/densepose_head.py @@ -0,0 +1,1216 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import math +from dataclasses import dataclass +from enum import Enum +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.config import CfgNode +from detectron2.layers import Conv2d, ConvTranspose2d, interpolate +from detectron2.structures.boxes import matched_boxlist_iou +from detectron2.utils.registry import Registry + +from .data.structures import DensePoseOutput + +ROI_DENSEPOSE_HEAD_REGISTRY = Registry("ROI_DENSEPOSE_HEAD") + + +class DensePoseUVConfidenceType(Enum): + """ + Statistical model type for confidence learning, possible values: + - "iid_iso": statistically independent identically distributed residuals + with anisotropic covariance + - "indep_aniso": statistically independent residuals with anisotropic + covariances + For details, see: + N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning + Dense Correspondences from Noisy Labels", p. 918--926, in Proc. 
NIPS 2019 + """ + + # fmt: off + IID_ISO = "iid_iso" + INDEP_ANISO = "indep_aniso" + # fmt: on + + +@dataclass +class DensePoseUVConfidenceConfig: + """ + Configuration options for confidence on UV data + """ + + enabled: bool = False + # lower bound on UV confidences + epsilon: float = 0.01 + type: DensePoseUVConfidenceType = DensePoseUVConfidenceType.IID_ISO + + +@dataclass +class DensePoseConfidenceModelConfig: + """ + Configuration options for confidence models + """ + + # confidence for U and V values + uv_confidence: DensePoseUVConfidenceConfig + + @staticmethod + def from_cfg(cfg: CfgNode) -> "DensePoseConfidenceModelConfig": + return DensePoseConfidenceModelConfig( + uv_confidence=DensePoseUVConfidenceConfig( + enabled=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.ENABLED, + epsilon=cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.EPSILON, + type=DensePoseUVConfidenceType(cfg.MODEL.ROI_DENSEPOSE_HEAD.UV_CONFIDENCE.TYPE), + ) + ) + + +def initialize_module_params(module): + for name, param in module.named_parameters(): + if "bias" in name: + nn.init.constant_(param, 0) + elif "weight" in name: + nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu") + + +@ROI_DENSEPOSE_HEAD_REGISTRY.register() +class DensePoseDeepLabHead(nn.Module): + def __init__(self, cfg, input_channels): + super(DensePoseDeepLabHead, self).__init__() + # fmt: off + hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM + kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL + norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NORM + self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS + self.use_nonlocal = cfg.MODEL.ROI_DENSEPOSE_HEAD.DEEPLAB.NONLOCAL_ON + # fmt: on + pad_size = kernel_size // 2 + n_channels = input_channels + + self.ASPP = ASPP(input_channels, [6, 12, 56], n_channels) # 6, 12, 56 + self.add_module("ASPP", self.ASPP) + + if self.use_nonlocal: + self.NLBlock = NONLocalBlock2D(input_channels, bn_layer=True) + self.add_module("NLBlock", self.NLBlock) + # weight_init.c2_msra_fill(self.ASPP) + + for i in range(self.n_stacked_convs): + norm_module = nn.GroupNorm(32, hidden_dim) if norm == "GN" else None + layer = Conv2d( + n_channels, + hidden_dim, + kernel_size, + stride=1, + padding=pad_size, + bias=not norm, + norm=norm_module, + ) + weight_init.c2_msra_fill(layer) + n_channels = hidden_dim + layer_name = self._get_layer_name(i) + self.add_module(layer_name, layer) + self.n_out_channels = hidden_dim + # initialize_module_params(self) + + def forward(self, features): + x0 = features + x = self.ASPP(x0) + if self.use_nonlocal: + x = self.NLBlock(x) + output = x + for i in range(self.n_stacked_convs): + layer_name = self._get_layer_name(i) + x = getattr(self, layer_name)(x) + x = F.relu(x) + output = x + return output + + def _get_layer_name(self, i): + layer_name = "body_conv_fcn{}".format(i + 1) + return layer_name + + +# Copied from +# https://github.com/pytorch/vision/blob/master/torchvision/models/segmentation/deeplabv3.py +# See https://arxiv.org/pdf/1706.05587.pdf for details +class ASPPConv(nn.Sequential): + def __init__(self, in_channels, out_channels, dilation): + modules = [ + nn.Conv2d( + in_channels, out_channels, 3, padding=dilation, dilation=dilation, bias=False + ), + nn.GroupNorm(32, out_channels), + nn.ReLU(), + ] + super(ASPPConv, self).__init__(*modules) + + +class ASPPPooling(nn.Sequential): + def __init__(self, in_channels, out_channels): + super(ASPPPooling, self).__init__( + nn.AdaptiveAvgPool2d(1), + nn.Conv2d(in_channels, out_channels, 1, bias=False), + 
nn.GroupNorm(32, out_channels), + nn.ReLU(), + ) + + def forward(self, x): + size = x.shape[-2:] + x = super(ASPPPooling, self).forward(x) + return F.interpolate(x, size=size, mode="bilinear", align_corners=False) + + +class ASPP(nn.Module): + def __init__(self, in_channels, atrous_rates, out_channels): + super(ASPP, self).__init__() + modules = [] + modules.append( + nn.Sequential( + nn.Conv2d(in_channels, out_channels, 1, bias=False), + nn.GroupNorm(32, out_channels), + nn.ReLU(), + ) + ) + + rate1, rate2, rate3 = tuple(atrous_rates) + modules.append(ASPPConv(in_channels, out_channels, rate1)) + modules.append(ASPPConv(in_channels, out_channels, rate2)) + modules.append(ASPPConv(in_channels, out_channels, rate3)) + modules.append(ASPPPooling(in_channels, out_channels)) + + self.convs = nn.ModuleList(modules) + + self.project = nn.Sequential( + nn.Conv2d(5 * out_channels, out_channels, 1, bias=False), + # nn.BatchNorm2d(out_channels), + nn.ReLU() + # nn.Dropout(0.5) + ) + + def forward(self, x): + res = [] + for conv in self.convs: + res.append(conv(x)) + res = torch.cat(res, dim=1) + return self.project(res) + + +# copied from +# https://github.com/AlexHex7/Non-local_pytorch/blob/master/lib/non_local_embedded_gaussian.py +# See https://arxiv.org/abs/1711.07971 for details +class _NonLocalBlockND(nn.Module): + def __init__( + self, in_channels, inter_channels=None, dimension=3, sub_sample=True, bn_layer=True + ): + super(_NonLocalBlockND, self).__init__() + + assert dimension in [1, 2, 3] + + self.dimension = dimension + self.sub_sample = sub_sample + + self.in_channels = in_channels + self.inter_channels = inter_channels + + if self.inter_channels is None: + self.inter_channels = in_channels // 2 + if self.inter_channels == 0: + self.inter_channels = 1 + + if dimension == 3: + conv_nd = nn.Conv3d + max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) + bn = nn.GroupNorm # (32, hidden_dim) #nn.BatchNorm3d + elif dimension == 2: + conv_nd = nn.Conv2d + max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) + bn = nn.GroupNorm # (32, hidden_dim)nn.BatchNorm2d + else: + conv_nd = nn.Conv1d + max_pool_layer = nn.MaxPool1d(kernel_size=2) + bn = nn.GroupNorm # (32, hidden_dim)nn.BatchNorm1d + + self.g = conv_nd( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1, + stride=1, + padding=0, + ) + + if bn_layer: + self.W = nn.Sequential( + conv_nd( + in_channels=self.inter_channels, + out_channels=self.in_channels, + kernel_size=1, + stride=1, + padding=0, + ), + bn(32, self.in_channels), + ) + nn.init.constant_(self.W[1].weight, 0) + nn.init.constant_(self.W[1].bias, 0) + else: + self.W = conv_nd( + in_channels=self.inter_channels, + out_channels=self.in_channels, + kernel_size=1, + stride=1, + padding=0, + ) + nn.init.constant_(self.W.weight, 0) + nn.init.constant_(self.W.bias, 0) + + self.theta = conv_nd( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1, + stride=1, + padding=0, + ) + self.phi = conv_nd( + in_channels=self.in_channels, + out_channels=self.inter_channels, + kernel_size=1, + stride=1, + padding=0, + ) + + if sub_sample: + self.g = nn.Sequential(self.g, max_pool_layer) + self.phi = nn.Sequential(self.phi, max_pool_layer) + + def forward(self, x): + """ + :param x: (b, c, t, h, w) + :return: + """ + + batch_size = x.size(0) + + g_x = self.g(x).view(batch_size, self.inter_channels, -1) + g_x = g_x.permute(0, 2, 1) + + theta_x = self.theta(x).view(batch_size, self.inter_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + 
phi_x = self.phi(x).view(batch_size, self.inter_channels, -1) + f = torch.matmul(theta_x, phi_x) + f_div_C = F.softmax(f, dim=-1) + + y = torch.matmul(f_div_C, g_x) + y = y.permute(0, 2, 1).contiguous() + y = y.view(batch_size, self.inter_channels, *x.size()[2:]) + W_y = self.W(y) + z = W_y + x + + return z + + +class NONLocalBlock2D(_NonLocalBlockND): + def __init__(self, in_channels, inter_channels=None, sub_sample=True, bn_layer=True): + super(NONLocalBlock2D, self).__init__( + in_channels, + inter_channels=inter_channels, + dimension=2, + sub_sample=sub_sample, + bn_layer=bn_layer, + ) + + +@ROI_DENSEPOSE_HEAD_REGISTRY.register() +class DensePoseV1ConvXHead(nn.Module): + def __init__(self, cfg, input_channels): + super(DensePoseV1ConvXHead, self).__init__() + # fmt: off + hidden_dim = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_DIM + kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.CONV_HEAD_KERNEL + self.n_stacked_convs = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_STACKED_CONVS + # fmt: on + pad_size = kernel_size // 2 + n_channels = input_channels + for i in range(self.n_stacked_convs): + layer = Conv2d(n_channels, hidden_dim, kernel_size, stride=1, padding=pad_size) + layer_name = self._get_layer_name(i) + self.add_module(layer_name, layer) + n_channels = hidden_dim + self.n_out_channels = n_channels + initialize_module_params(self) + + def forward(self, features): + x = features + output = x + for i in range(self.n_stacked_convs): + layer_name = self._get_layer_name(i) + x = getattr(self, layer_name)(x) + x = F.relu(x) + output = x + return output + + def _get_layer_name(self, i): + layer_name = "body_conv_fcn{}".format(i + 1) + return layer_name + + +class DensePosePredictor(nn.Module): + def __init__(self, cfg, input_channels): + + super(DensePosePredictor, self).__init__() + dim_in = input_channels + n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS + dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1 + kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL + self.ann_index_lowres = ConvTranspose2d( + dim_in, n_segm_chan, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + self.index_uv_lowres = ConvTranspose2d( + dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + self.u_lowres = ConvTranspose2d( + dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + self.v_lowres = ConvTranspose2d( + dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + self.scale_factor = cfg.MODEL.ROI_DENSEPOSE_HEAD.UP_SCALE + self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg) + self._initialize_confidence_estimation_layers(cfg, self.confidence_model_cfg, dim_in) + initialize_module_params(self) + + def forward(self, head_outputs): + ann_index_lowres = self.ann_index_lowres(head_outputs) + index_uv_lowres = self.index_uv_lowres(head_outputs) + u_lowres = self.u_lowres(head_outputs) + v_lowres = self.v_lowres(head_outputs) + + def interp2d(input): + return interpolate( + input, scale_factor=self.scale_factor, mode="bilinear", align_corners=False + ) + + ann_index = interp2d(ann_index_lowres) + index_uv = interp2d(index_uv_lowres) + u = interp2d(u_lowres) + v = interp2d(v_lowres) + ( + (sigma_1, sigma_2, kappa_u, kappa_v), + (sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres), + (ann_index, index_uv), + ) = self._forward_confidence_estimation_layers( + self.confidence_model_cfg, head_outputs, interp2d, ann_index, index_uv + ) + return ( + (ann_index, 
index_uv, u, v), + (ann_index_lowres, index_uv_lowres, u_lowres, v_lowres), + (sigma_1, sigma_2, kappa_u, kappa_v), + (sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres), + ) + + def _initialize_confidence_estimation_layers( + self, cfg: CfgNode, confidence_model_cfg: DensePoseConfidenceModelConfig, dim_in: int + ): + dim_out_patches = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_PATCHES + 1 + kernel_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECONV_KERNEL + if confidence_model_cfg.uv_confidence.enabled: + if confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO: + self.sigma_2_lowres = ConvTranspose2d( + dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + elif confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO: + self.sigma_2_lowres = ConvTranspose2d( + dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + self.kappa_u_lowres = ConvTranspose2d( + dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + self.kappa_v_lowres = ConvTranspose2d( + dim_in, dim_out_patches, kernel_size, stride=2, padding=int(kernel_size / 2 - 1) + ) + else: + raise ValueError( + f"Unknown confidence model type: {confidence_model_cfg.confidence_model_type}" + ) + + def _forward_confidence_estimation_layers( + self, confidence_model_cfg, head_outputs, interp2d, ann_index, index_uv + ): + sigma_1, sigma_2, kappa_u, kappa_v = None, None, None, None + sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres = None, None, None, None + if confidence_model_cfg.uv_confidence.enabled: + if confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO: + sigma_2_lowres = self.sigma_2_lowres(head_outputs) + sigma_2 = interp2d(sigma_2_lowres) + elif confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO: + sigma_2_lowres = self.sigma_2_lowres(head_outputs) + kappa_u_lowres = self.kappa_u_lowres(head_outputs) + kappa_v_lowres = self.kappa_v_lowres(head_outputs) + sigma_2 = interp2d(sigma_2_lowres) + kappa_u = interp2d(kappa_u_lowres) + kappa_v = interp2d(kappa_v_lowres) + else: + raise ValueError( + f"Unknown confidence model type: {confidence_model_cfg.confidence_model_type}" + ) + return ( + (sigma_1, sigma_2, kappa_u, kappa_v), + (sigma_1_lowres, sigma_2_lowres, kappa_u_lowres, kappa_v_lowres), + (ann_index, index_uv), + ) + + +class DensePoseDataFilter(object): + def __init__(self, cfg): + self.iou_threshold = cfg.MODEL.ROI_DENSEPOSE_HEAD.FG_IOU_THRESHOLD + + @torch.no_grad() + def __call__(self, proposals_with_targets): + """ + Filters proposals with targets to keep only the ones relevant for + DensePose training + proposals: list(Instances), each element of the list corresponds to + various instances (proposals, GT for boxes and densepose) for one + image + """ + proposals_filtered = [] + for proposals_per_image in proposals_with_targets: + if not hasattr(proposals_per_image, "gt_densepose"): + continue + assert hasattr(proposals_per_image, "gt_boxes") + assert hasattr(proposals_per_image, "proposal_boxes") + gt_boxes = proposals_per_image.gt_boxes + est_boxes = proposals_per_image.proposal_boxes + # apply match threshold for densepose head + iou = matched_boxlist_iou(gt_boxes, est_boxes) + iou_select = iou > self.iou_threshold + proposals_per_image = proposals_per_image[iou_select] + assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes) + # filter out any target without densepose annotation + 
gt_densepose = proposals_per_image.gt_densepose + assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.gt_densepose) + selected_indices = [ + i for i, dp_target in enumerate(gt_densepose) if dp_target is not None + ] + if len(selected_indices) != len(gt_densepose): + proposals_per_image = proposals_per_image[selected_indices] + assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.proposal_boxes) + assert len(proposals_per_image.gt_boxes) == len(proposals_per_image.gt_densepose) + proposals_filtered.append(proposals_per_image) + return proposals_filtered + + +def build_densepose_head(cfg, input_channels): + head_name = cfg.MODEL.ROI_DENSEPOSE_HEAD.NAME + return ROI_DENSEPOSE_HEAD_REGISTRY.get(head_name)(cfg, input_channels) + + +def build_densepose_predictor(cfg, input_channels): + predictor = DensePosePredictor(cfg, input_channels) + return predictor + + +def build_densepose_data_filter(cfg): + dp_filter = DensePoseDataFilter(cfg) + return dp_filter + + +def densepose_inference(densepose_outputs, densepose_confidences, detections): + """ + Infer dense pose estimate based on outputs from the DensePose head + and detections. The estimate for each detection instance is stored in its + "pred_densepose" attribute. + + Args: + densepose_outputs (tuple(`torch.Tensor`)): iterable containing 4 elements: + - s (:obj: `torch.Tensor`): coarse segmentation tensor of size (N, A, H, W), + - i (:obj: `torch.Tensor`): fine segmentation tensor of size (N, C, H, W), + - u (:obj: `torch.Tensor`): U coordinates for each class of size (N, C, H, W), + - v (:obj: `torch.Tensor`): V coordinates for each class of size (N, C, H, W), + where N is the total number of detections in a batch, + A is the number of coarse segmentations labels + (e.g. 15 for coarse body parts + background), + C is the number of fine segmentation labels + (e.g. 25 for fine body parts + background), + W is the resolution along the X axis + H is the resolution along the Y axis + densepose_confidences (tuple(`torch.Tensor`)): iterable containing 4 elements: + - sigma_1 (:obj: `torch.Tensor`): global confidences for UV coordinates + of size (N, C, H, W) + - sigma_2 (:obj: `torch.Tensor`): individual confidences for UV coordinates + of size (N, C, H, W) + - kappa_u (:obj: `torch.Tensor`): first component of confidence direction + vector of size (N, C, H, W) + - kappa_v (:obj: `torch.Tensor`): second component of confidence direction + vector of size (N, C, H, W) + detections (list[Instances]): A list of N Instances, where N is the number of images + in the batch. Instances are modified by this method: "pred_densepose" attribute + is added to each instance, the attribute contains the corresponding + DensePoseOutput object. + """ + # DensePose outputs: segmentation, body part indices, U, V + s, index_uv, u, v = densepose_outputs + sigma_1, sigma_2, kappa_u, kappa_v = densepose_confidences + k = 0 + for detection in detections: + n_i = len(detection) + s_i = s[k : k + n_i] + index_uv_i = index_uv[k : k + n_i] + u_i = u[k : k + n_i] + v_i = v[k : k + n_i] + _local_vars = locals() + confidences = { + name: _local_vars[name] + for name in ("sigma_1", "sigma_2", "kappa_u", "kappa_v") + if _local_vars.get(name) is not None + } + densepose_output_i = DensePoseOutput(s_i, index_uv_i, u_i, v_i, confidences) + detection.pred_densepose = densepose_output_i + k += n_i + + +def _linear_interpolation_utilities(v_norm, v0_src, size_src, v0_dst, size_dst, size_z): + """ + Computes utility values for linear interpolation at points v. 
+ The points are given as normalized offsets in the source interval + (v0_src, v0_src + size_src), more precisely: + v = v0_src + v_norm * size_src / 256.0 + The computed utilities include lower points v_lo, upper points v_hi, + interpolation weights v_w and flags j_valid indicating whether the + points falls into the destination interval (v0_dst, v0_dst + size_dst). + + Args: + v_norm (:obj: `torch.Tensor`): tensor of size N containing + normalized point offsets + v0_src (:obj: `torch.Tensor`): tensor of size N containing + left bounds of source intervals for normalized points + size_src (:obj: `torch.Tensor`): tensor of size N containing + source interval sizes for normalized points + v0_dst (:obj: `torch.Tensor`): tensor of size N containing + left bounds of destination intervals + size_dst (:obj: `torch.Tensor`): tensor of size N containing + destination interval sizes + size_z (int): interval size for data to be interpolated + + Returns: + v_lo (:obj: `torch.Tensor`): int tensor of size N containing + indices of lower values used for interpolation, all values are + integers from [0, size_z - 1] + v_hi (:obj: `torch.Tensor`): int tensor of size N containing + indices of upper values used for interpolation, all values are + integers from [0, size_z - 1] + v_w (:obj: `torch.Tensor`): float tensor of size N containing + interpolation weights + j_valid (:obj: `torch.Tensor`): uint8 tensor of size N containing + 0 for points outside the estimation interval + (v0_est, v0_est + size_est) and 1 otherwise + """ + v = v0_src + v_norm * size_src / 256.0 + j_valid = (v - v0_dst >= 0) * (v - v0_dst < size_dst) + v_grid = (v - v0_dst) * size_z / size_dst + v_lo = v_grid.floor().long().clamp(min=0, max=size_z - 1) + v_hi = (v_lo + 1).clamp(max=size_z - 1) + v_grid = torch.min(v_hi.float(), v_grid) + v_w = v_grid - v_lo.float() + return v_lo, v_hi, v_w, j_valid + + +def _grid_sampling_utilities( + zh, zw, bbox_xywh_est, bbox_xywh_gt, index_gt, x_norm, y_norm, index_bbox +): + """ + Prepare tensors used in grid sampling. + + Args: + z_est (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with estimated + values of Z to be extracted for the points X, Y and channel + indices I + bbox_xywh_est (:obj: `torch.Tensor`): tensor of size (N, 4) containing + estimated bounding boxes in format XYWH + bbox_xywh_gt (:obj: `torch.Tensor`): tensor of size (N, 4) containing + matched ground truth bounding boxes in format XYWH + index_gt (:obj: `torch.Tensor`): tensor of size K with point labels for + ground truth points + x_norm (:obj: `torch.Tensor`): tensor of size K with X normalized + coordinates of ground truth points. Image X coordinates can be + obtained as X = Xbbox + x_norm * Wbbox / 255 + y_norm (:obj: `torch.Tensor`): tensor of size K with Y normalized + coordinates of ground truth points. Image Y coordinates can be + obtained as Y = Ybbox + y_norm * Hbbox / 255 + index_bbox (:obj: `torch.Tensor`): tensor of size K with bounding box + indices for each ground truth point. 
The values are thus in + [0, N-1] + + Returns: + j_valid (:obj: `torch.Tensor`): uint8 tensor of size M containing + 0 for points to be discarded and 1 for points to be selected + y_lo (:obj: `torch.Tensor`): int tensor of indices of upper values + in z_est for each point + y_hi (:obj: `torch.Tensor`): int tensor of indices of lower values + in z_est for each point + x_lo (:obj: `torch.Tensor`): int tensor of indices of left values + in z_est for each point + x_hi (:obj: `torch.Tensor`): int tensor of indices of right values + in z_est for each point + w_ylo_xlo (:obj: `torch.Tensor`): float tensor of size M; + contains upper-left value weight for each point + w_ylo_xhi (:obj: `torch.Tensor`): float tensor of size M; + contains upper-right value weight for each point + w_yhi_xlo (:obj: `torch.Tensor`): float tensor of size M; + contains lower-left value weight for each point + w_yhi_xhi (:obj: `torch.Tensor`): float tensor of size M; + contains lower-right value weight for each point + """ + + x0_gt, y0_gt, w_gt, h_gt = bbox_xywh_gt[index_bbox].unbind(dim=1) + x0_est, y0_est, w_est, h_est = bbox_xywh_est[index_bbox].unbind(dim=1) + x_lo, x_hi, x_w, jx_valid = _linear_interpolation_utilities( + x_norm, x0_gt, w_gt, x0_est, w_est, zw + ) + y_lo, y_hi, y_w, jy_valid = _linear_interpolation_utilities( + y_norm, y0_gt, h_gt, y0_est, h_est, zh + ) + j_valid = jx_valid * jy_valid + + w_ylo_xlo = (1.0 - x_w) * (1.0 - y_w) + w_ylo_xhi = x_w * (1.0 - y_w) + w_yhi_xlo = (1.0 - x_w) * y_w + w_yhi_xhi = x_w * y_w + + return j_valid, y_lo, y_hi, x_lo, x_hi, w_ylo_xlo, w_ylo_xhi, w_yhi_xlo, w_yhi_xhi + + +def _extract_at_points_packed( + z_est, + index_bbox_valid, + slice_index_uv, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, + w_ylo_xhi, + w_yhi_xlo, + w_yhi_xhi, +): + """ + Extract ground truth values z_gt for valid point indices and estimated + values z_est using bilinear interpolation over top-left (y_lo, x_lo), + top-right (y_lo, x_hi), bottom-left (y_hi, x_lo) and bottom-right + (y_hi, x_hi) values in z_est with corresponding weights: + w_ylo_xlo, w_ylo_xhi, w_yhi_xlo and w_yhi_xhi. 
+ Use slice_index_uv to slice dim=1 in z_est + """ + z_est_sampled = ( + z_est[index_bbox_valid, slice_index_uv, y_lo, x_lo] * w_ylo_xlo + + z_est[index_bbox_valid, slice_index_uv, y_lo, x_hi] * w_ylo_xhi + + z_est[index_bbox_valid, slice_index_uv, y_hi, x_lo] * w_yhi_xlo + + z_est[index_bbox_valid, slice_index_uv, y_hi, x_hi] * w_yhi_xhi + ) + return z_est_sampled + + +def _resample_data( + z, bbox_xywh_src, bbox_xywh_dst, wout, hout, mode="nearest", padding_mode="zeros" +): + """ + Args: + z (:obj: `torch.Tensor`): tensor of size (N,C,H,W) with data to be + resampled + bbox_xywh_src (:obj: `torch.Tensor`): tensor of size (N,4) containing + source bounding boxes in format XYWH + bbox_xywh_dst (:obj: `torch.Tensor`): tensor of size (N,4) containing + destination bounding boxes in format XYWH + Return: + zresampled (:obj: `torch.Tensor`): tensor of size (N, C, Hout, Wout) + with resampled values of z, where D is the discretization size + """ + n = bbox_xywh_src.size(0) + assert n == bbox_xywh_dst.size(0), ( + "The number of " + "source ROIs for resampling ({}) should be equal to the number " + "of destination ROIs ({})".format(bbox_xywh_src.size(0), bbox_xywh_dst.size(0)) + ) + x0src, y0src, wsrc, hsrc = bbox_xywh_src.unbind(dim=1) + x0dst, y0dst, wdst, hdst = bbox_xywh_dst.unbind(dim=1) + x0dst_norm = 2 * (x0dst - x0src) / wsrc - 1 + y0dst_norm = 2 * (y0dst - y0src) / hsrc - 1 + x1dst_norm = 2 * (x0dst + wdst - x0src) / wsrc - 1 + y1dst_norm = 2 * (y0dst + hdst - y0src) / hsrc - 1 + grid_w = torch.arange(wout, device=z.device, dtype=torch.float) / wout + grid_h = torch.arange(hout, device=z.device, dtype=torch.float) / hout + grid_w_expanded = grid_w[None, None, :].expand(n, hout, wout) + grid_h_expanded = grid_h[None, :, None].expand(n, hout, wout) + dx_expanded = (x1dst_norm - x0dst_norm)[:, None, None].expand(n, hout, wout) + dy_expanded = (y1dst_norm - y0dst_norm)[:, None, None].expand(n, hout, wout) + x0_expanded = x0dst_norm[:, None, None].expand(n, hout, wout) + y0_expanded = y0dst_norm[:, None, None].expand(n, hout, wout) + grid_x = grid_w_expanded * dx_expanded + x0_expanded + grid_y = grid_h_expanded * dy_expanded + y0_expanded + grid = torch.stack((grid_x, grid_y), dim=3) + # resample Z from (N, C, H, W) into (N, C, Hout, Wout) + zresampled = F.grid_sample(z, grid, mode=mode, padding_mode=padding_mode, align_corners=True) + return zresampled + + +def _extract_single_tensors_from_matches_one_image( + proposals_targets, bbox_with_dp_offset, bbox_global_offset +): + i_gt_all = [] + x_norm_all = [] + y_norm_all = [] + u_gt_all = [] + v_gt_all = [] + s_gt_all = [] + bbox_xywh_gt_all = [] + bbox_xywh_est_all = [] + # Ibbox_all == k should be true for all data that corresponds + # to bbox_xywh_gt[k] and bbox_xywh_est[k] + # index k here is global wrt images + i_bbox_all = [] + # at offset k (k is global) contains index of bounding box data + # within densepose output tensor + i_with_dp = [] + + boxes_xywh_est = proposals_targets.proposal_boxes.clone() + boxes_xywh_gt = proposals_targets.gt_boxes.clone() + n_i = len(boxes_xywh_est) + assert n_i == len(boxes_xywh_gt) + + if n_i: + boxes_xywh_est.tensor[:, 2] -= boxes_xywh_est.tensor[:, 0] + boxes_xywh_est.tensor[:, 3] -= boxes_xywh_est.tensor[:, 1] + boxes_xywh_gt.tensor[:, 2] -= boxes_xywh_gt.tensor[:, 0] + boxes_xywh_gt.tensor[:, 3] -= boxes_xywh_gt.tensor[:, 1] + if hasattr(proposals_targets, "gt_densepose"): + densepose_gt = proposals_targets.gt_densepose + for k, box_xywh_est, box_xywh_gt, dp_gt in zip( + range(n_i), 
boxes_xywh_est.tensor, boxes_xywh_gt.tensor, densepose_gt + ): + if (dp_gt is not None) and (len(dp_gt.x) > 0): + i_gt_all.append(dp_gt.i) + x_norm_all.append(dp_gt.x) + y_norm_all.append(dp_gt.y) + u_gt_all.append(dp_gt.u) + v_gt_all.append(dp_gt.v) + s_gt_all.append(dp_gt.segm.unsqueeze(0)) + bbox_xywh_gt_all.append(box_xywh_gt.view(-1, 4)) + bbox_xywh_est_all.append(box_xywh_est.view(-1, 4)) + i_bbox_k = torch.full_like(dp_gt.i, bbox_with_dp_offset + len(i_with_dp)) + i_bbox_all.append(i_bbox_k) + i_with_dp.append(bbox_global_offset + k) + return ( + i_gt_all, + x_norm_all, + y_norm_all, + u_gt_all, + v_gt_all, + s_gt_all, + bbox_xywh_gt_all, + bbox_xywh_est_all, + i_bbox_all, + i_with_dp, + ) + + +def _extract_single_tensors_from_matches(proposals_with_targets): + i_img = [] + i_gt_all = [] + x_norm_all = [] + y_norm_all = [] + u_gt_all = [] + v_gt_all = [] + s_gt_all = [] + bbox_xywh_gt_all = [] + bbox_xywh_est_all = [] + i_bbox_all = [] + i_with_dp_all = [] + n = 0 + for i, proposals_targets_per_image in enumerate(proposals_with_targets): + n_i = proposals_targets_per_image.proposal_boxes.tensor.size(0) + if not n_i: + continue + ( + i_gt_img, + x_norm_img, + y_norm_img, + u_gt_img, + v_gt_img, + s_gt_img, + bbox_xywh_gt_img, + bbox_xywh_est_img, + i_bbox_img, + i_with_dp_img, + ) = _extract_single_tensors_from_matches_one_image( # noqa + proposals_targets_per_image, len(i_with_dp_all), n + ) + i_gt_all.extend(i_gt_img) + x_norm_all.extend(x_norm_img) + y_norm_all.extend(y_norm_img) + u_gt_all.extend(u_gt_img) + v_gt_all.extend(v_gt_img) + s_gt_all.extend(s_gt_img) + bbox_xywh_gt_all.extend(bbox_xywh_gt_img) + bbox_xywh_est_all.extend(bbox_xywh_est_img) + i_bbox_all.extend(i_bbox_img) + i_with_dp_all.extend(i_with_dp_img) + i_img.extend([i] * len(i_with_dp_img)) + n += n_i + # concatenate all data into a single tensor + if (n > 0) and (len(i_with_dp_all) > 0): + i_gt = torch.cat(i_gt_all, 0).long() + x_norm = torch.cat(x_norm_all, 0) + y_norm = torch.cat(y_norm_all, 0) + u_gt = torch.cat(u_gt_all, 0) + v_gt = torch.cat(v_gt_all, 0) + s_gt = torch.cat(s_gt_all, 0) + bbox_xywh_gt = torch.cat(bbox_xywh_gt_all, 0) + bbox_xywh_est = torch.cat(bbox_xywh_est_all, 0) + i_bbox = torch.cat(i_bbox_all, 0).long() + else: + i_gt = None + x_norm = None + y_norm = None + u_gt = None + v_gt = None + s_gt = None + bbox_xywh_gt = None + bbox_xywh_est = None + i_bbox = None + return ( + i_img, + i_with_dp_all, + bbox_xywh_est, + bbox_xywh_gt, + i_gt, + x_norm, + y_norm, + u_gt, + v_gt, + s_gt, + i_bbox, + ) + + +class IIDIsotropicGaussianUVLoss(nn.Module): + """ + Loss for the case of iid residuals with isotropic covariance: + $Sigma_i = sigma_i^2 I$ + The loss (negative log likelihood) is then: + $1/2 sum_{i=1}^n (log(2 pi) + 2 log sigma_i^2 + ||delta_i||^2 / sigma_i^2)$, + where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates + difference between estimated and ground truth UV values + For details, see: + N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning + Dense Correspondences from Noisy Labels", p. 918--926, in Proc. 
NIPS 2019 + """ + + def __init__(self, sigma_lower_bound: float): + super(IIDIsotropicGaussianUVLoss, self).__init__() + self.sigma_lower_bound = sigma_lower_bound + self.log2pi = math.log(2 * math.pi) + + def forward( + self, + u: torch.Tensor, + v: torch.Tensor, + sigma_u: torch.Tensor, + target_u: torch.Tensor, + target_v: torch.Tensor, + ): + # compute $\sigma_i^2$ + # use sigma_lower_bound to avoid degenerate solution for variance + # (sigma -> 0) + sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound + # compute \|delta_i\|^2 + delta_t_delta = (u - target_u) ** 2 + (v - target_v) ** 2 + # the total loss from the formula above: + loss = 0.5 * (self.log2pi + 2 * torch.log(sigma2) + delta_t_delta / sigma2) + return loss.sum() + + +class IndepAnisotropicGaussianUVLoss(nn.Module): + """ + Loss for the case of independent residuals with anisotropic covariances: + $Sigma_i = sigma_i^2 I + r_i r_i^T$ + The loss (negative log likelihood) is then: + $1/2 sum_{i=1}^n (log(2 pi) + + log sigma_i^2 (sigma_i^2 + ||r_i||^2) + + ||delta_i||^2 / sigma_i^2 + - ^2 / (sigma_i^2 * (sigma_i^2 + ||r_i||^2)))$, + where $delta_i=(u - u', v - v')$ is a 2D vector containing UV coordinates + difference between estimated and ground truth UV values + For details, see: + N. Neverova, D. Novotny, A. Vedaldi "Correlated Uncertainty for Learning + Dense Correspondences from Noisy Labels", p. 918--926, in Proc. NIPS 2019 + """ + + def __init__(self, sigma_lower_bound: float): + super(IndepAnisotropicGaussianUVLoss, self).__init__() + self.sigma_lower_bound = sigma_lower_bound + self.log2pi = math.log(2 * math.pi) + + def forward( + self, + u: torch.Tensor, + v: torch.Tensor, + sigma_u: torch.Tensor, + kappa_u_est: torch.Tensor, + kappa_v_est: torch.Tensor, + target_u: torch.Tensor, + target_v: torch.Tensor, + ): + # compute $\sigma_i^2$ + sigma2 = F.softplus(sigma_u) + self.sigma_lower_bound + # compute \|r_i\|^2 + r_sqnorm2 = kappa_u_est ** 2 + kappa_v_est ** 2 + delta_u = u - target_u + delta_v = v - target_v + # compute \|delta_i\|^2 + delta_sqnorm = delta_u ** 2 + delta_v ** 2 + delta_u_r_u = delta_u * kappa_u_est + delta_v_r_v = delta_v * kappa_v_est + # compute the scalar product + delta_r = delta_u_r_u + delta_v_r_v + # compute squared scalar product ^2 + delta_r_sqnorm = delta_r ** 2 + denom2 = sigma2 * (sigma2 + r_sqnorm2) + loss = 0.5 * ( + self.log2pi + torch.log(denom2) + delta_sqnorm / sigma2 - delta_r_sqnorm / denom2 + ) + return loss.sum() + + +class DensePoseLosses(object): + def __init__(self, cfg): + # fmt: off + self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE + self.w_points = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS + self.w_part = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS + self.w_segm = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS + self.n_segm_chan = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS + # fmt: on + self.confidence_model_cfg = DensePoseConfidenceModelConfig.from_cfg(cfg) + if self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.IID_ISO: + self.uv_loss_with_confidences = IIDIsotropicGaussianUVLoss( + self.confidence_model_cfg.uv_confidence.epsilon + ) + elif self.confidence_model_cfg.uv_confidence.type == DensePoseUVConfidenceType.INDEP_ANISO: + self.uv_loss_with_confidences = IndepAnisotropicGaussianUVLoss( + self.confidence_model_cfg.uv_confidence.epsilon + ) + + def __call__(self, proposals_with_gt, densepose_outputs, densepose_confidences): + losses = {} + # densepose outputs are computed for all images and all bounding boxes; + 
# i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively, + # the outputs will have size(0) == 3+1+2+1 == 7 + s, index_uv, u, v = densepose_outputs + sigma_1, sigma_2, kappa_u, kappa_v = densepose_confidences + conf_type = self.confidence_model_cfg.uv_confidence.type + assert u.size(2) == v.size(2) + assert u.size(3) == v.size(3) + assert u.size(2) == index_uv.size(2) + assert u.size(3) == index_uv.size(3) + + with torch.no_grad(): + ( + index_uv_img, + i_with_dp, + bbox_xywh_est, + bbox_xywh_gt, + index_gt_all, + x_norm, + y_norm, + u_gt_all, + v_gt_all, + s_gt, + index_bbox, + ) = _extract_single_tensors_from_matches( # noqa + proposals_with_gt + ) + n_batch = len(i_with_dp) + + # NOTE: we need to keep the same computation graph on all the GPUs to + # perform reduction properly. Hence even if we have no data on one + # of the GPUs, we still need to generate the computation graph. + # Add fake (zero) loss in the form Tensor.sum() * 0 + if not n_batch: + losses["loss_densepose_I"] = index_uv.sum() * 0 + losses["loss_densepose_S"] = s.sum() * 0 + if self.confidence_model_cfg.uv_confidence.enabled: + losses["loss_densepose_UV"] = (u.sum() + v.sum()) * 0 + if conf_type == DensePoseUVConfidenceType.IID_ISO: + losses["loss_densepose_UV"] += sigma_2.sum() * 0 + elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO: + losses["loss_densepose_UV"] += ( + sigma_2.sum() + kappa_u.sum() + kappa_v.sum() + ) * 0 + else: + losses["loss_densepose_U"] = u.sum() * 0 + losses["loss_densepose_V"] = v.sum() * 0 + return losses + + zh = u.size(2) + zw = u.size(3) + + ( + j_valid, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, + w_ylo_xhi, + w_yhi_xlo, + w_yhi_xhi, + ) = _grid_sampling_utilities( # noqa + zh, zw, bbox_xywh_est, bbox_xywh_gt, index_gt_all, x_norm, y_norm, index_bbox + ) + + j_valid_fg = j_valid * (index_gt_all > 0) + + u_gt = u_gt_all[j_valid_fg] + u_est_all = _extract_at_points_packed( + u[i_with_dp], + index_bbox, + index_gt_all, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, + w_ylo_xhi, + w_yhi_xlo, + w_yhi_xhi, + ) + u_est = u_est_all[j_valid_fg] + + v_gt = v_gt_all[j_valid_fg] + v_est_all = _extract_at_points_packed( + v[i_with_dp], + index_bbox, + index_gt_all, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, + w_ylo_xhi, + w_yhi_xlo, + w_yhi_xhi, + ) + v_est = v_est_all[j_valid_fg] + + index_uv_gt = index_gt_all[j_valid] + index_uv_est_all = _extract_at_points_packed( + index_uv[i_with_dp], + index_bbox, + slice(None), + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo[:, None], + w_ylo_xhi[:, None], + w_yhi_xlo[:, None], + w_yhi_xhi[:, None], + ) + index_uv_est = index_uv_est_all[j_valid, :] + + if self.confidence_model_cfg.uv_confidence.enabled: + sigma_2_est_all = _extract_at_points_packed( + sigma_2[i_with_dp], + index_bbox, + index_gt_all, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, + w_ylo_xhi, + w_yhi_xlo, + w_yhi_xhi, + ) + sigma_2_est = sigma_2_est_all[j_valid_fg] + if conf_type in [DensePoseUVConfidenceType.INDEP_ANISO]: + kappa_u_est_all = _extract_at_points_packed( + kappa_u[i_with_dp], + index_bbox, + index_gt_all, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, + w_ylo_xhi, + w_yhi_xlo, + w_yhi_xhi, + ) + kappa_u_est = kappa_u_est_all[j_valid_fg] + kappa_v_est_all = _extract_at_points_packed( + kappa_v[i_with_dp], + index_bbox, + index_gt_all, + y_lo, + y_hi, + x_lo, + x_hi, + w_ylo_xlo, + w_ylo_xhi, + w_yhi_xlo, + w_yhi_xhi, + ) + kappa_v_est = kappa_v_est_all[j_valid_fg] + + # Resample everything to the estimated data size, no need to resample + # S_est then: + s_est = 
s[i_with_dp] + with torch.no_grad(): + s_gt = _resample_data( + s_gt.unsqueeze(1), + bbox_xywh_gt, + bbox_xywh_est, + self.heatmap_size, + self.heatmap_size, + mode="nearest", + padding_mode="zeros", + ).squeeze(1) + + # add point-based losses: + if self.confidence_model_cfg.uv_confidence.enabled: + if conf_type == DensePoseUVConfidenceType.IID_ISO: + uv_loss = ( + self.uv_loss_with_confidences(u_est, v_est, sigma_2_est, u_gt, v_gt) + * self.w_points + ) + losses["loss_densepose_UV"] = uv_loss + elif conf_type == DensePoseUVConfidenceType.INDEP_ANISO: + uv_loss = ( + self.uv_loss_with_confidences( + u_est, v_est, sigma_2_est, kappa_u_est, kappa_v_est, u_gt, v_gt + ) + * self.w_points + ) + losses["loss_densepose_UV"] = uv_loss + else: + raise ValueError(f"Unknown confidence model type: {conf_type}") + else: + u_loss = F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points + losses["loss_densepose_U"] = u_loss + v_loss = F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points + losses["loss_densepose_V"] = v_loss + index_uv_loss = F.cross_entropy(index_uv_est, index_uv_gt.long()) * self.w_part + losses["loss_densepose_I"] = index_uv_loss + + if self.n_segm_chan == 2: + s_gt = s_gt > 0 + s_loss = F.cross_entropy(s_est, s_gt.long()) * self.w_segm + losses["loss_densepose_S"] = s_loss + return losses + + +def build_densepose_losses(cfg): + losses = DensePoseLosses(cfg) + return losses diff --git a/projects/DensePose/densepose/evaluator.py b/projects/DensePose/densepose/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..3bb002b5093365f12edf5f4610ab261491d12bc8 --- /dev/null +++ b/projects/DensePose/densepose/evaluator.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import contextlib +import copy +import io +import itertools +import json +import logging +import os +from collections import OrderedDict +import torch +from fvcore.common.file_io import PathManager +from pycocotools.coco import COCO + +from detectron2.data import MetadataCatalog +from detectron2.evaluation import DatasetEvaluator +from detectron2.structures import BoxMode +from detectron2.utils.comm import all_gather, is_main_process, synchronize +from detectron2.utils.logger import create_small_table + +from .densepose_coco_evaluation import DensePoseCocoEval, DensePoseEvalMode + + +class DensePoseCOCOEvaluator(DatasetEvaluator): + def __init__(self, dataset_name, distributed, output_dir=None): + self._distributed = distributed + self._output_dir = output_dir + + self._cpu_device = torch.device("cpu") + self._logger = logging.getLogger(__name__) + + self._metadata = MetadataCatalog.get(dataset_name) + json_file = PathManager.get_local_path(self._metadata.json_file) + with contextlib.redirect_stdout(io.StringIO()): + self._coco_api = COCO(json_file) + + def reset(self): + self._predictions = [] + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). + It is a list of dict. Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a COCO model. It is a list of dicts with key + "instances" that contains :class:`Instances`. + The :class:`Instances` object needs to have `densepose` field. 
+ """ + for input, output in zip(inputs, outputs): + instances = output["instances"].to(self._cpu_device) + + boxes = instances.pred_boxes.tensor.clone() + boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + instances.pred_densepose = instances.pred_densepose.to_result(boxes) + + json_results = prediction_to_json(instances, input["image_id"]) + self._predictions.extend(json_results) + + def evaluate(self): + if self._distributed: + synchronize() + predictions = all_gather(self._predictions) + predictions = list(itertools.chain(*predictions)) + if not is_main_process(): + return + else: + predictions = self._predictions + + return copy.deepcopy(self._eval_predictions(predictions)) + + def _eval_predictions(self, predictions): + """ + Evaluate predictions on densepose. + Return results with the metrics of the tasks. + """ + self._logger.info("Preparing results for COCO format ...") + + if self._output_dir: + file_path = os.path.join(self._output_dir, "coco_densepose_results.json") + with open(file_path, "w") as f: + json.dump(predictions, f) + f.flush() + os.fsync(f.fileno()) + + self._logger.info("Evaluating predictions ...") + res = OrderedDict() + results_gps, results_gpsm = _evaluate_predictions_on_coco(self._coco_api, predictions) + res["densepose_gps"] = results_gps + res["densepose_gpsm"] = results_gpsm + return res + + +def prediction_to_json(instances, img_id): + """ + Args: + instances (Instances): the output of the model + img_id (str): the image id in COCO + + Returns: + list[dict]: the results in densepose evaluation format + """ + scores = instances.scores.tolist() + + results = [] + for k in range(len(instances)): + densepose = instances.pred_densepose[k] + result = { + "image_id": img_id, + "category_id": 1, # densepose only has one class + "bbox": densepose[1], + "score": scores[k], + "densepose": densepose, + } + results.append(result) + return results + + +def _evaluate_predictions_on_coco(coco_gt, coco_results): + metrics = ["AP", "AP50", "AP75", "APm", "APl"] + + logger = logging.getLogger(__name__) + + if len(coco_results) == 0: # cocoapi does not handle empty results very well + logger.warn("No predictions from the model! 
Set scores to -1") + results_gps = {metric: -1 for metric in metrics} + results_gpsm = {metric: -1 for metric in metrics} + return results_gps, results_gpsm + + coco_dt = coco_gt.loadRes(coco_results) + results_gps = _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics) + logger.info( + "Evaluation results for densepose, GPS metric: \n" + create_small_table(results_gps) + ) + results_gpsm = _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics) + logger.info( + "Evaluation results for densepose, GPSm metric: \n" + create_small_table(results_gpsm) + ) + return results_gps, results_gpsm + + +def _evaluate_predictions_on_coco_gps(coco_gt, coco_dt, metrics): + coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPS) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)} + return results + + +def _evaluate_predictions_on_coco_gpsm(coco_gt, coco_dt, metrics): + coco_eval = DensePoseCocoEval(coco_gt, coco_dt, "densepose", dpEvalMode=DensePoseEvalMode.GPSM) + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + results = {metric: float(coco_eval.stats[idx] * 100) for idx, metric in enumerate(metrics)} + return results diff --git a/projects/DensePose/densepose/modeling/test_time_augmentation.py b/projects/DensePose/densepose/modeling/test_time_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..fcf69db1b6e4c687bc4e284e2795cab61ebf043f --- /dev/null +++ b/projects/DensePose/densepose/modeling/test_time_augmentation.py @@ -0,0 +1,75 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from detectron2.modeling.test_time_augmentation import GeneralizedRCNNWithTTA + + +class DensePoseGeneralizedRCNNWithTTA(GeneralizedRCNNWithTTA): + def __init__(self, cfg, model, transform_data, tta_mapper=None, batch_size=1): + """ + Args: + cfg (CfgNode): + model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on. + transform_data (DensePoseTransformData): contains symmetry label + transforms used for horizontal flip + tta_mapper (callable): takes a dataset dict and returns a list of + augmented versions of the dataset dict. Defaults to + `DatasetMapperTTA(cfg)`. + batch_size (int): batch the augmented images into this batch size for inference. 
+ """ + self._transform_data = transform_data + super().__init__(cfg=cfg, model=model, tta_mapper=tta_mapper, batch_size=batch_size) + + # the implementation follows closely the one from detectron2/modeling + def _inference_one_image(self, input): + """ + Args: + input (dict): one dataset dict + + Returns: + dict: one output dict + """ + + augmented_inputs, aug_vars = self._get_augmented_inputs(input) + # Detect boxes from all augmented versions + with self._turn_off_roi_heads(["mask_on", "keypoint_on", "densepose_on"]): + # temporarily disable roi heads + all_boxes, all_scores, all_classes = self._get_augmented_boxes( + augmented_inputs, aug_vars + ) + merged_instances = self._merge_detections( + all_boxes, all_scores, all_classes, (aug_vars["height"], aug_vars["width"]) + ) + + if self.cfg.MODEL.MASK_ON or self.cfg.MODEL.DENSEPOSE_ON: + # Use the detected boxes to obtain new fields + augmented_instances = self._rescale_detected_boxes( + augmented_inputs, merged_instances, aug_vars + ) + # run forward on the detected boxes + outputs = self._batch_inference( + augmented_inputs, augmented_instances, do_postprocess=False + ) + # Delete now useless variables to avoid being out of memory + del augmented_inputs, augmented_instances, merged_instances + # average the predictions + if self.cfg.MODEL.MASK_ON: + outputs[0].pred_masks = self._reduce_pred_masks(outputs, aug_vars) + if self.cfg.MODEL.DENSEPOSE_ON: + outputs[0].pred_densepose = self._reduce_pred_densepose(outputs, aug_vars) + # postprocess + output = self._detector_postprocess(outputs[0], aug_vars) + return {"instances": output} + else: + return {"instances": merged_instances} + + def _reduce_pred_densepose(self, outputs, aug_vars): + for idx, output in enumerate(outputs): + if aug_vars["do_hflip"][idx]: + output.pred_densepose.hflip(self._transform_data) + # Less memory-intensive averaging + for attr in "SIUV": + setattr( + outputs[0].pred_densepose, + attr, + sum(getattr(o.pred_densepose, attr) for o in outputs) / len(outputs), + ) + return outputs[0].pred_densepose diff --git a/projects/DensePose/densepose/roi_head.py b/projects/DensePose/densepose/roi_head.py new file mode 100644 index 0000000000000000000000000000000000000000..023119760b77cf5294ed18292e77e7f495099770 --- /dev/null +++ b/projects/DensePose/densepose/roi_head.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import numpy as np +from typing import Dict +import fvcore.nn.weight_init as weight_init +import torch +import torch.nn as nn +from torch.nn import functional as F + +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads +from detectron2.modeling.poolers import ROIPooler +from detectron2.modeling.roi_heads import select_foreground_proposals + +from .densepose_head import ( + build_densepose_data_filter, + build_densepose_head, + build_densepose_losses, + build_densepose_predictor, + densepose_inference, +) + + +class Decoder(nn.Module): + """ + A semantic segmentation head described in detail in the Panoptic Feature Pyramid Networks paper + (https://arxiv.org/abs/1901.02446). It takes FPN features as input and merges information from + all levels of the FPN into single output. 
+ """ + + def __init__(self, cfg, input_shape: Dict[str, ShapeSpec], in_features): + super(Decoder, self).__init__() + + # fmt: off + self.in_features = in_features + feature_strides = {k: v.stride for k, v in input_shape.items()} + feature_channels = {k: v.channels for k, v in input_shape.items()} + num_classes = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NUM_CLASSES + conv_dims = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_CONV_DIMS + self.common_stride = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_COMMON_STRIDE + norm = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_NORM + # fmt: on + + self.scale_heads = [] + for in_feature in self.in_features: + head_ops = [] + head_length = max( + 1, int(np.log2(feature_strides[in_feature]) - np.log2(self.common_stride)) + ) + for k in range(head_length): + conv = Conv2d( + feature_channels[in_feature] if k == 0 else conv_dims, + conv_dims, + kernel_size=3, + stride=1, + padding=1, + bias=not norm, + norm=get_norm(norm, conv_dims), + activation=F.relu, + ) + weight_init.c2_msra_fill(conv) + head_ops.append(conv) + if feature_strides[in_feature] != self.common_stride: + head_ops.append( + nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) + ) + self.scale_heads.append(nn.Sequential(*head_ops)) + self.add_module(in_feature, self.scale_heads[-1]) + self.predictor = Conv2d(conv_dims, num_classes, kernel_size=1, stride=1, padding=0) + weight_init.c2_msra_fill(self.predictor) + + def forward(self, features): + for i, _ in enumerate(self.in_features): + if i == 0: + x = self.scale_heads[i](features[i]) + else: + x = x + self.scale_heads[i](features[i]) + x = self.predictor(x) + return x + + +@ROI_HEADS_REGISTRY.register() +class DensePoseROIHeads(StandardROIHeads): + """ + A Standard ROIHeads which contains an addition of DensePose head. + """ + + def __init__(self, cfg, input_shape): + super().__init__(cfg, input_shape) + self._init_densepose_head(cfg, input_shape) + + def _init_densepose_head(self, cfg, input_shape): + # fmt: off + self.densepose_on = cfg.MODEL.DENSEPOSE_ON + if not self.densepose_on: + return + self.densepose_data_filter = build_densepose_data_filter(cfg) + dp_pooler_resolution = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_RESOLUTION + dp_pooler_sampling_ratio = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_SAMPLING_RATIO + dp_pooler_type = cfg.MODEL.ROI_DENSEPOSE_HEAD.POOLER_TYPE + self.use_decoder = cfg.MODEL.ROI_DENSEPOSE_HEAD.DECODER_ON + # fmt: on + if self.use_decoder: + dp_pooler_scales = (1.0 / input_shape[self.in_features[0]].stride,) + else: + dp_pooler_scales = tuple(1.0 / input_shape[k].stride for k in self.in_features) + in_channels = [input_shape[f].channels for f in self.in_features][0] + + if self.use_decoder: + self.decoder = Decoder(cfg, input_shape, self.in_features) + + self.densepose_pooler = ROIPooler( + output_size=dp_pooler_resolution, + scales=dp_pooler_scales, + sampling_ratio=dp_pooler_sampling_ratio, + pooler_type=dp_pooler_type, + ) + self.densepose_head = build_densepose_head(cfg, in_channels) + self.densepose_predictor = build_densepose_predictor( + cfg, self.densepose_head.n_out_channels + ) + self.densepose_losses = build_densepose_losses(cfg) + + def _forward_densepose(self, features, instances): + """ + Forward logic of the densepose prediction branch. + + Args: + features (list[Tensor]): #level input features for densepose prediction + instances (list[Instances]): the per-image instances to train/predict densepose. + In training, they can be the proposals. + In inference, they can be the predicted boxes. 
+ + Returns: + In training, a dict of losses. + In inference, update `instances` with new fields "densepose" and return it. + """ + if not self.densepose_on: + return {} if self.training else instances + + features = [features[f] for f in self.in_features] + if self.training: + proposals, _ = select_foreground_proposals(instances, self.num_classes) + proposals_dp = self.densepose_data_filter(proposals) + if len(proposals_dp) > 0: + # NOTE may deadlock in DDP if certain workers have empty proposals_dp + proposal_boxes = [x.proposal_boxes for x in proposals_dp] + + if self.use_decoder: + features = [self.decoder(features)] + + features_dp = self.densepose_pooler(features, proposal_boxes) + densepose_head_outputs = self.densepose_head(features_dp) + densepose_outputs, _, confidences, _ = self.densepose_predictor( + densepose_head_outputs + ) + densepose_loss_dict = self.densepose_losses( + proposals_dp, densepose_outputs, confidences + ) + return densepose_loss_dict + else: + pred_boxes = [x.pred_boxes for x in instances] + + if self.use_decoder: + features = [self.decoder(features)] + + features_dp = self.densepose_pooler(features, pred_boxes) + if len(features_dp) > 0: + densepose_head_outputs = self.densepose_head(features_dp) + densepose_outputs, _, confidences, _ = self.densepose_predictor( + densepose_head_outputs + ) + else: + # If no detection occurred instances + # set densepose_outputs to empty tensors + empty_tensor = torch.zeros(size=(0, 0, 0, 0), device=features_dp.device) + densepose_outputs = tuple([empty_tensor] * 4) + confidences = tuple([empty_tensor] * 4) + + densepose_inference(densepose_outputs, confidences, instances) + return instances + + def forward(self, images, features, proposals, targets=None): + instances, losses = super().forward(images, features, proposals, targets) + del targets, images + + if self.training: + losses.update(self._forward_densepose(features, instances)) + return instances, losses + + def forward_with_given_boxes(self, features, instances): + """ + Use the given boxes in `instances` to produce other (non-box) per-ROI outputs. + + This is useful for downstream tasks where a box is known, but need to obtain + other attributes (outputs of other heads). + Test-time augmentation also uses this. + + Args: + features: same as in `forward()` + instances (list[Instances]): instances to predict other outputs. Expect the keys + "pred_boxes" and "pred_classes" to exist. + + Returns: + instances (list[Instances]): + the same `Instances` objects, with extra + fields such as `pred_masks` or `pred_keypoints`. + """ + + instances = super().forward_with_given_boxes(features, instances) + instances = self._forward_densepose(features, instances) + return instances diff --git a/projects/DensePose/densepose/utils/dbhelper.py b/projects/DensePose/densepose/utils/dbhelper.py new file mode 100644 index 0000000000000000000000000000000000000000..b28862cdede26c13200d928118d5bc5c00e3d2aa --- /dev/null +++ b/projects/DensePose/densepose/utils/dbhelper.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from typing import Any, Dict, Optional, Tuple + + +class EntrySelector(object): + """ + Base class for entry selectors + """ + + @staticmethod + def from_string(spec: str) -> "EntrySelector": + if spec == "*": + return AllEntrySelector() + return FieldEntrySelector(spec) + + +class AllEntrySelector(EntrySelector): + """ + Selector that accepts all entries + """ + + SPECIFIER = "*" + + def __call__(self, entry): + return True + + +class FieldEntrySelector(EntrySelector): + """ + Selector that accepts only entries that match provided field + specifier(s). Only a limited set of specifiers is supported for now: + ::=[] + ::=[] + is a valid identifier + ::= "int" | "str" + ::= "=" + ::= "," + ::= ":" + ::= | + ::= + ::= "-" + is a string without spaces and special symbols + (e.g. , , , ) + """ + + _SPEC_DELIM = "," + _TYPE_DELIM = ":" + _RANGE_DELIM = "-" + _EQUAL = "=" + _ERROR_PREFIX = "Invalid field selector specifier" + + class _FieldEntryValuePredicate(object): + """ + Predicate that checks strict equality for the specified entry field + """ + + def __init__(self, name: str, typespec: str, value: str): + import builtins + + self.name = name + self.type = getattr(builtins, typespec) if typespec is not None else str + self.value = value + + def __call__(self, entry): + return entry[self.name] == self.type(self.value) + + class _FieldEntryRangePredicate(object): + """ + Predicate that checks whether an entry field falls into the specified range + """ + + def __init__(self, name: str, typespec: str, vmin: str, vmax: str): + import builtins + + self.name = name + self.type = getattr(builtins, typespec) if typespec is not None else str + self.vmin = vmin + self.vmax = vmax + + def __call__(self, entry): + return (entry[self.name] >= self.type(self.vmin)) and ( + entry[self.name] <= self.type(self.vmax) + ) + + def __init__(self, spec: str): + self._predicates = self._parse_specifier_into_predicates(spec) + + def __call__(self, entry: Dict[str, Any]): + for predicate in self._predicates: + if not predicate(entry): + return False + return True + + def _parse_specifier_into_predicates(self, spec: str): + predicates = [] + specs = spec.split(self._SPEC_DELIM) + for subspec in specs: + eq_idx = subspec.find(self._EQUAL) + if eq_idx > 0: + field_name_with_type = subspec[:eq_idx] + field_name, field_type = self._parse_field_name_type(field_name_with_type) + field_value_or_range = subspec[eq_idx + 1 :] + if self._is_range_spec(field_value_or_range): + vmin, vmax = self._get_range_spec(field_value_or_range) + predicate = FieldEntrySelector._FieldEntryRangePredicate( + field_name, field_type, vmin, vmax + ) + else: + predicate = FieldEntrySelector._FieldEntryValuePredicate( + field_name, field_type, field_value_or_range + ) + predicates.append(predicate) + elif eq_idx == 0: + self._parse_error(f'"{subspec}", field name is empty!') + else: + self._parse_error(f'"{subspec}", should have format ' "=!") + return predicates + + def _parse_field_name_type(self, field_name_with_type: str) -> Tuple[str, Optional[str]]: + type_delim_idx = field_name_with_type.find(self._TYPE_DELIM) + if type_delim_idx > 0: + field_name = field_name_with_type[:type_delim_idx] + field_type = field_name_with_type[type_delim_idx + 1 :] + elif type_delim_idx == 0: + self._parse_error(f'"{field_name_with_type}", field name is empty!') + else: + field_name = field_name_with_type + field_type = None + return field_name, field_type + + def _is_range_spec(self, field_value_or_range): + delim_idx = 
field_value_or_range.find(self._RANGE_DELIM) + return delim_idx > 0 + + def _get_range_spec(self, field_value_or_range): + if self._is_range_spec(field_value_or_range): + delim_idx = field_value_or_range.find(self._RANGE_DELIM) + vmin = field_value_or_range[:delim_idx] + vmax = field_value_or_range[delim_idx + 1 :] + return vmin, vmax + else: + self._parse_error('"field_value_or_range", range of values expected!') + + def _parse_error(self, msg): + raise ValueError(f"{self._ERROR_PREFIX}: {msg}") diff --git a/projects/DensePose/densepose/utils/logger.py b/projects/DensePose/densepose/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..e3fa45e0c0218bdd2e79c08b0d8ff83abc3e4308 --- /dev/null +++ b/projects/DensePose/densepose/utils/logger.py @@ -0,0 +1,13 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging + + +def verbosity_to_level(verbosity): + if verbosity is not None: + if verbosity == 0: + return logging.WARNING + elif verbosity == 1: + return logging.INFO + elif verbosity >= 2: + return logging.DEBUG + return logging.WARNING diff --git a/projects/DensePose/densepose/utils/transform.py b/projects/DensePose/densepose/utils/transform.py new file mode 100644 index 0000000000000000000000000000000000000000..b7cfe097234dbd3ff19b84ecdfb63fd8bf5fd4b6 --- /dev/null +++ b/projects/DensePose/densepose/utils/transform.py @@ -0,0 +1,16 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from fvcore.common.file_io import PathManager + +from detectron2.data import MetadataCatalog + +from densepose import DensePoseTransformData + + +def load_for_dataset(dataset_name): + path = MetadataCatalog.get(dataset_name).densepose_transform_src + densepose_transform_data_fpath = PathManager.get_local_path(path) + return DensePoseTransformData.load(densepose_transform_data_fpath) + + +def load_from_cfg(cfg): + return load_for_dataset(cfg.DATASETS.TEST[0]) diff --git a/projects/DensePose/densepose/vis/base.py b/projects/DensePose/densepose/vis/base.py new file mode 100644 index 0000000000000000000000000000000000000000..2aa3e6e9f44ae2ce888f6e24dd11c8428734417b --- /dev/null +++ b/projects/DensePose/densepose/vis/base.py @@ -0,0 +1,191 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import numpy as np +import cv2 +import torch + +Image = np.ndarray +Boxes = torch.Tensor + + +class MatrixVisualizer(object): + """ + Base visualizer for matrix data + """ + + def __init__( + self, + inplace=True, + cmap=cv2.COLORMAP_PARULA, + val_scale=1.0, + alpha=0.7, + interp_method_matrix=cv2.INTER_LINEAR, + interp_method_mask=cv2.INTER_NEAREST, + ): + self.inplace = inplace + self.cmap = cmap + self.val_scale = val_scale + self.alpha = alpha + self.interp_method_matrix = interp_method_matrix + self.interp_method_mask = interp_method_mask + + def visualize(self, image_bgr, mask, matrix, bbox_xywh): + self._check_image(image_bgr) + self._check_mask_matrix(mask, matrix) + if self.inplace: + image_target_bgr = image_bgr + else: + image_target_bgr = image_bgr * 0 + x, y, w, h = [int(v) for v in bbox_xywh] + if w <= 0 or h <= 0: + return image_bgr + mask, matrix = self._resize(mask, matrix, w, h) + mask_bg = np.tile((mask == 0)[:, :, np.newaxis], [1, 1, 3]) + matrix_scaled = matrix.astype(np.float32) * self.val_scale + _EPSILON = 1e-6 + if np.any(matrix_scaled > 255 + _EPSILON): + logger = logging.getLogger(__name__) + logger.warning( + f"Matrix has values > {255 + _EPSILON} after " f"scaling, clipping to [0..255]" + ) + matrix_scaled_8u = matrix_scaled.clip(0, 255).astype(np.uint8) + matrix_vis = cv2.applyColorMap(matrix_scaled_8u, self.cmap) + matrix_vis[mask_bg] = image_target_bgr[y : y + h, x : x + w, :][mask_bg] + image_target_bgr[y : y + h, x : x + w, :] = ( + image_target_bgr[y : y + h, x : x + w, :] * (1.0 - self.alpha) + matrix_vis * self.alpha + ) + return image_target_bgr.astype(np.uint8) + + def _resize(self, mask, matrix, w, h): + if (w != mask.shape[1]) or (h != mask.shape[0]): + mask = cv2.resize(mask, (w, h), self.interp_method_mask) + if (w != matrix.shape[1]) or (h != matrix.shape[0]): + matrix = cv2.resize(matrix, (w, h), self.interp_method_matrix) + return mask, matrix + + def _check_image(self, image_rgb): + assert len(image_rgb.shape) == 3 + assert image_rgb.shape[2] == 3 + assert image_rgb.dtype == np.uint8 + + def _check_mask_matrix(self, mask, matrix): + assert len(matrix.shape) == 2 + assert len(mask.shape) == 2 + assert mask.dtype == np.uint8 + + +class RectangleVisualizer(object): + + _COLOR_GREEN = (18, 127, 15) + + def __init__(self, color=_COLOR_GREEN, thickness=1): + self.color = color + self.thickness = thickness + + def visualize(self, image_bgr, bbox_xywh, color=None, thickness=None): + x, y, w, h = bbox_xywh + color = color or self.color + thickness = thickness or self.thickness + cv2.rectangle(image_bgr, (int(x), int(y)), (int(x + w), int(y + h)), color, thickness) + return image_bgr + + +class PointsVisualizer(object): + + _COLOR_GREEN = (18, 127, 15) + + def __init__(self, color_bgr=_COLOR_GREEN, r=5): + self.color_bgr = color_bgr + self.r = r + + def visualize(self, image_bgr, pts_xy, colors_bgr=None, rs=None): + for j, pt_xy in enumerate(pts_xy): + x, y = pt_xy + color_bgr = colors_bgr[j] if colors_bgr is not None else self.color_bgr + r = rs[j] if rs is not None else self.r + cv2.circle(image_bgr, (x, y), r, color_bgr, -1) + return image_bgr + + +class TextVisualizer(object): + + _COLOR_GRAY = (218, 227, 218) + _COLOR_WHITE = (255, 255, 255) + + def __init__( + self, + font_face=cv2.FONT_HERSHEY_SIMPLEX, + font_color_bgr=_COLOR_GRAY, + font_scale=0.35, + font_line_type=cv2.LINE_AA, + font_line_thickness=1, + fill_color_bgr=_COLOR_WHITE, + fill_color_transparency=1.0, + frame_color_bgr=_COLOR_WHITE, + 
frame_color_transparency=1.0, + frame_thickness=1, + ): + self.font_face = font_face + self.font_color_bgr = font_color_bgr + self.font_scale = font_scale + self.font_line_type = font_line_type + self.font_line_thickness = font_line_thickness + self.fill_color_bgr = fill_color_bgr + self.fill_color_transparency = fill_color_transparency + self.frame_color_bgr = frame_color_bgr + self.frame_color_transparency = frame_color_transparency + self.frame_thickness = frame_thickness + + def visualize(self, image_bgr, txt, topleft_xy): + txt_w, txt_h = self.get_text_size_wh(txt) + topleft_xy = tuple(map(int, topleft_xy)) + x, y = topleft_xy + if self.frame_color_transparency < 1.0: + t = self.frame_thickness + image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :] = ( + image_bgr[y - t : y + txt_h + t, x - t : x + txt_w + t, :] + * self.frame_color_transparency + + np.array(self.frame_color_bgr) * (1.0 - self.frame_color_transparency) + ).astype(np.float) + if self.fill_color_transparency < 1.0: + image_bgr[y : y + txt_h, x : x + txt_w, :] = ( + image_bgr[y : y + txt_h, x : x + txt_w, :] * self.fill_color_transparency + + np.array(self.fill_color_bgr) * (1.0 - self.fill_color_transparency) + ).astype(np.float) + cv2.putText( + image_bgr, + txt, + topleft_xy, + self.font_face, + self.font_scale, + self.font_color_bgr, + self.font_line_thickness, + self.font_line_type, + ) + return image_bgr + + def get_text_size_wh(self, txt): + ((txt_w, txt_h), _) = cv2.getTextSize( + txt, self.font_face, self.font_scale, self.font_line_thickness + ) + return txt_w, txt_h + + +class CompoundVisualizer(object): + def __init__(self, visualizers): + self.visualizers = visualizers + + def visualize(self, image_bgr, data): + assert len(data) == len( + self.visualizers + ), "The number of datas {} should match the number of visualizers" " {}".format( + len(data), len(self.visualizers) + ) + image = image_bgr + for i, visualizer in enumerate(self.visualizers): + image = visualizer.visualize(image, data[i]) + return image + + def __str__(self): + visualizer_str = ", ".join([str(v) for v in self.visualizers]) + return "Compound Visualizer [{}]".format(visualizer_str) diff --git a/projects/DensePose/densepose/vis/bounding_box.py b/projects/DensePose/densepose/vis/bounding_box.py new file mode 100644 index 0000000000000000000000000000000000000000..d7951d69e4a92d638debc79458dd2cfe58c650e3 --- /dev/null +++ b/projects/DensePose/densepose/vis/bounding_box.py @@ -0,0 +1,37 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from .base import RectangleVisualizer, TextVisualizer + + +class BoundingBoxVisualizer(object): + def __init__(self): + self.rectangle_visualizer = RectangleVisualizer() + + def visualize(self, image_bgr, boxes_xywh): + for bbox_xywh in boxes_xywh: + image_bgr = self.rectangle_visualizer.visualize(image_bgr, bbox_xywh) + return image_bgr + + +class ScoredBoundingBoxVisualizer(object): + def __init__(self, bbox_visualizer_params=None, score_visualizer_params=None): + if bbox_visualizer_params is None: + bbox_visualizer_params = {} + if score_visualizer_params is None: + score_visualizer_params = {} + self.visualizer_bbox = RectangleVisualizer(**bbox_visualizer_params) + self.visualizer_score = TextVisualizer(**score_visualizer_params) + + def visualize(self, image_bgr, scored_bboxes): + boxes_xywh, box_scores = scored_bboxes + assert len(boxes_xywh) == len( + box_scores + ), "Number of bounding boxes {} should be equal to the number of scores {}".format( + len(boxes_xywh), len(box_scores) + ) + for i, box_xywh in enumerate(boxes_xywh): + score_i = box_scores[i] + image_bgr = self.visualizer_bbox.visualize(image_bgr, box_xywh) + score_txt = "{0:6.4f}".format(score_i) + topleft_xy = box_xywh[0], box_xywh[1] + image_bgr = self.visualizer_score.visualize(image_bgr, score_txt, topleft_xy) + return image_bgr diff --git a/projects/DensePose/densepose/vis/densepose.py b/projects/DensePose/densepose/vis/densepose.py new file mode 100644 index 0000000000000000000000000000000000000000..f2e77dc2d8e0f8c041ac1217978c639a826f0857 --- /dev/null +++ b/projects/DensePose/densepose/vis/densepose.py @@ -0,0 +1,593 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import numpy as np +from typing import Iterable, Optional, Tuple +import cv2 + +from ..data.structures import DensePoseDataRelative, DensePoseOutput, DensePoseResult +from .base import Boxes, Image, MatrixVisualizer, PointsVisualizer + + +class DensePoseResultsVisualizer(object): + def visualize(self, image_bgr: Image, densepose_result: Optional[DensePoseResult]) -> Image: + if densepose_result is None: + return image_bgr + context = self.create_visualization_context(image_bgr) + for i, result_encoded_w_shape in enumerate(densepose_result.results): + iuv_arr = DensePoseResult.decode_png_data(*result_encoded_w_shape) + bbox_xywh = densepose_result.boxes_xywh[i] + self.visualize_iuv_arr(context, iuv_arr, bbox_xywh) + image_bgr = self.context_to_image_bgr(context) + return image_bgr + + +class DensePoseMaskedColormapResultsVisualizer(DensePoseResultsVisualizer): + def __init__( + self, + data_extractor, + segm_extractor, + inplace=True, + cmap=cv2.COLORMAP_PARULA, + alpha=0.7, + val_scale=1.0, + ): + self.mask_visualizer = MatrixVisualizer( + inplace=inplace, cmap=cmap, val_scale=val_scale, alpha=alpha + ) + self.data_extractor = data_extractor + self.segm_extractor = segm_extractor + + def create_visualization_context(self, image_bgr: Image): + return image_bgr + + def context_to_image_bgr(self, context): + return context + + def get_image_bgr_from_context(self, context): + return context + + def visualize_iuv_arr(self, context, iuv_arr, bbox_xywh): + image_bgr = self.get_image_bgr_from_context(context) + matrix = self.data_extractor(iuv_arr) + segm = self.segm_extractor(iuv_arr) + mask = np.zeros(matrix.shape, dtype=np.uint8) + mask[segm > 0] = 1 + image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh) + return image_bgr + + +def _extract_i_from_iuvarr(iuv_arr): + 
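+    # IUV array layout is [3, H, W]: channel 0 = part index (I), channel 1 = U, channel 2 = V.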
return iuv_arr[0, :, :] + + +def _extract_u_from_iuvarr(iuv_arr): + return iuv_arr[1, :, :] + + +def _extract_v_from_iuvarr(iuv_arr): + return iuv_arr[2, :, :] + + +class DensePoseResultsMplContourVisualizer(DensePoseResultsVisualizer): + def __init__(self, levels=10, **kwargs): + self.levels = levels + self.plot_args = kwargs + + def create_visualization_context(self, image_bgr: Image): + import matplotlib.pyplot as plt + from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas + + context = {} + context["image_bgr"] = image_bgr + dpi = 100 + height_inches = float(image_bgr.shape[0]) / dpi + width_inches = float(image_bgr.shape[1]) / dpi + fig = plt.figure(figsize=(width_inches, height_inches), dpi=dpi) + plt.axes([0, 0, 1, 1]) + plt.axis("off") + context["fig"] = fig + canvas = FigureCanvas(fig) + context["canvas"] = canvas + extent = (0, image_bgr.shape[1], image_bgr.shape[0], 0) + plt.imshow(image_bgr[:, :, ::-1], extent=extent) + return context + + def context_to_image_bgr(self, context): + fig = context["fig"] + w, h = map(int, fig.get_size_inches() * fig.get_dpi()) + canvas = context["canvas"] + canvas.draw() + image_1d = np.fromstring(canvas.tostring_rgb(), dtype="uint8") + image_rgb = image_1d.reshape(h, w, 3) + image_bgr = image_rgb[:, :, ::-1].copy() + return image_bgr + + def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image: + import matplotlib.pyplot as plt + + u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0 + v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0 + extent = ( + bbox_xywh[0], + bbox_xywh[0] + bbox_xywh[2], + bbox_xywh[1], + bbox_xywh[1] + bbox_xywh[3], + ) + plt.contour(u, self.levels, extent=extent, **self.plot_args) + plt.contour(v, self.levels, extent=extent, **self.plot_args) + + +class DensePoseResultsCustomContourVisualizer(DensePoseResultsVisualizer): + """ + Contour visualization using marching squares + """ + + def __init__(self, levels=10, **kwargs): + # TODO: colormap is hardcoded + cmap = cv2.COLORMAP_PARULA + if isinstance(levels, int): + self.levels = np.linspace(0, 1, levels) + else: + self.levels = levels + if "linewidths" in kwargs: + self.linewidths = kwargs["linewidths"] + else: + self.linewidths = [1] * len(self.levels) + self.plot_args = kwargs + img_colors_bgr = cv2.applyColorMap((self.levels * 255).astype(np.uint8), cmap) + self.level_colors_bgr = [ + [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr + ] + + def create_visualization_context(self, image_bgr: Image): + return image_bgr + + def context_to_image_bgr(self, context): + return context + + def get_image_bgr_from_context(self, context): + return context + + def visualize_iuv_arr(self, context, iuv_arr: np.ndarray, bbox_xywh: Boxes) -> Image: + image_bgr = self.get_image_bgr_from_context(context) + segm = _extract_i_from_iuvarr(iuv_arr) + u = _extract_u_from_iuvarr(iuv_arr).astype(float) / 255.0 + v = _extract_v_from_iuvarr(iuv_arr).astype(float) / 255.0 + self._contours(image_bgr, u, segm, bbox_xywh) + self._contours(image_bgr, v, segm, bbox_xywh) + + def _contours(self, image_bgr, arr, segm, bbox_xywh): + for part_idx in range(1, DensePoseDataRelative.N_PART_LABELS + 1): + mask = segm == part_idx + if not np.any(mask): + continue + arr_min = np.amin(arr[mask]) + arr_max = np.amax(arr[mask]) + I, J = np.nonzero(mask) + i0 = np.amin(I) + i1 = np.amax(I) + 1 + j0 = np.amin(J) + j1 = np.amax(J) + 1 + if (j1 == j0 + 1) or (i1 == i0 + 1): + continue + Nw = arr.shape[1] - 1 + Nh = arr.shape[0] - 1 
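+            # Marching squares over this part's sub-array: for each contour level,
+            # every 2x2 cell is encoded as a 4-bit code that _bin_code_2_lines maps to line segments.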
+ for level_idx, level in enumerate(self.levels): + if (level < arr_min) or (level > arr_max): + continue + vp = arr[i0:i1, j0:j1] >= level + bin_codes = vp[:-1, :-1] + vp[1:, :-1] * 2 + vp[1:, 1:] * 4 + vp[:-1, 1:] * 8 + mp = mask[i0:i1, j0:j1] + bin_mask_codes = mp[:-1, :-1] + mp[1:, :-1] * 2 + mp[1:, 1:] * 4 + mp[:-1, 1:] * 8 + it = np.nditer(bin_codes, flags=["multi_index"]) + color_bgr = self.level_colors_bgr[level_idx] + linewidth = self.linewidths[level_idx] + while not it.finished: + if (it[0] != 0) and (it[0] != 15): + i, j = it.multi_index + if bin_mask_codes[i, j] != 0: + self._draw_line( + image_bgr, + arr, + mask, + level, + color_bgr, + linewidth, + it[0], + it.multi_index, + bbox_xywh, + Nw, + Nh, + (i0, j0), + ) + it.iternext() + + def _draw_line( + self, + image_bgr, + arr, + mask, + v, + color_bgr, + linewidth, + bin_code, + multi_idx, + bbox_xywh, + Nw, + Nh, + offset, + ): + lines = self._bin_code_2_lines(arr, v, bin_code, multi_idx, Nw, Nh, offset) + x0, y0, w, h = bbox_xywh + x1 = x0 + w + y1 = y0 + h + for line in lines: + x0r, y0r = line[0] + x1r, y1r = line[1] + pt0 = (int(x0 + x0r * (x1 - x0)), int(y0 + y0r * (y1 - y0))) + pt1 = (int(x0 + x1r * (x1 - x0)), int(y0 + y1r * (y1 - y0))) + cv2.line(image_bgr, pt0, pt1, color_bgr, linewidth) + + def _bin_code_2_lines(self, arr, v, bin_code, multi_idx, Nw, Nh, offset): + i0, j0 = offset + i, j = multi_idx + i += i0 + j += j0 + v0, v1, v2, v3 = arr[i, j], arr[i + 1, j], arr[i + 1, j + 1], arr[i, j + 1] + x0i = float(j) / Nw + y0j = float(i) / Nh + He = 1.0 / Nh + We = 1.0 / Nw + if (bin_code == 1) or (bin_code == 14): + a = (v - v0) / (v1 - v0) + b = (v - v0) / (v3 - v0) + pt1 = (x0i, y0j + a * He) + pt2 = (x0i + b * We, y0j) + return [(pt1, pt2)] + elif (bin_code == 2) or (bin_code == 13): + a = (v - v0) / (v1 - v0) + b = (v - v1) / (v2 - v1) + pt1 = (x0i, y0j + a * He) + pt2 = (x0i + b * We, y0j + He) + return [(pt1, pt2)] + elif (bin_code == 3) or (bin_code == 12): + a = (v - v0) / (v3 - v0) + b = (v - v1) / (v2 - v1) + pt1 = (x0i + a * We, y0j) + pt2 = (x0i + b * We, y0j + He) + return [(pt1, pt2)] + elif (bin_code == 4) or (bin_code == 11): + a = (v - v1) / (v2 - v1) + b = (v - v3) / (v2 - v3) + pt1 = (x0i + a * We, y0j + He) + pt2 = (x0i + We, y0j + b * He) + return [(pt1, pt2)] + elif (bin_code == 6) or (bin_code == 9): + a = (v - v0) / (v1 - v0) + b = (v - v3) / (v2 - v3) + pt1 = (x0i, y0j + a * He) + pt2 = (x0i + We, y0j + b * He) + return [(pt1, pt2)] + elif (bin_code == 7) or (bin_code == 8): + a = (v - v0) / (v3 - v0) + b = (v - v3) / (v2 - v3) + pt1 = (x0i + a * We, y0j) + pt2 = (x0i + We, y0j + b * He) + return [(pt1, pt2)] + elif bin_code == 5: + a1 = (v - v0) / (v1 - v0) + b1 = (v - v1) / (v2 - v1) + pt11 = (x0i, y0j + a1 * He) + pt12 = (x0i + b1 * We, y0j + He) + a2 = (v - v0) / (v3 - v0) + b2 = (v - v3) / (v2 - v3) + pt21 = (x0i + a2 * We, y0j) + pt22 = (x0i + We, y0j + b2 * He) + return [(pt11, pt12), (pt21, pt22)] + elif bin_code == 10: + a1 = (v - v0) / (v3 - v0) + b1 = (v - v0) / (v1 - v0) + pt11 = (x0i + a1 * We, y0j) + pt12 = (x0i, y0j + b1 * He) + a2 = (v - v1) / (v2 - v1) + b2 = (v - v3) / (v2 - v3) + pt21 = (x0i + a2 * We, y0j + He) + pt22 = (x0i + We, y0j + b2 * He) + return [(pt11, pt12), (pt21, pt22)] + return [] + + +try: + import matplotlib + + matplotlib.use("Agg") + DensePoseResultsContourVisualizer = DensePoseResultsMplContourVisualizer +except ModuleNotFoundError: + logger = logging.getLogger(__name__) + logger.warning("Could not import matplotlib, using custom contour visualizer") + 
DensePoseResultsContourVisualizer = DensePoseResultsCustomContourVisualizer + + +class DensePoseResultsFineSegmentationVisualizer(DensePoseMaskedColormapResultsVisualizer): + def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7): + super(DensePoseResultsFineSegmentationVisualizer, self).__init__( + _extract_i_from_iuvarr, + _extract_i_from_iuvarr, + inplace, + cmap, + alpha, + val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS, + ) + + +class DensePoseResultsUVisualizer(DensePoseMaskedColormapResultsVisualizer): + def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7): + super(DensePoseResultsUVisualizer, self).__init__( + _extract_u_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0 + ) + + +class DensePoseResultsVVisualizer(DensePoseMaskedColormapResultsVisualizer): + def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7): + super(DensePoseResultsVVisualizer, self).__init__( + _extract_v_from_iuvarr, _extract_i_from_iuvarr, inplace, cmap, alpha, val_scale=1.0 + ) + + +class DensePoseOutputsFineSegmentationVisualizer(object): + def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7): + self.mask_visualizer = MatrixVisualizer( + inplace=inplace, + cmap=cmap, + val_scale=255.0 / DensePoseDataRelative.N_PART_LABELS, + alpha=alpha, + ) + + def visualize( + self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]] + ) -> Image: + if dp_output_with_bboxes is None: + return image_bgr + densepose_output, bboxes_xywh = dp_output_with_bboxes + S = densepose_output.S + I = densepose_output.I # noqa + U = densepose_output.U + V = densepose_output.V + N = S.size(0) + assert N == I.size( + 0 + ), "densepose outputs S {} and I {}" " should have equal first dim size".format( + S.size(), I.size() + ) + assert N == U.size( + 0 + ), "densepose outputs S {} and U {}" " should have equal first dim size".format( + S.size(), U.size() + ) + assert N == V.size( + 0 + ), "densepose outputs S {} and V {}" " should have equal first dim size".format( + S.size(), V.size() + ) + assert N == len( + bboxes_xywh + ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format( + len(bboxes_xywh), N + ) + for n in range(N): + Sn = S[n].argmax(dim=0) + In = I[n].argmax(dim=0) * (Sn > 0).long() + matrix = In.cpu().numpy().astype(np.uint8) + mask = np.zeros(matrix.shape, dtype=np.uint8) + mask[matrix > 0] = 1 + bbox_xywh = bboxes_xywh[n] + image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh) + return image_bgr + + +class DensePoseOutputsUVisualizer(object): + def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7): + self.mask_visualizer = MatrixVisualizer( + inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha + ) + + def visualize( + self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]] + ) -> Image: + if dp_output_with_bboxes is None: + return image_bgr + densepose_output, bboxes_xywh = dp_output_with_bboxes + assert isinstance( + densepose_output, DensePoseOutput + ), "DensePoseOutput expected, {} encountered".format(type(densepose_output)) + S = densepose_output.S + I = densepose_output.I # noqa + U = densepose_output.U + V = densepose_output.V + N = S.size(0) + assert N == I.size( + 0 + ), "densepose outputs S {} and I {}" " should have equal first dim size".format( + S.size(), I.size() + ) + assert N == U.size( + 0 + ), "densepose outputs S {} and U {}" " should have equal first dim size".format( + 
S.size(), U.size() + ) + assert N == V.size( + 0 + ), "densepose outputs S {} and V {}" " should have equal first dim size".format( + S.size(), V.size() + ) + assert N == len( + bboxes_xywh + ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format( + len(bboxes_xywh), N + ) + for n in range(N): + Sn = S[n].argmax(dim=0) + In = I[n].argmax(dim=0) * (Sn > 0).long() + segmentation = In.cpu().numpy().astype(np.uint8) + mask = np.zeros(segmentation.shape, dtype=np.uint8) + mask[segmentation > 0] = 1 + Un = U[n].cpu().numpy().astype(np.float32) + Uvis = np.zeros(segmentation.shape, dtype=np.float32) + for partId in range(Un.shape[0]): + Uvis[segmentation == partId] = Un[partId][segmentation == partId].clip(0, 1) * 255 + bbox_xywh = bboxes_xywh[n] + image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Uvis, bbox_xywh) + return image_bgr + + +class DensePoseOutputsVVisualizer(object): + def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7): + self.mask_visualizer = MatrixVisualizer( + inplace=inplace, cmap=cmap, val_scale=1.0, alpha=alpha + ) + + def visualize( + self, image_bgr: Image, dp_output_with_bboxes: Optional[Tuple[DensePoseOutput, Boxes]] + ) -> Image: + if dp_output_with_bboxes is None: + return image_bgr + densepose_output, bboxes_xywh = dp_output_with_bboxes + assert isinstance( + densepose_output, DensePoseOutput + ), "DensePoseOutput expected, {} encountered".format(type(densepose_output)) + S = densepose_output.S + I = densepose_output.I # noqa + U = densepose_output.U + V = densepose_output.V + N = S.size(0) + assert N == I.size( + 0 + ), "densepose outputs S {} and I {}" " should have equal first dim size".format( + S.size(), I.size() + ) + assert N == U.size( + 0 + ), "densepose outputs S {} and U {}" " should have equal first dim size".format( + S.size(), U.size() + ) + assert N == V.size( + 0 + ), "densepose outputs S {} and V {}" " should have equal first dim size".format( + S.size(), V.size() + ) + assert N == len( + bboxes_xywh + ), "number of bounding boxes {}" " should be equal to first dim size of outputs {}".format( + len(bboxes_xywh), N + ) + for n in range(N): + Sn = S[n].argmax(dim=0) + In = I[n].argmax(dim=0) * (Sn > 0).long() + segmentation = In.cpu().numpy().astype(np.uint8) + mask = np.zeros(segmentation.shape, dtype=np.uint8) + mask[segmentation > 0] = 1 + Vn = V[n].cpu().numpy().astype(np.float32) + Vvis = np.zeros(segmentation.shape, dtype=np.float32) + for partId in range(Vn.size(0)): + Vvis[segmentation == partId] = Vn[partId][segmentation == partId].clip(0, 1) * 255 + bbox_xywh = bboxes_xywh[n] + image_bgr = self.mask_visualizer.visualize(image_bgr, mask, Vvis, bbox_xywh) + return image_bgr + + +class DensePoseDataCoarseSegmentationVisualizer(object): + """ + Visualizer for ground truth segmentation + """ + + def __init__(self, inplace=True, cmap=cv2.COLORMAP_PARULA, alpha=0.7): + self.mask_visualizer = MatrixVisualizer( + inplace=inplace, + cmap=cmap, + val_scale=255.0 / DensePoseDataRelative.N_BODY_PARTS, + alpha=alpha, + ) + + def visualize( + self, + image_bgr: Image, + bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]], + ) -> Image: + if bbox_densepose_datas is None: + return image_bgr + for bbox_xywh, densepose_data in zip(*bbox_densepose_datas): + matrix = densepose_data.segm.numpy() + mask = np.zeros(matrix.shape, dtype=np.uint8) + mask[matrix > 0] = 1 + image_bgr = self.mask_visualizer.visualize(image_bgr, mask, matrix, bbox_xywh.numpy()) + return 
image_bgr + + +class DensePoseDataPointsVisualizer(object): + def __init__(self, densepose_data_to_value_fn=None, cmap=cv2.COLORMAP_PARULA): + self.points_visualizer = PointsVisualizer() + self.densepose_data_to_value_fn = densepose_data_to_value_fn + self.cmap = cmap + + def visualize( + self, + image_bgr: Image, + bbox_densepose_datas: Optional[Tuple[Iterable[Boxes], Iterable[DensePoseDataRelative]]], + ) -> Image: + if bbox_densepose_datas is None: + return image_bgr + for bbox_xywh, densepose_data in zip(*bbox_densepose_datas): + x0, y0, w, h = bbox_xywh.numpy() + x = densepose_data.x.numpy() * w / 255.0 + x0 + y = densepose_data.y.numpy() * h / 255.0 + y0 + pts_xy = zip(x, y) + if self.densepose_data_to_value_fn is None: + image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy) + else: + v = self.densepose_data_to_value_fn(densepose_data) + img_colors_bgr = cv2.applyColorMap(v, self.cmap) + colors_bgr = [ + [int(v) for v in img_color_bgr.ravel()] for img_color_bgr in img_colors_bgr + ] + image_bgr = self.points_visualizer.visualize(image_bgr, pts_xy, colors_bgr) + return image_bgr + + +def _densepose_data_u_for_cmap(densepose_data): + u = np.clip(densepose_data.u.numpy(), 0, 1) * 255.0 + return u.astype(np.uint8) + + +def _densepose_data_v_for_cmap(densepose_data): + v = np.clip(densepose_data.v.numpy(), 0, 1) * 255.0 + return v.astype(np.uint8) + + +def _densepose_data_i_for_cmap(densepose_data): + i = ( + np.clip(densepose_data.i.numpy(), 0.0, DensePoseDataRelative.N_PART_LABELS) + * 255.0 + / DensePoseDataRelative.N_PART_LABELS + ) + return i.astype(np.uint8) + + +class DensePoseDataPointsUVisualizer(DensePoseDataPointsVisualizer): + def __init__(self): + super(DensePoseDataPointsUVisualizer, self).__init__( + densepose_data_to_value_fn=_densepose_data_u_for_cmap + ) + + +class DensePoseDataPointsVVisualizer(DensePoseDataPointsVisualizer): + def __init__(self): + super(DensePoseDataPointsVVisualizer, self).__init__( + densepose_data_to_value_fn=_densepose_data_v_for_cmap + ) + + +class DensePoseDataPointsIVisualizer(DensePoseDataPointsVisualizer): + def __init__(self): + super(DensePoseDataPointsIVisualizer, self).__init__( + densepose_data_to_value_fn=_densepose_data_i_for_cmap + ) diff --git a/projects/DensePose/densepose/vis/extractor.py b/projects/DensePose/densepose/vis/extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..b715a4451e096d6d6c086f9bcf60f92d2ae692f8 --- /dev/null +++ b/projects/DensePose/densepose/vis/extractor.py @@ -0,0 +1,152 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +from typing import Sequence +import torch + +from detectron2.layers.nms import batched_nms +from detectron2.structures.instances import Instances + +from densepose.vis.bounding_box import BoundingBoxVisualizer, ScoredBoundingBoxVisualizer +from densepose.vis.densepose import DensePoseResultsVisualizer + +from .base import CompoundVisualizer + +Scores = Sequence[float] + + +def extract_scores_from_instances(instances: Instances, select=None): + if instances.has("scores"): + return instances.scores if select is None else instances.scores[select] + return None + + +def extract_boxes_xywh_from_instances(instances: Instances, select=None): + if instances.has("pred_boxes"): + boxes_xywh = instances.pred_boxes.tensor.clone() + boxes_xywh[:, 2] -= boxes_xywh[:, 0] + boxes_xywh[:, 3] -= boxes_xywh[:, 1] + return boxes_xywh if select is None else boxes_xywh[select] + return None + + +def create_extractor(visualizer: object): + """ + Create an extractor for the provided visualizer + """ + if isinstance(visualizer, CompoundVisualizer): + extractors = [create_extractor(v) for v in visualizer.visualizers] + return CompoundExtractor(extractors) + elif isinstance(visualizer, DensePoseResultsVisualizer): + return DensePoseResultExtractor() + elif isinstance(visualizer, ScoredBoundingBoxVisualizer): + return CompoundExtractor([extract_boxes_xywh_from_instances, extract_scores_from_instances]) + elif isinstance(visualizer, BoundingBoxVisualizer): + return extract_boxes_xywh_from_instances + else: + logger = logging.getLogger(__name__) + logger.error(f"Could not create extractor for {visualizer}") + return None + + +class BoundingBoxExtractor(object): + """ + Extracts bounding boxes from instances + """ + + def __call__(self, instances: Instances): + boxes_xywh = extract_boxes_xywh_from_instances(instances) + return boxes_xywh + + +class ScoredBoundingBoxExtractor(object): + """ + Extracts bounding boxes from instances + """ + + def __call__(self, instances: Instances, select=None): + scores = extract_scores_from_instances(instances) + boxes_xywh = extract_boxes_xywh_from_instances(instances) + if (scores is None) or (boxes_xywh is None): + return (boxes_xywh, scores) + if select is not None: + scores = scores[select] + boxes_xywh = boxes_xywh[select] + return (boxes_xywh, scores) + + +class DensePoseResultExtractor(object): + """ + Extracts DensePose result from instances + """ + + def __call__(self, instances: Instances, select=None): + boxes_xywh = extract_boxes_xywh_from_instances(instances) + if instances.has("pred_densepose") and (boxes_xywh is not None): + dpout = instances.pred_densepose + if select is not None: + dpout = dpout[select] + boxes_xywh = boxes_xywh[select] + return dpout.to_result(boxes_xywh) + else: + return None + + +class CompoundExtractor(object): + """ + Extracts data for CompoundVisualizer + """ + + def __init__(self, extractors): + self.extractors = extractors + + def __call__(self, instances: Instances, select=None): + datas = [] + for extractor in self.extractors: + data = extractor(instances, select) + datas.append(data) + return datas + + +class NmsFilteredExtractor(object): + """ + Extracts data in the format accepted by NmsFilteredVisualizer + """ + + def __init__(self, extractor, iou_threshold): + self.extractor = extractor + self.iou_threshold = iou_threshold + + def __call__(self, instances: Instances, select=None): + scores = extract_scores_from_instances(instances) + boxes_xywh = extract_boxes_xywh_from_instances(instances) + if 
boxes_xywh is None: + return None + select_local_idx = batched_nms( + boxes_xywh, + scores, + torch.zeros(len(scores), dtype=torch.int32), + iou_threshold=self.iou_threshold, + ).squeeze() + select_local = torch.zeros(len(boxes_xywh), dtype=torch.bool, device=boxes_xywh.device) + select_local[select_local_idx] = True + select = select_local if select is None else (select & select_local) + return self.extractor(instances, select=select) + + +class ScoreThresholdedExtractor(object): + """ + Extracts data in the format accepted by ScoreThresholdedVisualizer + """ + + def __init__(self, extractor, min_score): + self.extractor = extractor + self.min_score = min_score + + def __call__(self, instances: Instances, select=None): + scores = extract_scores_from_instances(instances) + if scores is None: + return None + select_local = scores > self.min_score + select = select_local if select is None else (select & select_local) + data = self.extractor(instances, select=select) + return data diff --git a/projects/DensePose/dev/README.md b/projects/DensePose/dev/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e3a94b67ed4b4d0c2934f074802cd00f3660f9a9 --- /dev/null +++ b/projects/DensePose/dev/README.md @@ -0,0 +1,7 @@ + +## Some scripts for developers to use, include: + +- `run_instant_tests.sh`: run training for a few iterations. +- `run_inference_tests.sh`: run inference on a small dataset. +- `../../dev/linter.sh`: lint the codebase before commit +- `../../dev/parse_results.sh`: parse results from log file. diff --git a/projects/DensePose/dev/run_inference_tests.sh b/projects/DensePose/dev/run_inference_tests.sh new file mode 100755 index 0000000000000000000000000000000000000000..34f47d5a07a90c411e830c98a346845fa618f836 --- /dev/null +++ b/projects/DensePose/dev/run_inference_tests.sh @@ -0,0 +1,33 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +BIN="python train_net.py" +OUTPUT="inference_test_output" +NUM_GPUS=2 +IMS_PER_GPU=2 +IMS_PER_BATCH=$(( NUM_GPUS * IMS_PER_GPU )) + +CFG_LIST=( "${@:1}" ) + +if [ ${#CFG_LIST[@]} -eq 0 ]; then + CFG_LIST=( ./configs/quick_schedules/*inference_acc_test.yaml ) +fi + +echo "========================================================================" +echo "Configs to run:" +echo "${CFG_LIST[@]}" +echo "========================================================================" + +for cfg in "${CFG_LIST[@]}"; do + echo "========================================================================" + echo "Running $cfg ..." + echo "========================================================================" + $BIN \ + --eval-only \ + --num-gpus $NUM_GPUS \ + --config-file "$cfg" \ + OUTPUT_DIR "$OUTPUT" \ + SOLVER.IMS_PER_BATCH $IMS_PER_BATCH + rm -rf $OUTPUT +done + diff --git a/projects/DensePose/dev/run_instant_tests.sh b/projects/DensePose/dev/run_instant_tests.sh new file mode 100755 index 0000000000000000000000000000000000000000..a53785180974a70bce7fdb0c9da4024166efd596 --- /dev/null +++ b/projects/DensePose/dev/run_instant_tests.sh @@ -0,0 +1,28 @@ +#!/bin/bash -e +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +BIN="python train_net.py" +OUTPUT="instant_test_output" +NUM_GPUS=2 +SOLVER_IMS_PER_BATCH=$((NUM_GPUS * 2)) + +CFG_LIST=( "${@:1}" ) +if [ ${#CFG_LIST[@]} -eq 0 ]; then + CFG_LIST=( ./configs/quick_schedules/*instant_test.yaml ) +fi + +echo "========================================================================" +echo "Configs to run:" +echo "${CFG_LIST[@]}" +echo "========================================================================" + +for cfg in "${CFG_LIST[@]}"; do + echo "========================================================================" + echo "Running $cfg ..." + echo "========================================================================" + $BIN --num-gpus $NUM_GPUS --config-file "$cfg" \ + SOLVER.IMS_PER_BATCH $SOLVER_IMS_PER_BATCH \ + OUTPUT_DIR "$OUTPUT" + rm -rf "$OUTPUT" +done + diff --git a/projects/DensePose/doc/GETTING_STARTED.md b/projects/DensePose/doc/GETTING_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..a6bcbedee42835c99fa5aa1110309329dfbff6f0 --- /dev/null +++ b/projects/DensePose/doc/GETTING_STARTED.md @@ -0,0 +1,58 @@ +# Getting Started with DensePose + +## Inference with Pre-trained Models + +1. Pick a model and its config file from [Model Zoo](MODEL_ZOO.md), for example [densepose_rcnn_R_50_FPN_s1x.yaml](../configs/densepose_rcnn_R_50_FPN_s1x.yaml) +2. Run the [Apply Net](TOOL_APPLY_NET.md) tool to visualize the results or save the to disk. For example, to use contour visualization for DensePose, one can run: +```bash +python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml densepose_rcnn_R_50_FPN_s1x.pkl image.jpg dp_contour,bbox --output image_densepose_contour.png +``` +Please see [Apply Net](TOOL_APPLY_NET.md) for more details on the tool. + +## Training + +First, prepare the [dataset](http://densepose.org/#dataset) into the following structure under the directory you'll run training scripts: +
+```
+datasets/coco/
+  annotations/
+    densepose_{train,minival,valminusminival}2014.json
+    densepose_minival2014_100.json   (optional, for testing only)
+  {train,val}2014/
+    # image files that are mentioned in the corresponding json
+```
+ +To train a model one can use the [train_net.py](../train_net.py) script. +This script was used to train all DensePose models in [Model Zoo](MODEL_ZOO.md). +For example, to launch end-to-end DensePose-RCNN training with ResNet-50 FPN backbone +on 8 GPUs following the s1x schedule, one can run +```bash +python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml --num-gpus 8 +``` +The configs are made for 8-GPU training. To train on 1 GPU, one can apply the +[linear learning rate scaling rule](https://arxiv.org/abs/1706.02677): +```bash +python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \ + SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 +``` + +## Evaluation + +Model testing can be done in the same way as training, except for an additional flag `--eval-only` and +model location specification through `MODEL.WEIGHTS model.pth` in the command line +```bash +python train_net.py --config-file configs/densepose_rcnn_R_50_FPN_s1x.yaml \ + --eval-only MODEL.WEIGHTS model.pth +``` + +## Tools + +We provide tools which allow one to: + - easily view DensePose annotated data in a dataset; + - perform DensePose inference on a set of images; + - visualize DensePose model results; + +`query_db` is a tool to print or visualize DensePose data in a dataset. +Please refer to [Query DB](TOOL_QUERY_DB.md) for more details on this tool + +`apply_net` is a tool to print or visualize DensePose results. +Please refer to [Apply Net](TOOL_APPLY_NET.md) for more details on this tool diff --git a/projects/DensePose/doc/MODEL_ZOO.md b/projects/DensePose/doc/MODEL_ZOO.md new file mode 100644 index 0000000000000000000000000000000000000000..c26308417de03efea3872b44fec43c74ead529e9 --- /dev/null +++ b/projects/DensePose/doc/MODEL_ZOO.md @@ -0,0 +1,277 @@ +# Model Zoo and Baselines + +# Introduction + +We provide baselines trained with Detectron2 DensePose. The corresponding +configuration files can be found in the [configs](../configs) directory. +All models were trained on COCO `train2014` + `valminusminival2014` and +evaluated on COCO `minival2014`. For the details on common settings in which +baselines were trained, please check [Detectron 2 Model Zoo](../../../MODEL_ZOO.md). + +## License + +All models available for download through this document are licensed under the +[Creative Commons Attribution-ShareAlike 3.0 license](https://creativecommons.org/licenses/by-sa/3.0/) + +## COCO DensePose Baselines with DensePose-RCNN + +### Legacy Models + +Baselines trained using schedules from [Güler et al, 2018](https://arxiv.org/pdf/1802.00434.pdf) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | dp. AP GPS | dp. AP GPSm | model id | download |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| R_50_FPN_s1x_legacy | s1x | 0.307 | 0.051 | 3.2 | 58.1 | 52.1 | 54.9 | 164832157 | model \| metrics |
+| R_101_FPN_s1x_legacy | s1x | 0.390 | 0.063 | 4.3 | 59.5 | 53.2 | 56.1 | 164832182 | model \| metrics |
+
+### Improved Baselines, Original Fully Convolutional Head
+
+These models use an improved training schedule and Panoptic FPN head from [Kirillov et al, 2019](https://arxiv.org/abs/1901.02446).
+
+| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | dp. AP GPS | dp. AP GPSm | model id | download |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| R_50_FPN_s1x | s1x | 0.359 | 0.066 | 4.5 | 61.2 | 63.7 | 65.3 | 165712039 | model \| metrics |
+| R_101_FPN_s1x | s1x | 0.428 | 0.079 | 5.8 | 62.3 | 64.5 | 66.4 | 165712084 | model \| metrics |
+
+### Improved Baselines, DeepLabV3 Head
+
+These models use an improved training schedule, Panoptic FPN head from [Kirillov et al, 2019](https://arxiv.org/abs/1901.02446) and DeepLabV3 head from [Chen et al, 2017](https://arxiv.org/abs/1706.05587).
+
+| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | dp. AP GPS | dp. AP GPSm | model id | download |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| R_50_FPN_DL_s1x | s1x | 0.392 | 0.070 | 6.7 | 61.1 | 65.6 | 66.8 | 165712097 | model \| metrics |
+| R_101_FPN_DL_s1x | s1x | 0.478 | 0.083 | 7.0 | 62.3 | 66.3 | 67.7 | 165712116 | model \| metrics |
+
+### Baselines with Confidence Estimation
+
+These models perform additional estimation of confidence in regressed UV coordinates, along the lines of [Neverova et al., 2019](https://papers.nips.cc/paper/8378-correlated-uncertainty-for-learning-dense-correspondences-from-noisy-labels).
+
+| Name | lr sched | train time (s/iter) | inference time (s/im) | train mem (GB) | box AP | dp. AP GPS | dp. AP GPSm | model id | download |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| R_50_FPN_WC1_s1x | s1x | 0.353 | 0.064 | 4.6 | 60.5 | 64.2 | 65.6 | 173862049 | model \| metrics |
+| R_50_FPN_WC2_s1x | s1x | 0.364 | 0.066 | 4.8 | 60.7 | 64.2 | 65.7 | 173861455 | model \| metrics |
+| R_50_FPN_DL_WC1_s1x | s1x | 0.397 | 0.068 | 6.7 | 61.1 | 65.8 | 67.1 | 173067973 | model \| metrics |
+| R_50_FPN_DL_WC2_s1x | s1x | 0.410 | 0.070 | 6.8 | 60.8 | 65.6 | 66.7 | 173859335 | model \| metrics |
+| R_101_FPN_WC1_s1x | s1x | 0.435 | 0.076 | 5.7 | 62.5 | 64.9 | 66.5 | 171402969 | model \| metrics |
+| R_101_FPN_WC2_s1x | s1x | 0.450 | 0.078 | 5.7 | 62.3 | 64.8 | 66.6 | 173860702 | model \| metrics |
+| R_101_FPN_DL_WC1_s1x | s1x | 0.479 | 0.081 | 7.9 | 62.0 | 66.2 | 67.4 | 173858525 | model \| metrics |
+| R_101_FPN_DL_WC2_s1x | s1x | 0.491 | 0.082 | 7.6 | 61.7 | 65.9 | 67.3 | 173294801 | model \| metrics |
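+
+The baselines above can also be used outside the command-line tools. The following is a minimal sketch, not a supported API: it assumes the config file and a downloaded checkpoint are available locally under the names shown, and that `add_densepose_config` is exported by the `densepose` package; the result extraction mirrors `DensePoseResultExtractor` from `densepose/vis/extractor.py`.
+
+```python
+import cv2
+
+from detectron2.config import get_cfg
+from detectron2.engine import DefaultPredictor
+
+from densepose import add_densepose_config  # assumed to be exported by the densepose package
+from densepose.data.structures import DensePoseResult
+from densepose.vis.extractor import DensePoseResultExtractor
+
+cfg = get_cfg()
+add_densepose_config(cfg)
+cfg.merge_from_file("configs/densepose_rcnn_R_50_FPN_s1x.yaml")
+cfg.MODEL.WEIGHTS = "densepose_rcnn_R_50_FPN_s1x.pkl"  # illustrative local checkpoint path
+predictor = DefaultPredictor(cfg)
+
+instances = predictor(cv2.imread("image.jpg"))["instances"]
+# Convert raw DensePose outputs into per-detection encoded IUV results.
+result = DensePoseResultExtractor()(instances)
+if result is not None:
+    for encoded_iuv, bbox_xywh in zip(result.results, result.boxes_xywh):
+        iuv_arr = DensePoseResult.decode_png_data(*encoded_iuv)  # [3, H, W]: part index, U, V
+```
+
+The decoded `iuv_arr` follows the layout documented in [Apply Net](TOOL_APPLY_NET.md).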
+ +## Old Baselines + +It is still possible to use some baselines from [DensePose 1](https://github.com/facebookresearch/DensePose). +Below are evaluation metrics for the baselines recomputed in the current framework: + +| Model | bbox AP | AP | AP50 | AP75 | APm |APl | +|-----|-----|-----|--- |--- |--- |--- | +| [`ResNet50_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet50_FPN_s1x-e2e.pkl) | 54.673 | 48.894 | 84.963 | 50.717 | 43.132 | 50.433 | +| [`ResNet101_FPN_s1x-e2e`](https://dl.fbaipublicfiles.com/densepose/DensePose_ResNet101_FPN_s1x-e2e.pkl) | 56.032 | 51.088 | 86.250 | 55.057 | 46.542 | 52.563 | + +Note: these scores are close, but not strictly equal to the ones reported in the [DensePose 1 Model Zoo](https://github.com/facebookresearch/DensePose/blob/master/MODEL_ZOO.md), +which is due to small incompatibilities between the frameworks. diff --git a/projects/DensePose/doc/TOOL_APPLY_NET.md b/projects/DensePose/doc/TOOL_APPLY_NET.md new file mode 100644 index 0000000000000000000000000000000000000000..f5cf2579a83811e4b192b3688f241b570f62bcb5 --- /dev/null +++ b/projects/DensePose/doc/TOOL_APPLY_NET.md @@ -0,0 +1,130 @@ +# Apply Net + +`apply_net` is a tool to print or visualize DensePose results on a set of images. +It has two modes: `dump` to save DensePose model results to a pickle file +and `show` to visualize them on images. + +## Dump Mode + +The general command form is: +```bash +python apply_net.py dump [-h] [-v] [--output ] +``` + +There are three mandatory arguments: + - ``, configuration file for a given model; + - ``, model file with trained parameters + - ``, input image file name, pattern or folder + +One can additionally provide `--output` argument to define the output file name, +which defaults to `output.pkl`. + + +Examples: + +1. Dump results of a DensePose model with ResNet-50 FPN backbone for images + in a folder `images` to file `dump.pkl`: +```bash +python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl images --output dump.pkl -v +``` + +2. Dump results of a DensePose model with ResNet-50 FPN backbone for images + with file name matching a pattern `image*.jpg` to file `results.pkl`: +```bash +python apply_net.py dump configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl "image*.jpg" --output results.pkl -v +``` + +If you want to load the pickle file generated by the above command: +``` +# make sure DensePose is in your PYTHONPATH, or use the following line to add it: +sys.path.append("/your_detectron2_path/detectron2_repo/projects/DensePose/") + +f = open('/your_result_path/results.pkl', 'rb') +data = pickle.load(f) +``` + +The file `results.pkl` contains the list of results per image, for each image the result is a dictionary: +``` +data: [{'file_name': '/your_path/image1.jpg', + 'scores': tensor([0.9884]), + 'pred_boxes_XYXY': tensor([[ 69.6114, 0.0000, 706.9797, 706.0000]]), + 'pred_densepose': }, + {'file_name': '/your_path/image2.jpg', + 'scores': tensor([0.9999, 0.5373, 0.3991]), + 'pred_boxes_XYXY': tensor([[ 59.5734, 7.7535, 579.9311, 932.3619], + [612.9418, 686.1254, 612.9999, 704.6053], + [164.5081, 407.4034, 598.3944, 920.4266]]), + 'pred_densepose': }] +``` + +We can use the following code, to parse the outputs of the first +detected instance on the first image. 
+``` +img_id, instance_id = 0, 0 # Look at the first image and the first detected instance +bbox_xyxy = data[img_id]['pred_boxes_XYXY'][instance_id] +result_encoded = data[img_id]['pred_densepose'].results[instance_id] +iuv_arr = DensePoseResult.decode_png_data(*result_encoded) +``` +The array `bbox_xyxy` contains (x0, y0, x1, y1) of the bounding box. + +The shape of `iuv_arr` is `[3, H, W]`, where (H, W) is the shape of the bounding box. +- `iuv_arr[0,:,:]`: The patch index of image points, indicating which of the 24 surface patches the point is on. +- `iuv_arr[1,:,:]`: The U-coordinate value of image points. +- `iuv_arr[2,:,:]`: The V-coordinate value of image points. + + +## Visualization Mode + +The general command form is: +```bash +python apply_net.py show [-h] [-v] [--min_score ] [--nms_thresh ] [--output ] +``` + +There are four mandatory arguments: + - ``, configuration file for a given model; + - ``, model file with trained parameters + - ``, input image file name, pattern or folder + - ``, visualizations specifier; currently available visualizations are: + * `bbox` - bounding boxes of detected persons; + * `dp_segm` - segmentation masks for detected persons; + * `dp_u` - each body part is colored according to the estimated values of the + U coordinate in part parameterization; + * `dp_v` - each body part is colored according to the estimated values of the + V coordinate in part parameterization; + * `dp_contour` - plots contours with color-coded U and V coordinates + + +One can additionally provide the following optional arguments: + - `--min_score` to only show detections with sufficient scores that are not lower than provided value + - `--nms_thresh` to additionally apply non-maximum suppression to detections at a given threshold + - `--output` to define visualization file name template, which defaults to `output.png`. + To distinguish output file names for different images, the tool appends 1-based entry index, + e.g. output.0001.png, output.0002.png, etc... + + +The following examples show how to output results of a DensePose model +with ResNet-50 FPN backbone using different visualizations for image `image.jpg`: + +1. Show bounding box and segmentation: +```bash +python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_segm -v +``` +![Bounding Box + Segmentation Visualization](images/res_bbox_dp_segm.jpg) + +2. Show bounding box and estimated U coordinates for body parts: +```bash +python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_u -v +``` +![Bounding Box + U Coordinate Visualization](images/res_bbox_dp_u.jpg) + +3. Show bounding box and estimated V coordinates for body parts: +```bash +python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg bbox,dp_v -v +``` +![Bounding Box + V Coordinate Visualization](images/res_bbox_dp_v.jpg) + +4. 
Show bounding box and estimated U and V coordinates via contour plots: +```bash +python apply_net.py show configs/densepose_rcnn_R_50_FPN_s1x.yaml DensePose_ResNet50_FPN_s1x-e2e.pkl image.jpg dp_contour,bbox -v +``` +![Bounding Box + Contour Visualization](images/res_bbox_dp_contour.jpg) diff --git a/projects/DensePose/doc/TOOL_QUERY_DB.md b/projects/DensePose/doc/TOOL_QUERY_DB.md new file mode 100644 index 0000000000000000000000000000000000000000..b0a764b8740597c6af634127b80b53d28913726f --- /dev/null +++ b/projects/DensePose/doc/TOOL_QUERY_DB.md @@ -0,0 +1,105 @@ + +# Query Dataset + +`query_db` is a tool to print or visualize DensePose data from a dataset. +It has two modes: `print` and `show` to output dataset entries to standard +output or to visualize them on images. + +## Print Mode + +The general command form is: +```bash +python query_db.py print [-h] [-v] [--max-entries N] +``` + +There are two mandatory arguments: + - ``, DensePose dataset specification, from which to select + the entries (e.g. `densepose_coco_2014_train`). + - ``, dataset entry selector which can be a single specification, + or a comma-separated list of specifications of the form + `field[:type]=value` for exact match with the value + or `field[:type]=min-max` for a range of values + +One can additionally limit the maximum number of entries to output +by providing `--max-entries` argument. + +Examples: + +1. Output at most 10 first entries from the `densepose_coco_2014_train` dataset: +```bash +python query_db.py print densepose_coco_2014_train \* --max-entries 10 -v +``` + +2. Output all entries with `file_name` equal to `COCO_train2014_000000000036.jpg`: +```bash +python query_db.py print densepose_coco_2014_train file_name=COCO_train2014_000000000036.jpg -v +``` + +3. Output all entries with `image_id` between 36 and 156: +```bash +python query_db.py print densepose_coco_2014_train image_id:int=36-156 -v +``` + +## Visualization Mode + +The general command form is: +```bash +python query_db.py show [-h] [-v] [--max-entries N] [--output ] +``` + +There are three mandatory arguments: + - ``, DensePose dataset specification, from which to select + the entries (e.g. `densepose_coco_2014_train`). + - ``, dataset entry selector which can be a single specification, + or a comma-separated list of specifications of the form + `field[:type]=value` for exact match with the value + or `field[:type]=min-max` for a range of values + - ``, visualizations specifier; currently available visualizations are: + * `bbox` - bounding boxes of annotated persons; + * `dp_i` - annotated points colored according to the containing part; + * `dp_pts` - annotated points in green color; + * `dp_segm` - segmentation masks for annotated persons; + * `dp_u` - annotated points colored according to their U coordinate in part parameterization; + * `dp_v` - annotated points colored according to their V coordinate in part parameterization; + +One can additionally provide one of the two optional arguments: + - `--max_entries` to limit the maximum number of entries to visualize + - `--output` to provide visualization file name template, which defaults + to `output.png`. To distinguish file names for different dataset + entries, the tool appends 1-based entry index to the output file name, + e.g. output.0001.png, output.0002.png, etc. + +The following examples show how to output different visualizations for image with `id = 322` +from `densepose_coco_2014_train` dataset: + +1. 
Show bounding box and segmentation: +```bash +python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_segm -v +``` +![Bounding Box + Segmentation Visualization](images/vis_bbox_dp_segm.jpg) + +2. Show bounding box and points colored according to the containing part: +```bash +python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_i -v +``` +![Bounding Box + Point Label Visualization](images/vis_bbox_dp_i.jpg) + +3. Show bounding box and annotated points in green color: +```bash +python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_segm -v +``` +![Bounding Box + Point Visualization](images/vis_bbox_dp_pts.jpg) + +4. Show bounding box and annotated points colored according to their U coordinate in part parameterization: +```bash +python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_u -v +``` +![Bounding Box + Point U Visualization](images/vis_bbox_dp_u.jpg) + +5. Show bounding box and annotated points colored according to their V coordinate in part parameterization: +```bash +python query_db.py show densepose_coco_2014_train image_id:int=322 bbox,dp_v -v +``` +![Bounding Box + Point V Visualization](images/vis_bbox_dp_v.jpg) + + diff --git a/projects/DensePose/doc/images/res_bbox_dp_contour.jpg b/projects/DensePose/doc/images/res_bbox_dp_contour.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8f0c195c237d8ca70e16f5827b8a3b6e456844a4 Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_contour.jpg differ diff --git a/projects/DensePose/doc/images/res_bbox_dp_segm.jpg b/projects/DensePose/doc/images/res_bbox_dp_segm.jpg new file mode 100644 index 0000000000000000000000000000000000000000..855fb7fe49956528eb3649379e4ca74210e8ee61 Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_segm.jpg differ diff --git a/projects/DensePose/doc/images/res_bbox_dp_u.jpg b/projects/DensePose/doc/images/res_bbox_dp_u.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fd4e77bb3e775f018ff590ec78c18e4e3b4bf82e Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_u.jpg differ diff --git a/projects/DensePose/doc/images/res_bbox_dp_v.jpg b/projects/DensePose/doc/images/res_bbox_dp_v.jpg new file mode 100644 index 0000000000000000000000000000000000000000..09a81973763efe102cb0511cc5116b50225a36a9 Binary files /dev/null and b/projects/DensePose/doc/images/res_bbox_dp_v.jpg differ diff --git a/projects/DensePose/doc/images/vis_bbox_dp_i.jpg b/projects/DensePose/doc/images/vis_bbox_dp_i.jpg new file mode 100644 index 0000000000000000000000000000000000000000..113dd84f460f6bfe0c7c2d9ab56e69af360206c4 Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_i.jpg differ diff --git a/projects/DensePose/doc/images/vis_bbox_dp_pts.jpg b/projects/DensePose/doc/images/vis_bbox_dp_pts.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1a81dae4a73c88dd20ccda83648ac7048ad36c95 Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_pts.jpg differ diff --git a/projects/DensePose/doc/images/vis_bbox_dp_segm.jpg b/projects/DensePose/doc/images/vis_bbox_dp_segm.jpg new file mode 100644 index 0000000000000000000000000000000000000000..b17f831724797da574a171b568225194e363fc38 Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_segm.jpg differ diff --git a/projects/DensePose/doc/images/vis_bbox_dp_u.jpg b/projects/DensePose/doc/images/vis_bbox_dp_u.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..e21be74331f910cdf2579e2e5cdaaba2f8a595a1 Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_u.jpg differ diff --git a/projects/DensePose/doc/images/vis_bbox_dp_v.jpg b/projects/DensePose/doc/images/vis_bbox_dp_v.jpg new file mode 100644 index 0000000000000000000000000000000000000000..7bcab2ccecdf3fbb8a29416f7da937a99fa9c87b Binary files /dev/null and b/projects/DensePose/doc/images/vis_bbox_dp_v.jpg differ diff --git a/projects/DensePose/query_db.py b/projects/DensePose/query_db.py new file mode 100755 index 0000000000000000000000000000000000000000..6d3ea2ffdff7559a8cd78df95a5fb7f308f33e1e --- /dev/null +++ b/projects/DensePose/query_db.py @@ -0,0 +1,250 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import argparse +import logging +import os +import sys +from timeit import default_timer as timer +from typing import Any, ClassVar, Dict, List +import torch +from fvcore.common.file_io import PathManager + +from detectron2.data.catalog import DatasetCatalog +from detectron2.utils.logger import setup_logger + +from densepose.data.structures import DensePoseDataRelative +from densepose.utils.dbhelper import EntrySelector +from densepose.utils.logger import verbosity_to_level +from densepose.vis.base import CompoundVisualizer +from densepose.vis.bounding_box import BoundingBoxVisualizer +from densepose.vis.densepose import ( + DensePoseDataCoarseSegmentationVisualizer, + DensePoseDataPointsIVisualizer, + DensePoseDataPointsUVisualizer, + DensePoseDataPointsVisualizer, + DensePoseDataPointsVVisualizer, +) + +DOC = """Query DB - a tool to print / visualize data from a database +""" + +LOGGER_NAME = "query_db" + +logger = logging.getLogger(LOGGER_NAME) + +_ACTION_REGISTRY: Dict[str, "Action"] = {} + + +class Action(object): + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + parser.add_argument( + "-v", + "--verbosity", + action="count", + help="Verbose mode. Multiple -v options increase the verbosity.", + ) + + +def register_action(cls: type): + """ + Decorator for action classes to automate action registration + """ + global _ACTION_REGISTRY + _ACTION_REGISTRY[cls.COMMAND] = cls + return cls + + +class EntrywiseAction(Action): + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + super(EntrywiseAction, cls).add_arguments(parser) + parser.add_argument( + "dataset", metavar="", help="Dataset name (e.g. densepose_coco_2014_train)" + ) + parser.add_argument( + "selector", + metavar="", + help="Dataset entry selector in the form field1[:type]=value1[," + "field2[:type]=value_min-value_max...] 
which selects all " + "entries from the dataset that satisfy the constraints", + ) + parser.add_argument( + "--max-entries", metavar="N", help="Maximum number of entries to process", type=int + ) + + @classmethod + def execute(cls: type, args: argparse.Namespace): + dataset = setup_dataset(args.dataset) + entry_selector = EntrySelector.from_string(args.selector) + context = cls.create_context(args) + if args.max_entries is not None: + for _, entry in zip(range(args.max_entries), dataset): + if entry_selector(entry): + cls.execute_on_entry(entry, context) + else: + for entry in dataset: + if entry_selector(entry): + cls.execute_on_entry(entry, context) + + @classmethod + def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]: + context = {} + return context + + +@register_action +class PrintAction(EntrywiseAction): + """ + Print action that outputs selected entries to stdout + """ + + COMMAND: ClassVar[str] = "print" + + @classmethod + def add_parser(cls: type, subparsers: argparse._SubParsersAction): + parser = subparsers.add_parser(cls.COMMAND, help="Output selected entries to stdout. ") + cls.add_arguments(parser) + parser.set_defaults(func=cls.execute) + + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + super(PrintAction, cls).add_arguments(parser) + + @classmethod + def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]): + import pprint + + printer = pprint.PrettyPrinter(indent=2, width=200, compact=True) + printer.pprint(entry) + + +@register_action +class ShowAction(EntrywiseAction): + """ + Show action that visualizes selected entries on an image + """ + + COMMAND: ClassVar[str] = "show" + VISUALIZERS: ClassVar[Dict[str, object]] = { + "dp_segm": DensePoseDataCoarseSegmentationVisualizer(), + "dp_i": DensePoseDataPointsIVisualizer(), + "dp_u": DensePoseDataPointsUVisualizer(), + "dp_v": DensePoseDataPointsVVisualizer(), + "dp_pts": DensePoseDataPointsVisualizer(), + "bbox": BoundingBoxVisualizer(), + } + + @classmethod + def add_parser(cls: type, subparsers: argparse._SubParsersAction): + parser = subparsers.add_parser(cls.COMMAND, help="Visualize selected entries") + cls.add_arguments(parser) + parser.set_defaults(func=cls.execute) + + @classmethod + def add_arguments(cls: type, parser: argparse.ArgumentParser): + super(ShowAction, cls).add_arguments(parser) + parser.add_argument( + "visualizations", + metavar="", + help="Comma separated list of visualizations, possible values: " + "[{}]".format(",".join(sorted(cls.VISUALIZERS.keys()))), + ) + parser.add_argument( + "--output", + metavar="", + default="output.png", + help="File name to save output to", + ) + + @classmethod + def execute_on_entry(cls: type, entry: Dict[str, Any], context: Dict[str, Any]): + import cv2 + import numpy as np + + image_fpath = PathManager.get_local_path(entry["file_name"]) + image = cv2.imread(image_fpath, cv2.IMREAD_GRAYSCALE) + image = np.tile(image[:, :, np.newaxis], [1, 1, 3]) + datas = cls._extract_data_for_visualizers_from_entry(context["vis_specs"], entry) + visualizer = context["visualizer"] + image_vis = visualizer.visualize(image, datas) + entry_idx = context["entry_idx"] + 1 + out_fname = cls._get_out_fname(entry_idx, context["out_fname"]) + cv2.imwrite(out_fname, image_vis) + logger.info(f"Output saved to {out_fname}") + context["entry_idx"] += 1 + + @classmethod + def _get_out_fname(cls: type, entry_idx: int, fname_base: str): + base, ext = os.path.splitext(fname_base) + return base + ".{0:04d}".format(entry_idx) + 
ext + + @classmethod + def create_context(cls: type, args: argparse.Namespace) -> Dict[str, Any]: + vis_specs = args.visualizations.split(",") + visualizers = [] + for vis_spec in vis_specs: + vis = cls.VISUALIZERS[vis_spec] + visualizers.append(vis) + context = { + "vis_specs": vis_specs, + "visualizer": CompoundVisualizer(visualizers), + "out_fname": args.output, + "entry_idx": 0, + } + return context + + @classmethod + def _extract_data_for_visualizers_from_entry( + cls: type, vis_specs: List[str], entry: Dict[str, Any] + ): + dp_list = [] + bbox_list = [] + for annotation in entry["annotations"]: + is_valid, _ = DensePoseDataRelative.validate_annotation(annotation) + if not is_valid: + continue + bbox = torch.as_tensor(annotation["bbox"]) + bbox_list.append(bbox) + dp_data = DensePoseDataRelative(annotation) + dp_list.append(dp_data) + datas = [] + for vis_spec in vis_specs: + datas.append(bbox_list if "bbox" == vis_spec else (bbox_list, dp_list)) + return datas + + +def setup_dataset(dataset_name): + logger.info("Loading dataset {}".format(dataset_name)) + start = timer() + dataset = DatasetCatalog.get(dataset_name) + stop = timer() + logger.info("Loaded dataset {} in {:.3f}s".format(dataset_name, stop - start)) + return dataset + + +def create_argument_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description=DOC, + formatter_class=lambda prog: argparse.HelpFormatter(prog, max_help_position=120), + ) + parser.set_defaults(func=lambda _: parser.print_help(sys.stdout)) + subparsers = parser.add_subparsers(title="Actions") + for _, action in _ACTION_REGISTRY.items(): + action.add_parser(subparsers) + return parser + + +def main(): + parser = create_argument_parser() + args = parser.parse_args() + verbosity = args.verbosity if hasattr(args, "verbosity") else None + global logger + logger = setup_logger(name=LOGGER_NAME) + logger.setLevel(verbosity_to_level(verbosity)) + args.func(args) + + +if __name__ == "__main__": + main() diff --git a/projects/DensePose/tests/common.py b/projects/DensePose/tests/common.py new file mode 100644 index 0000000000000000000000000000000000000000..13bf0dd3ca113e0756d3023e36272675c6b972f9 --- /dev/null +++ b/projects/DensePose/tests/common.py @@ -0,0 +1,110 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import os +import torch + +from detectron2.config import get_cfg +from detectron2.engine import default_setup +from detectron2.modeling import build_model + +from densepose import add_dataset_category_config, add_densepose_config + +_BASE_CONFIG_DIR = "configs" +_EVOLUTION_CONFIG_SUB_DIR = "evolution" +_QUICK_SCHEDULES_CONFIG_SUB_DIR = "quick_schedules" +_BASE_CONFIG_FILE_PREFIX = "Base-" +_CONFIG_FILE_EXT = ".yaml" + + +def _get_base_config_dir(): + """ + Return the base directory for configurations + """ + return os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", _BASE_CONFIG_DIR) + + +def _get_evolution_config_dir(): + """ + Return the base directory for evolution configurations + """ + return os.path.join(_get_base_config_dir(), _EVOLUTION_CONFIG_SUB_DIR) + + +def _get_quick_schedules_config_dir(): + """ + Return the base directory for quick schedules configurations + """ + return os.path.join(_get_base_config_dir(), _QUICK_SCHEDULES_CONFIG_SUB_DIR) + + +def _collect_config_files(config_dir): + """ + Collect all configuration files (i.e. 
densepose_*.yaml) directly in the specified directory + """ + start = _get_base_config_dir() + results = [] + for entry in os.listdir(config_dir): + path = os.path.join(config_dir, entry) + if not os.path.isfile(path): + continue + _, ext = os.path.splitext(entry) + if ext != _CONFIG_FILE_EXT: + continue + if entry.startswith(_BASE_CONFIG_FILE_PREFIX): + continue + config_file = os.path.relpath(path, start) + results.append(config_file) + return results + + +def get_config_files(): + """ + Get all the configuration files (relative to the base configuration directory) + """ + return _collect_config_files(_get_base_config_dir()) + + +def get_evolution_config_files(): + """ + Get all the evolution configuration files (relative to the base configuration directory) + """ + return _collect_config_files(_get_evolution_config_dir()) + + +def get_quick_schedules_config_files(): + """ + Get all the quick schedules configuration files (relative to the base configuration directory) + """ + return _collect_config_files(_get_quick_schedules_config_dir()) + + +def _get_model_config(config_file): + """ + Load and return the configuration from the specified file (relative to the base configuration + directory) + """ + cfg = get_cfg() + add_dataset_category_config(cfg) + add_densepose_config(cfg) + path = os.path.join(_get_base_config_dir(), config_file) + cfg.merge_from_file(path) + if not torch.cuda.is_available(): + cfg.MODEL_DEVICE = "cpu" + return cfg + + +def get_model(config_file): + """ + Get the model from the specified file (relative to the base configuration directory) + """ + cfg = _get_model_config(config_file) + return build_model(cfg) + + +def setup(config_file): + """ + Setup the configuration from the specified file (relative to the base configuration directory) + """ + cfg = _get_model_config(config_file) + cfg.freeze() + default_setup(cfg, {}) diff --git a/projects/DensePose/tests/test_model_e2e.py b/projects/DensePose/tests/test_model_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..eed131080547d84185c1d33913014a2c977b119f --- /dev/null +++ b/projects/DensePose/tests/test_model_e2e.py @@ -0,0 +1,43 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
+ +import unittest +import torch + +from detectron2.structures import BitMasks, Boxes, Instances + +from .common import get_model + + +# TODO(plabatut): Modularize detectron2 tests and re-use +def make_model_inputs(image, instances=None): + if instances is None: + return {"image": image} + + return {"image": image, "instances": instances} + + +def make_empty_instances(h, w): + instances = Instances((h, w)) + instances.gt_boxes = Boxes(torch.rand(0, 4)) + instances.gt_classes = torch.tensor([]).to(dtype=torch.int64) + instances.gt_masks = BitMasks(torch.rand(0, h, w)) + return instances + + +class ModelE2ETest(unittest.TestCase): + CONFIG_PATH = "" + + def setUp(self): + self.model = get_model(self.CONFIG_PATH) + + def _test_eval(self, sizes): + inputs = [make_model_inputs(torch.rand(3, size[0], size[1])) for size in sizes] + self.model.eval() + self.model(inputs) + + +class DensePoseRCNNE2ETest(ModelE2ETest): + CONFIG_PATH = "densepose_rcnn_R_101_FPN_s1x.yaml" + + def test_empty_data(self): + self._test_eval([(200, 250), (200, 249)]) diff --git a/projects/DensePose/tests/test_setup.py b/projects/DensePose/tests/test_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..96827f14b3a71d571c2109791233b5bcf7ef35f8 --- /dev/null +++ b/projects/DensePose/tests/test_setup.py @@ -0,0 +1,30 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import unittest + +from .common import ( + get_config_files, + get_evolution_config_files, + get_quick_schedules_config_files, + setup, +) + + +class TestSetup(unittest.TestCase): + def _test_setup(self, config_file): + setup(config_file) + + def test_setup_configs(self): + config_files = get_config_files() + for config_file in config_files: + self._test_setup(config_file) + + def test_setup_evolution_configs(self): + config_files = get_evolution_config_files() + for config_file in config_files: + self._test_setup(config_file) + + def test_setup_quick_schedules_configs(self): + config_files = get_quick_schedules_config_files() + for config_file in config_files: + self._test_setup(config_file) diff --git a/projects/DensePose/tests/test_structures.py b/projects/DensePose/tests/test_structures.py new file mode 100644 index 0000000000000000000000000000000000000000..ad97c23a43a9a72db566ec272b10f5bbda874695 --- /dev/null +++ b/projects/DensePose/tests/test_structures.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import unittest + +from densepose.data.structures import normalized_coords_transform + + +class TestStructures(unittest.TestCase): + def test_normalized_coords_transform(self): + bbox = (32, 24, 288, 216) + x0, y0, w, h = bbox + xmin, ymin, xmax, ymax = x0, y0, x0 + w, y0 + h + f = normalized_coords_transform(*bbox) + # Top-left + expected_p, actual_p = (-1, -1), f((xmin, ymin)) + self.assertEqual(expected_p, actual_p) + # Top-right + expected_p, actual_p = (1, -1), f((xmax, ymin)) + self.assertEqual(expected_p, actual_p) + # Bottom-left + expected_p, actual_p = (-1, 1), f((xmin, ymax)) + self.assertEqual(expected_p, actual_p) + # Bottom-right + expected_p, actual_p = (1, 1), f((xmax, ymax)) + self.assertEqual(expected_p, actual_p) diff --git a/projects/DensePose/train_net.py b/projects/DensePose/train_net.py new file mode 100755 index 0000000000000000000000000000000000000000..9d2e7bd8b92964f752620d92e7acb662c0b86fa7 --- /dev/null +++ b/projects/DensePose/train_net.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +""" +DensePose Training Script. + +This script is similar to the training script in detectron2/tools. + +It is an example of how a user might use detectron2 for a new project. +""" + +import logging +import os +from collections import OrderedDict +from fvcore.common.file_io import PathManager + +import detectron2.utils.comm as comm +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import CfgNode, get_cfg +from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch +from detectron2.evaluation import COCOEvaluator, DatasetEvaluators, verify_results +from detectron2.modeling import DatasetMapperTTA +from detectron2.utils.logger import setup_logger + +from densepose import ( + DensePoseCOCOEvaluator, + DensePoseGeneralizedRCNNWithTTA, + add_dataset_category_config, + add_densepose_config, + load_from_cfg, +) +from densepose.data import DatasetMapper, build_detection_test_loader, build_detection_train_loader + + +class Trainer(DefaultTrainer): + @classmethod + def build_evaluator(cls, cfg: CfgNode, dataset_name, output_folder=None): + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + evaluators = [COCOEvaluator(dataset_name, cfg, True, output_folder)] + if cfg.MODEL.DENSEPOSE_ON: + evaluators.append(DensePoseCOCOEvaluator(dataset_name, True, output_folder)) + return DatasetEvaluators(evaluators) + + @classmethod + def build_test_loader(cls, cfg: CfgNode, dataset_name): + return build_detection_test_loader(cfg, dataset_name, mapper=DatasetMapper(cfg, False)) + + @classmethod + def build_train_loader(cls, cfg: CfgNode): + return build_detection_train_loader(cfg, mapper=DatasetMapper(cfg, True)) + + @classmethod + def test_with_TTA(cls, cfg: CfgNode, model): + logger = logging.getLogger("detectron2.trainer") + # In the end of training, run an evaluation with TTA + # Only support some R-CNN models. 
+ logger.info("Running inference with test-time augmentation ...") + transform_data = load_from_cfg(cfg) + model = DensePoseGeneralizedRCNNWithTTA(cfg, model, transform_data, DatasetMapperTTA(cfg)) + evaluators = [ + cls.build_evaluator( + cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") + ) + for name in cfg.DATASETS.TEST + ] + res = cls.test(cfg, model, evaluators) + res = OrderedDict({k + "_TTA": v for k, v in res.items()}) + return res + + +def setup(args): + cfg = get_cfg() + add_dataset_category_config(cfg) + add_densepose_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + # Setup logger for "densepose" module + setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="densepose") + return cfg + + +def main(args): + cfg = setup(args) + # disable strict kwargs checking: allow one to specify path handle + # hints through kwargs, like timeout in DP evaluation + PathManager.set_strict_kwargs_checking(False) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + if cfg.TEST.AUG.ENABLED: + res.update(Trainer.test_with_TTA(cfg, model)) + if comm.is_main_process(): + verify_results(cfg, res) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + if cfg.TEST.AUG.ENABLED: + trainer.register_hooks( + [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] + ) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/projects/PointRend/README.md b/projects/PointRend/README.md new file mode 100644 index 0000000000000000000000000000000000000000..443736fff35cc49e02807a7b941da19c0bdfa666 --- /dev/null +++ b/projects/PointRend/README.md @@ -0,0 +1,135 @@ +# PointRend: Image Segmentation as Rendering + +Alexander Kirillov, Yuxin Wu, Kaiming He, Ross Girshick + +[[`arXiv`](https://arxiv.org/abs/1912.08193)] [[`BibTeX`](#CitingPointRend)] + +
+ +In this repository, we release code for PointRend in Detectron2. PointRend can be flexibly applied to both instance and semantic segmentation tasks by building on top of existing state-of-the-art models. + +## Installation +Install Detectron 2 following [INSTALL.md](https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md). You are ready to go! + +## Quick start and visualization + +This [Colab Notebook](https://colab.research.google.com/drive/1isGPL5h5_cKoPPhVL9XhMokRtHDvmMVL) tutorial contains examples of PointRend usage and visualizations of its point sampling stages. + +## Training + +To train a model with 8 GPUs run: +```bash +cd /path/to/detectron2/projects/PointRend +python train_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml --num-gpus 8 +``` + +## Evaluation + +Model evaluation can be done similarly: +```bash +cd /path/to/detectron2/projects/PointRend +python train_net.py --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml --eval-only MODEL.WEIGHTS /path/to/model_checkpoint +``` + +# Pretrained Models + +## Instance Segmentation +#### COCO + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
| Mask head | Backbone | lr sched | Output resolution | mask AP | mask AP* | model id | download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| PointRend | R50-FPN | 1× | 224×224 | 36.2 | 39.7 | 164254221 | model \| metrics |
| PointRend | R50-FPN | 3× | 224×224 | 38.3 | 41.6 | 164955410 | model \| metrics |
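
As a quick check of the COCO numbers above, the downloaded weights can be plugged into the evaluation command shown earlier in this README. This is a minimal sketch: the `MODEL.WEIGHTS` path is a placeholder for wherever the `model` file linked in the table (here, the 3× schedule entry) was saved, and it assumes the COCO dataset is already set up.

```bash
# Evaluate a downloaded PointRend COCO checkpoint (weights path is a placeholder).
cd /path/to/detectron2/projects/PointRend
python train_net.py \
  --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml \
  --eval-only MODEL.WEIGHTS /path/to/pointrend_r50_fpn_3x_coco/model_final.pkl
```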
+ +AP* is COCO mask AP evaluated against the higher-quality LVIS annotations; see the paper for details. Run `python detectron2/datasets/prepare_cocofied_lvis.py` to prepare GT files for AP* evaluation. Since LVIS annotations are not exhaustive `lvis-api` and not `cocoapi` should be used to evaluate AP*. + +#### Cityscapes +Cityscapes model is trained with ImageNet pretraining. + + + + + + + + + + + + + + + + + + + + +
| Mask head | Backbone | lr sched | Output resolution | mask AP | model id | download |
| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
| PointRend | R50-FPN | 1× | 224×224 | 35.9 | 164255101 | model \| metrics |
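
The corresponding config, `configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml`, is included in this project. As a sketch, training follows the same pattern as the COCO command in the Training section, assuming the Cityscapes dataset has already been set up for detectron2; the config uses `IMS_PER_BATCH: 8`, so 8 GPUs is the intended setup.

```bash
cd /path/to/detectron2/projects/PointRend
python train_net.py \
  --config-file configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml \
  --num-gpus 8
```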
+ + +## Semantic Segmentation + +#### Cityscapes +Cityscapes model is trained with ImageNet pretraining. + + + + + + + + + + + + + + + + + + +
| Method | Backbone | Output resolution | mIoU | model id | download |
| :---: | :---: | :---: | :---: | :---: | :---: |
| SemanticFPN + PointRend | R101-FPN | 1024×2048 | 78.6 | 186480235 | model \| metrics |
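
The config for this model, `configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml`, is also part of this project. A minimal sketch of training and evaluation with it follows; the checkpoint path is a placeholder, and since the config uses `IMS_PER_BATCH: 32`, the solver settings may need adjusting for smaller GPU setups.

```bash
cd /path/to/detectron2/projects/PointRend
# Training
python train_net.py \
  --config-file configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml \
  --num-gpus 8
# Evaluation with a trained or downloaded checkpoint (placeholder path)
python train_net.py \
  --config-file configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml \
  --eval-only MODEL.WEIGHTS /path/to/model_checkpoint
```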
+ +## Citing PointRend + +If you use PointRend, please use the following BibTeX entry. + +```BibTeX +@InProceedings{kirillov2019pointrend, + title={{PointRend}: Image Segmentation as Rendering}, + author={Alexander Kirillov and Yuxin Wu and Kaiming He and Ross Girshick}, + journal={ArXiv:1912.08193}, + year={2019} +} +``` diff --git a/projects/PointRend/configs/InstanceSegmentation/Base-PointRend-RCNN-FPN.yaml b/projects/PointRend/configs/InstanceSegmentation/Base-PointRend-RCNN-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d3917188afe04c7626e539f7c0bc28df4118a290 --- /dev/null +++ b/projects/PointRend/configs/InstanceSegmentation/Base-PointRend-RCNN-FPN.yaml @@ -0,0 +1,21 @@ +_BASE_: "../../../../configs/Base-RCNN-FPN.yaml" +MODEL: + ROI_HEADS: + NAME: "PointRendROIHeads" + IN_FEATURES: ["p2", "p3", "p4", "p5"] + ROI_BOX_HEAD: + TRAIN_ON_PRED_BOXES: True + ROI_MASK_HEAD: + NAME: "CoarseMaskHead" + FC_DIM: 1024 + NUM_FC: 2 + OUTPUT_SIDE_RESOLUTION: 7 + IN_FEATURES: ["p2"] + POINT_HEAD_ON: True + POINT_HEAD: + FC_DIM: 256 + NUM_FC: 3 + IN_FEATURES: ["p2"] +INPUT: + # PointRend for instance segmenation does not work with "polygon" mask_format. + MASK_FORMAT: "bitmask" diff --git a/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml b/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c23dbe1c8463d16f6be110ef49acd8c6142c3aa8 --- /dev/null +++ b/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_cityscapes.yaml @@ -0,0 +1,23 @@ +_BASE_: Base-PointRend-RCNN-FPN.yaml +MODEL: + WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl + MASK_ON: true + RESNETS: + DEPTH: 50 + ROI_HEADS: + NUM_CLASSES: 8 + POINT_HEAD: + NUM_CLASSES: 8 +DATASETS: + TEST: ("cityscapes_fine_instance_seg_val",) + TRAIN: ("cityscapes_fine_instance_seg_train",) +SOLVER: + BASE_LR: 0.01 + IMS_PER_BATCH: 8 + MAX_ITER: 24000 + STEPS: (18000,) +INPUT: + MAX_SIZE_TEST: 2048 + MAX_SIZE_TRAIN: 2048 + MIN_SIZE_TEST: 1024 + MIN_SIZE_TRAIN: (800, 832, 864, 896, 928, 960, 992, 1024) diff --git a/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml b/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e9fc573bf544de8610a65a7cda2a0df57aec0abf --- /dev/null +++ b/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_1x_coco.yaml @@ -0,0 +1,9 @@ +_BASE_: Base-PointRend-RCNN-FPN.yaml +MODEL: + WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl + MASK_ON: true + RESNETS: + DEPTH: 50 +# To add COCO AP evaluation against the higher-quality LVIS annotations. 
+# DATASETS: +# TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied") diff --git a/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml b/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2f013f32aeb4122f50c5c4030e9738d9d474ba34 --- /dev/null +++ b/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml @@ -0,0 +1,13 @@ +_BASE_: Base-PointRend-RCNN-FPN.yaml +MODEL: + WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl + MASK_ON: true + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 +# To add COCO AP evaluation against the higher-quality LVIS annotations. +# DATASETS: +# TEST: ("coco_2017_val", "lvis_v0.5_val_cocofied") + diff --git a/projects/PointRend/configs/SemanticSegmentation/Base-PointRend-Semantic-FPN.yaml b/projects/PointRend/configs/SemanticSegmentation/Base-PointRend-Semantic-FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..00562a92363dc47c6ebe9ef8bebb89cd5e5b8502 --- /dev/null +++ b/projects/PointRend/configs/SemanticSegmentation/Base-PointRend-Semantic-FPN.yaml @@ -0,0 +1,19 @@ +_BASE_: "../../../../configs/Base-RCNN-FPN.yaml" +MODEL: + META_ARCHITECTURE: "SemanticSegmentor" + BACKBONE: + FREEZE_AT: 0 + SEM_SEG_HEAD: + NAME: "PointRendSemSegHead" + POINT_HEAD: + NUM_CLASSES: 54 + FC_DIM: 256 + NUM_FC: 3 + IN_FEATURES: ["p2"] + TRAIN_NUM_POINTS: 1024 + SUBDIVISION_STEPS: 2 + SUBDIVISION_NUM_POINTS: 8192 + COARSE_SEM_SEG_HEAD_NAME: "SemSegFPNHead" +DATASETS: + TRAIN: ("coco_2017_train_panoptic_stuffonly",) + TEST: ("coco_2017_val_panoptic_stuffonly",) diff --git a/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml b/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4965b068c11bc568317ea3cc8c83d8c44234b936 --- /dev/null +++ b/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_101_FPN_1x_cityscapes.yaml @@ -0,0 +1,33 @@ +_BASE_: Base-PointRend-Semantic-FPN.yaml +MODEL: + WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-101.pkl + RESNETS: + DEPTH: 101 + SEM_SEG_HEAD: + NUM_CLASSES: 19 + POINT_HEAD: + NUM_CLASSES: 19 + TRAIN_NUM_POINTS: 2048 + SUBDIVISION_NUM_POINTS: 8192 +DATASETS: + TRAIN: ("cityscapes_fine_sem_seg_train",) + TEST: ("cityscapes_fine_sem_seg_val",) +SOLVER: + BASE_LR: 0.01 + STEPS: (40000, 55000) + MAX_ITER: 65000 + IMS_PER_BATCH: 32 +INPUT: + MIN_SIZE_TRAIN: (512, 768, 1024, 1280, 1536, 1792, 2048) + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 1024 + MAX_SIZE_TRAIN: 4096 + MAX_SIZE_TEST: 2048 + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: (512, 1024) + SINGLE_CATEGORY_MAX_AREA: 0.75 + COLOR_AUG_SSD: True +DATALOADER: + NUM_WORKERS: 16 diff --git a/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_50_FPN_1x_coco.yaml b/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_50_FPN_1x_coco.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7948bd808ea9888b20d1e118abf6bb630c485f39 --- /dev/null +++ b/projects/PointRend/configs/SemanticSegmentation/pointrend_semantic_R_50_FPN_1x_coco.yaml @@ -0,0 +1,5 @@ +_BASE_: Base-PointRend-Semantic-FPN.yaml +MODEL: + WEIGHTS: detectron2://ImageNetPretrained/MSRA/R-50.pkl + RESNETS: + DEPTH: 50 diff --git a/projects/PointRend/point_rend/__init__.py 
b/projects/PointRend/point_rend/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4020fe0a287f87cb3bd2487b5b40b7e1e2647aa8 --- /dev/null +++ b/projects/PointRend/point_rend/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .config import add_pointrend_config +from .coarse_mask_head import CoarseMaskHead +from .roi_heads import PointRendROIHeads +from .dataset_mapper import SemSegDatasetMapper +from .semantic_seg import PointRendSemSegHead diff --git a/projects/PointRend/point_rend/coarse_mask_head.py b/projects/PointRend/point_rend/coarse_mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..3f1cffb4c985dc3121a863eb7b378965b718a19d --- /dev/null +++ b/projects/PointRend/point_rend/coarse_mask_head.py @@ -0,0 +1,92 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.layers import Conv2d, ShapeSpec +from detectron2.modeling import ROI_MASK_HEAD_REGISTRY + + +@ROI_MASK_HEAD_REGISTRY.register() +class CoarseMaskHead(nn.Module): + """ + A mask head with fully connected layers. Given pooled features it first reduces channels and + spatial dimensions with conv layers and then uses FC layers to predict coarse masks analogously + to the standard box head. + """ + + def __init__(self, cfg, input_shape: ShapeSpec): + """ + The following attributes are parsed from config: + conv_dim: the output dimension of the conv layers + fc_dim: the feature dimenstion of the FC layers + num_fc: the number of FC layers + output_side_resolution: side resolution of the output square mask prediction + """ + super(CoarseMaskHead, self).__init__() + + # fmt: off + self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES + conv_dim = cfg.MODEL.ROI_MASK_HEAD.CONV_DIM + self.fc_dim = cfg.MODEL.ROI_MASK_HEAD.FC_DIM + num_fc = cfg.MODEL.ROI_MASK_HEAD.NUM_FC + self.output_side_resolution = cfg.MODEL.ROI_MASK_HEAD.OUTPUT_SIDE_RESOLUTION + self.input_channels = input_shape.channels + self.input_h = input_shape.height + self.input_w = input_shape.width + # fmt: on + + self.conv_layers = [] + if self.input_channels > conv_dim: + self.reduce_channel_dim_conv = Conv2d( + self.input_channels, + conv_dim, + kernel_size=1, + stride=1, + padding=0, + bias=True, + activation=F.relu, + ) + self.conv_layers.append(self.reduce_channel_dim_conv) + + self.reduce_spatial_dim_conv = Conv2d( + conv_dim, conv_dim, kernel_size=2, stride=2, padding=0, bias=True, activation=F.relu + ) + self.conv_layers.append(self.reduce_spatial_dim_conv) + + input_dim = conv_dim * self.input_h * self.input_w + input_dim //= 4 + + self.fcs = [] + for k in range(num_fc): + fc = nn.Linear(input_dim, self.fc_dim) + self.add_module("coarse_mask_fc{}".format(k + 1), fc) + self.fcs.append(fc) + input_dim = self.fc_dim + + output_dim = self.num_classes * self.output_side_resolution * self.output_side_resolution + + self.prediction = nn.Linear(self.fc_dim, output_dim) + # use normal distribution initialization for mask prediction layer + nn.init.normal_(self.prediction.weight, std=0.001) + nn.init.constant_(self.prediction.bias, 0) + + for layer in self.conv_layers: + weight_init.c2_msra_fill(layer) + for layer in self.fcs: + weight_init.c2_xavier_fill(layer) + + def forward(self, x): + # unlike BaseMaskRCNNHead, this head only outputs intermediate + # features, because the features will be used later by 
PointHead. + N = x.shape[0] + x = x.view(N, self.input_channels, self.input_h, self.input_w) + for layer in self.conv_layers: + x = layer(x) + x = torch.flatten(x, start_dim=1) + for layer in self.fcs: + x = F.relu(layer(x)) + return self.prediction(x).view( + N, self.num_classes, self.output_side_resolution, self.output_side_resolution + ) diff --git a/projects/PointRend/point_rend/color_augmentation.py b/projects/PointRend/point_rend/color_augmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..27344c470adac143186e61c8a5b0f39900937634 --- /dev/null +++ b/projects/PointRend/point_rend/color_augmentation.py @@ -0,0 +1,98 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +import random +import cv2 +from fvcore.transforms.transform import Transform + + +class ColorAugSSDTransform(Transform): + """ + A color related data augmentation used in Single Shot Multibox Detector (SSD). + + Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, + Scott Reed, Cheng-Yang Fu, Alexander C. Berg. + SSD: Single Shot MultiBox Detector. ECCV 2016. + + Implementation based on: + + https://github.com/weiliu89/caffe/blob + /4817bf8b4200b35ada8ed0dc378dceaf38c539e4 + /src/caffe/util/im_transforms.cpp + + https://github.com/chainer/chainercv/blob + /7159616642e0be7c5b3ef380b848e16b7e99355b/chainercv + /links/model/ssd/transforms.py + """ + + def __init__( + self, + img_format, + brightness_delta=32, + contrast_low=0.5, + contrast_high=1.5, + saturation_low=0.5, + saturation_high=1.5, + hue_delta=18, + ): + super().__init__() + assert img_format in ["BGR", "RGB"] + self.is_rgb = img_format == "RGB" + del img_format + self._set_attributes(locals()) + + def apply_coords(self, coords): + return coords + + def apply_segmentation(self, segmentation): + return segmentation + + def apply_image(self, img, interp=None): + if self.is_rgb: + img = img[:, :, [2, 1, 0]] + img = self.brightness(img) + if random.randrange(2): + img = self.contrast(img) + img = self.saturation(img) + img = self.hue(img) + else: + img = self.saturation(img) + img = self.hue(img) + img = self.contrast(img) + if self.is_rgb: + img = img[:, :, [2, 1, 0]] + return img + + def convert(self, img, alpha=1, beta=0): + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img): + if random.randrange(2): + return self.convert( + img, beta=random.uniform(-self.brightness_delta, self.brightness_delta) + ) + return img + + def contrast(self, img): + if random.randrange(2): + return self.convert(img, alpha=random.uniform(self.contrast_low, self.contrast_high)) + return img + + def saturation(self, img): + if random.randrange(2): + img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + img[:, :, 1] = self.convert( + img[:, :, 1], alpha=random.uniform(self.saturation_low, self.saturation_high) + ) + return cv2.cvtColor(img, cv2.COLOR_HSV2BGR) + return img + + def hue(self, img): + if random.randrange(2): + img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) + img[:, :, 0] = ( + img[:, :, 0].astype(int) + random.randint(-self.hue_delta, self.hue_delta) + ) % 180 + return cv2.cvtColor(img, cv2.COLOR_HSV2BGR) + return img diff --git a/projects/PointRend/point_rend/config.py b/projects/PointRend/point_rend/config.py new file mode 100644 index 0000000000000000000000000000000000000000..74f63672bba7cd25679054b19ff87254a0e24974 --- /dev/null +++ b/projects/PointRend/point_rend/config.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- +# 
Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from detectron2.config import CfgNode as CN + + +def add_pointrend_config(cfg): + """ + Add config for PointRend. + """ + # We retry random cropping until no single category in semantic segmentation GT occupies more + # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. + cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 + # Color augmentatition from SSD paper for semantic segmentation model during training. + cfg.INPUT.COLOR_AUG_SSD = False + + # Names of the input feature maps to be used by a coarse mask head. + cfg.MODEL.ROI_MASK_HEAD.IN_FEATURES = ("p2",) + cfg.MODEL.ROI_MASK_HEAD.FC_DIM = 1024 + cfg.MODEL.ROI_MASK_HEAD.NUM_FC = 2 + # The side size of a coarse mask head prediction. + cfg.MODEL.ROI_MASK_HEAD.OUTPUT_SIDE_RESOLUTION = 7 + # True if point head is used. + cfg.MODEL.ROI_MASK_HEAD.POINT_HEAD_ON = False + + cfg.MODEL.POINT_HEAD = CN() + cfg.MODEL.POINT_HEAD.NAME = "StandardPointHead" + cfg.MODEL.POINT_HEAD.NUM_CLASSES = 80 + # Names of the input feature maps to be used by a mask point head. + cfg.MODEL.POINT_HEAD.IN_FEATURES = ("p2",) + # Number of points sampled during training for a mask point head. + cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS = 14 * 14 + # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the + # original paper. + cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO = 3 + # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in + # the original paper. + cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO = 0.75 + # Number of subdivision steps during inference. + cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS = 5 + # Maximum number of points selected at each subdivision step (N). + cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS = 28 * 28 + cfg.MODEL.POINT_HEAD.FC_DIM = 256 + cfg.MODEL.POINT_HEAD.NUM_FC = 3 + cfg.MODEL.POINT_HEAD.CLS_AGNOSTIC_MASK = False + # If True, then coarse prediction features are used as inout for each layer in PointRend's MLP. + cfg.MODEL.POINT_HEAD.COARSE_PRED_EACH_LAYER = True + cfg.MODEL.POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME = "SemSegFPNHead" diff --git a/projects/PointRend/point_rend/dataset_mapper.py b/projects/PointRend/point_rend/dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..76b64ee79b679741d547c5d1ffca55ac756051ae --- /dev/null +++ b/projects/PointRend/point_rend/dataset_mapper.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import copy +import logging +import numpy as np +import torch +from fvcore.common.file_io import PathManager +from fvcore.transforms.transform import CropTransform +from PIL import Image + +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T + +from .color_augmentation import ColorAugSSDTransform + +""" +This file contains the mapping that's applied to "dataset dicts" for semantic segmentation models. +Unlike the default DatasetMapper this mapper uses cropping as the last transformation. +""" + +__all__ = ["SemSegDatasetMapper"] + + +class SemSegDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by semantic segmentation models. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. 
Prepare image and annotation to Tensors + """ + + def __init__(self, cfg, is_train=True): + if cfg.INPUT.CROP.ENABLED and is_train: + self.crop_gen = T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE) + logging.getLogger(__name__).info("CropGen used in training: " + str(self.crop_gen)) + else: + self.crop_gen = None + + self.tfm_gens = utils.build_transform_gen(cfg, is_train) + + if cfg.INPUT.COLOR_AUG_SSD: + self.tfm_gens.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) + logging.getLogger(__name__).info( + "Color augmnetation used in training: " + str(self.tfm_gens[-1]) + ) + + # fmt: off + self.img_format = cfg.INPUT.FORMAT + self.single_category_max_area = cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA + self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE + # fmt: on + + self.is_train = is_train + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + assert "sem_seg_file_name" in dataset_dict + + image, transforms = T.apply_transform_gens(self.tfm_gens, image) + if self.is_train: + with PathManager.open(dataset_dict.pop("sem_seg_file_name"), "rb") as f: + sem_seg_gt = Image.open(f) + sem_seg_gt = np.asarray(sem_seg_gt, dtype="uint8") + sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) + if self.crop_gen: + image, sem_seg_gt = crop_transform( + image, + sem_seg_gt, + self.crop_gen, + self.single_category_max_area, + self.ignore_value, + ) + dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + + if not self.is_train: + dataset_dict.pop("sem_seg_file_name", None) + return dataset_dict + + return dataset_dict + + +def crop_transform(image, sem_seg, crop_gen, single_category_max_area, ignore_value): + """ + Find a cropping window such that no single category occupies more than + `single_category_max_area` in `sem_seg`. The function retries random cropping 10 times max. + """ + if single_category_max_area >= 1.0: + crop_tfm = crop_gen.get_transform(image) + sem_seg_temp = crop_tfm.apply_segmentation(sem_seg) + else: + h, w = sem_seg.shape + crop_size = crop_gen.get_crop_size((h, w)) + for _ in range(10): + y0 = np.random.randint(h - crop_size[0] + 1) + x0 = np.random.randint(w - crop_size[1] + 1) + sem_seg_temp = sem_seg[y0 : y0 + crop_size[0], x0 : x0 + crop_size[1]] + labels, cnt = np.unique(sem_seg_temp, return_counts=True) + cnt = cnt[labels != ignore_value] + if len(cnt) > 1 and np.max(cnt) / np.sum(cnt) < single_category_max_area: + break + crop_tfm = CropTransform(x0, y0, crop_size[1], crop_size[0]) + image = crop_tfm.apply_image(image) + return image, sem_seg_temp diff --git a/projects/PointRend/point_rend/point_features.py b/projects/PointRend/point_rend/point_features.py new file mode 100644 index 0000000000000000000000000000000000000000..320a33de8505572eedcfa94d355bf2772ab75528 --- /dev/null +++ b/projects/PointRend/point_rend/point_features.py @@ -0,0 +1,216 @@ +# Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved +import torch +from torch.nn import functional as F + +from detectron2.layers import cat +from detectron2.structures import Boxes + + +""" +Shape shorthand in this module: + + N: minibatch dimension size, i.e. the number of RoIs for instance segmenation or the + number of images for semantic segmenation. + R: number of ROIs, combined over all images, in the minibatch + P: number of points +""" + + +def point_sample(input, point_coords, **kwargs): + """ + A wrapper around :function:`torch.nn.functional.grid_sample` to support 3D point_coords tensors. + Unlike :function:`torch.nn.functional.grid_sample` it assumes `point_coords` to lie inside + [0, 1] x [0, 1] square. + + Args: + input (Tensor): A tensor of shape (N, C, H, W) that contains features map on a H x W grid. + point_coords (Tensor): A tensor of shape (N, P, 2) or (N, Hgrid, Wgrid, 2) that contains + [0, 1] x [0, 1] normalized point coordinates. + + Returns: + output (Tensor): A tensor of shape (N, C, P) or (N, C, Hgrid, Wgrid) that contains + features for points in `point_coords`. The features are obtained via bilinear + interplation from `input` the same way as :function:`torch.nn.functional.grid_sample`. + """ + add_dim = False + if point_coords.dim() == 3: + add_dim = True + point_coords = point_coords.unsqueeze(2) + output = F.grid_sample(input, 2.0 * point_coords - 1.0, **kwargs) + if add_dim: + output = output.squeeze(3) + return output + + +def generate_regular_grid_point_coords(R, side_size, device): + """ + Generate regular square grid of points in [0, 1] x [0, 1] coordinate space. + + Args: + R (int): The number of grids to sample, one for each region. + side_size (int): The side size of the regular grid. + device (torch.device): Desired device of returned tensor. + + Returns: + (Tensor): A tensor of shape (R, side_size^2, 2) that contains coordinates + for the regular grids. + """ + aff = torch.tensor([[[0.5, 0, 0.5], [0, 0.5, 0.5]]], device=device) + r = F.affine_grid(aff, torch.Size((1, 1, side_size, side_size)), align_corners=False) + return r.view(1, -1, 2).expand(R, -1, -1) + + +def get_uncertain_point_coords_with_randomness( + coarse_logits, uncertainty_func, num_points, oversample_ratio, importance_sample_ratio +): + """ + Sample points in [0, 1] x [0, 1] coordinate space based on their uncertainty. The unceratinties + are calculated for each point using 'uncertainty_func' function that takes point's logit + prediction as input. + See PointRend paper for details. + + Args: + coarse_logits (Tensor): A tensor of shape (N, C, Hmask, Wmask) or (N, 1, Hmask, Wmask) for + class-specific or class-agnostic prediction. + uncertainty_func: A function that takes a Tensor of shape (N, C, P) or (N, 1, P) that + contains logit predictions for P points and returns their uncertainties as a Tensor of + shape (N, 1, P). + num_points (int): The number of points P to sample. + oversample_ratio (int): Oversampling parameter. + importance_sample_ratio (float): Ratio of points that are sampled via importnace sampling. + + Returns: + point_coords (Tensor): A tensor of shape (N, P, 2) that contains the coordinates of P + sampled points. 
+ """ + assert oversample_ratio >= 1 + assert importance_sample_ratio <= 1 and importance_sample_ratio >= 0 + num_boxes = coarse_logits.shape[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = torch.rand(num_boxes, num_sampled, 2, device=coarse_logits.device) + point_logits = point_sample(coarse_logits, point_coords, align_corners=False) + # It is crucial to calculate uncertainty based on the sampled prediction value for the points. + # Calculating uncertainties of the coarse predictions first and sampling them for points leads + # to incorrect results. + # To illustrate this: assume uncertainty_func(logits)=-abs(logits), a sampled point between + # two coarse predictions with -1 and 1 logits has 0 logits, and therefore 0 uncertainty value. + # However, if we calculate uncertainties for the coarse predictions first, + # both will have -1 uncertainty, and the sampled point will get -1 uncertainty. + point_uncertainties = uncertainty_func(point_logits) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk(point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange(num_boxes, dtype=torch.long, device=coarse_logits.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + num_boxes, num_uncertain_points, 2 + ) + if num_random_points > 0: + point_coords = cat( + [ + point_coords, + torch.rand(num_boxes, num_random_points, 2, device=coarse_logits.device), + ], + dim=1, + ) + return point_coords + + +def get_uncertain_point_coords_on_grid(uncertainty_map, num_points): + """ + Find `num_points` most uncertain points from `uncertainty_map` grid. + + Args: + uncertainty_map (Tensor): A tensor of shape (N, 1, H, W) that contains uncertainty + values for a set of points on a regular H x W grid. + num_points (int): The number of points P to select. + + Returns: + point_indices (Tensor): A tensor of shape (N, P) that contains indices from + [0, H x W) of the most uncertain points. + point_coords (Tensor): A tensor of shape (N, P, 2) that contains [0, 1] x [0, 1] normalized + coordinates of the most uncertain points from the H x W grid. + """ + R, _, H, W = uncertainty_map.shape + h_step = 1.0 / float(H) + w_step = 1.0 / float(W) + + num_points = min(H * W, num_points) + point_indices = torch.topk(uncertainty_map.view(R, H * W), k=num_points, dim=1)[1] + point_coords = torch.zeros(R, num_points, 2, dtype=torch.float, device=uncertainty_map.device) + point_coords[:, :, 0] = w_step / 2.0 + (point_indices % W).to(torch.float) * w_step + point_coords[:, :, 1] = h_step / 2.0 + (point_indices // W).to(torch.float) * h_step + return point_indices, point_coords + + +def point_sample_fine_grained_features(features_list, feature_scales, boxes, point_coords): + """ + Get features from feature maps in `features_list` that correspond to specific point coordinates + inside each bounding box from `boxes`. + + Args: + features_list (list[Tensor]): A list of feature map tensors to get features from. + feature_scales (list[float]): A list of scales for tensors in `features_list`. + boxes (list[Boxes]): A list of I Boxes objects that contain R_1 + ... + R_I = R boxes all + together. + point_coords (Tensor): A tensor of shape (R, P, 2) that contains + [0, 1] x [0, 1] box-normalized coordinates of the P sampled points. 
+ + Returns: + point_features (Tensor): A tensor of shape (R, C, P) that contains features sampled + from all features maps in feature_list for P sampled points for all R boxes in `boxes`. + point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains image-level + coordinates of P points. + """ + cat_boxes = Boxes.cat(boxes) + num_boxes = [len(b) for b in boxes] + + point_coords_wrt_image = get_point_coords_wrt_image(cat_boxes.tensor, point_coords) + split_point_coords_wrt_image = torch.split(point_coords_wrt_image, num_boxes) + + point_features = [] + for idx_img, point_coords_wrt_image_per_image in enumerate(split_point_coords_wrt_image): + point_features_per_image = [] + for idx_feature, feature_map in enumerate(features_list): + h, w = feature_map.shape[-2:] + scale = torch.tensor([w, h], device=feature_map.device) / feature_scales[idx_feature] + point_coords_scaled = point_coords_wrt_image_per_image / scale + point_features_per_image.append( + point_sample( + feature_map[idx_img].unsqueeze(0), + point_coords_scaled.unsqueeze(0), + align_corners=False, + ) + .squeeze(0) + .transpose(1, 0) + ) + point_features.append(cat(point_features_per_image, dim=1)) + + return cat(point_features, dim=0), point_coords_wrt_image + + +def get_point_coords_wrt_image(boxes_coords, point_coords): + """ + Convert box-normalized [0, 1] x [0, 1] point cooordinates to image-level coordinates. + + Args: + boxes_coords (Tensor): A tensor of shape (R, 4) that contains bounding boxes. + coordinates. + point_coords (Tensor): A tensor of shape (R, P, 2) that contains + [0, 1] x [0, 1] box-normalized coordinates of the P sampled points. + + Returns: + point_coords_wrt_image (Tensor): A tensor of shape (R, P, 2) that contains + image-normalized coordinates of P sampled points. + """ + with torch.no_grad(): + point_coords_wrt_image = point_coords.clone() + point_coords_wrt_image[:, :, 0] = point_coords_wrt_image[:, :, 0] * ( + boxes_coords[:, None, 2] - boxes_coords[:, None, 0] + ) + point_coords_wrt_image[:, :, 1] = point_coords_wrt_image[:, :, 1] * ( + boxes_coords[:, None, 3] - boxes_coords[:, None, 1] + ) + point_coords_wrt_image[:, :, 0] += boxes_coords[:, None, 0] + point_coords_wrt_image[:, :, 1] += boxes_coords[:, None, 1] + return point_coords_wrt_image diff --git a/projects/PointRend/point_rend/point_head.py b/projects/PointRend/point_rend/point_head.py new file mode 100644 index 0000000000000000000000000000000000000000..6f35baea064fbee14d9bcd0b57e354f82bf54a8c --- /dev/null +++ b/projects/PointRend/point_rend/point_head.py @@ -0,0 +1,154 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.layers import ShapeSpec, cat +from detectron2.structures import BitMasks +from detectron2.utils.events import get_event_storage +from detectron2.utils.registry import Registry + +from .point_features import point_sample + +POINT_HEAD_REGISTRY = Registry("POINT_HEAD") +POINT_HEAD_REGISTRY.__doc__ = """ +Registry for point heads, which makes prediction for a given set of per-point features. + +The registered object will be called with `obj(cfg, input_shape)`. +""" + + +def roi_mask_point_loss(mask_logits, instances, points_coord): + """ + Compute the point-based loss for instance segmentation mask predictions. 
+ + Args: + mask_logits (Tensor): A tensor of shape (R, C, P) or (R, 1, P) for class-specific or + class-agnostic, where R is the total number of predicted masks in all images, C is the + number of foreground classes, and P is the number of points sampled for each mask. + The values are logits. + instances (list[Instances]): A list of N Instances, where N is the number of images + in the batch. These instances are in 1:1 correspondence with the `mask_logits`. So, i_th + elememt of the list contains R_i objects and R_1 + ... + R_N is equal to R. + The ground-truth labels (class, box, mask, ...) associated with each instance are stored + in fields. + points_coords (Tensor): A tensor of shape (R, P, 2), where R is the total number of + predicted masks and P is the number of points for each mask. The coordinates are in + the image pixel coordinate space, i.e. [0, H] x [0, W]. + Returns: + point_loss (Tensor): A scalar tensor containing the loss. + """ + assert len(instances) == 0 or isinstance( + instances[0].gt_masks, BitMasks + ), "Point head works with GT in 'bitmask' format only. Set INPUT.MASK_FORMAT to 'bitmask'." + with torch.no_grad(): + cls_agnostic_mask = mask_logits.size(1) == 1 + total_num_masks = mask_logits.size(0) + + gt_classes = [] + gt_mask_logits = [] + idx = 0 + for instances_per_image in instances: + if not cls_agnostic_mask: + gt_classes_per_image = instances_per_image.gt_classes.to(dtype=torch.int64) + gt_classes.append(gt_classes_per_image) + + gt_bit_masks = instances_per_image.gt_masks.tensor + h, w = instances_per_image.gt_masks.image_size + scale = torch.tensor([w, h], dtype=torch.float, device=gt_bit_masks.device) + points_coord_grid_sample_format = ( + points_coord[idx : idx + len(instances_per_image)] / scale + ) + idx += len(instances_per_image) + gt_mask_logits.append( + point_sample( + gt_bit_masks.to(torch.float32).unsqueeze(1), + points_coord_grid_sample_format, + align_corners=False, + ).squeeze(1) + ) + gt_mask_logits = cat(gt_mask_logits) + + # torch.mean (in binary_cross_entropy_with_logits) doesn't + # accept empty tensors, so handle it separately + if gt_mask_logits.numel() == 0: + return mask_logits.sum() * 0 + + if cls_agnostic_mask: + mask_logits = mask_logits[:, 0] + else: + indices = torch.arange(total_num_masks) + gt_classes = cat(gt_classes, dim=0) + mask_logits = mask_logits[indices, gt_classes] + + # Log the training accuracy (using gt classes and 0.0 threshold for the logits) + mask_accurate = (mask_logits > 0.0) == gt_mask_logits.to(dtype=torch.uint8) + mask_accuracy = mask_accurate.nonzero().size(0) / mask_accurate.numel() + get_event_storage().put_scalar("point_rend/accuracy", mask_accuracy) + + point_loss = F.binary_cross_entropy_with_logits( + mask_logits, gt_mask_logits.to(dtype=torch.float32), reduction="mean" + ) + return point_loss + + +@POINT_HEAD_REGISTRY.register() +class StandardPointHead(nn.Module): + """ + A point head multi-layer perceptron which we model with conv1d layers with kernel 1. The head + takes both fine-grained and coarse prediction features as its input. 
+ """ + + def __init__(self, cfg, input_shape: ShapeSpec): + """ + The following attributes are parsed from config: + fc_dim: the output dimension of each FC layers + num_fc: the number of FC layers + coarse_pred_each_layer: if True, coarse prediction features are concatenated to each + layer's input + """ + super(StandardPointHead, self).__init__() + # fmt: off + num_classes = cfg.MODEL.POINT_HEAD.NUM_CLASSES + fc_dim = cfg.MODEL.POINT_HEAD.FC_DIM + num_fc = cfg.MODEL.POINT_HEAD.NUM_FC + cls_agnostic_mask = cfg.MODEL.POINT_HEAD.CLS_AGNOSTIC_MASK + self.coarse_pred_each_layer = cfg.MODEL.POINT_HEAD.COARSE_PRED_EACH_LAYER + input_channels = input_shape.channels + # fmt: on + + fc_dim_in = input_channels + num_classes + self.fc_layers = [] + for k in range(num_fc): + fc = nn.Conv1d(fc_dim_in, fc_dim, kernel_size=1, stride=1, padding=0, bias=True) + self.add_module("fc{}".format(k + 1), fc) + self.fc_layers.append(fc) + fc_dim_in = fc_dim + fc_dim_in += num_classes if self.coarse_pred_each_layer else 0 + + num_mask_classes = 1 if cls_agnostic_mask else num_classes + self.predictor = nn.Conv1d(fc_dim_in, num_mask_classes, kernel_size=1, stride=1, padding=0) + + for layer in self.fc_layers: + weight_init.c2_msra_fill(layer) + # use normal distribution initialization for mask prediction layer + nn.init.normal_(self.predictor.weight, std=0.001) + if self.predictor.bias is not None: + nn.init.constant_(self.predictor.bias, 0) + + def forward(self, fine_grained_features, coarse_features): + x = torch.cat((fine_grained_features, coarse_features), dim=1) + for layer in self.fc_layers: + x = F.relu(layer(x)) + if self.coarse_pred_each_layer: + x = cat((x, coarse_features), dim=1) + return self.predictor(x) + + +def build_point_head(cfg, input_channels): + """ + Build a point head defined by `cfg.MODEL.POINT_HEAD.NAME`. + """ + head_name = cfg.MODEL.POINT_HEAD.NAME + return POINT_HEAD_REGISTRY.get(head_name)(cfg, input_channels) diff --git a/projects/PointRend/point_rend/roi_heads.py b/projects/PointRend/point_rend/roi_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..4f7225bf10544461bbe1e3c777863557f2ad5808 --- /dev/null +++ b/projects/PointRend/point_rend/roi_heads.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +import torch + +from detectron2.layers import ShapeSpec, cat, interpolate +from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads +from detectron2.modeling.roi_heads.mask_head import ( + build_mask_head, + mask_rcnn_inference, + mask_rcnn_loss, +) +from detectron2.modeling.roi_heads.roi_heads import select_foreground_proposals + +from .point_features import ( + generate_regular_grid_point_coords, + get_uncertain_point_coords_on_grid, + get_uncertain_point_coords_with_randomness, + point_sample, + point_sample_fine_grained_features, +) +from .point_head import build_point_head, roi_mask_point_loss + + +def calculate_uncertainty(logits, classes): + """ + We estimate uncerainty as L1 distance between 0.0 and the logit prediction in 'logits' for the + foreground class in `classes`. + + Args: + logits (Tensor): A tensor of shape (R, C, ...) or (R, 1, ...) for class-specific or + class-agnostic, where R is the total number of predicted masks in all images and C is + the number of foreground classes. The values are logits. + classes (list): A list of length R that contains either predicted of ground truth class + for eash predicted mask. 
+ + Returns: + scores (Tensor): A tensor of shape (R, 1, ...) that contains uncertainty scores with + the most uncertain locations having the highest uncertainty score. + """ + if logits.shape[1] == 1: + gt_class_logits = logits.clone() + else: + gt_class_logits = logits[ + torch.arange(logits.shape[0], device=logits.device), classes + ].unsqueeze(1) + return -(torch.abs(gt_class_logits)) + + +@ROI_HEADS_REGISTRY.register() +class PointRendROIHeads(StandardROIHeads): + """ + The RoI heads class for PointRend instance segmentation models. + + In this class we redefine the mask head of `StandardROIHeads` leaving all other heads intact. + To avoid namespace conflict with other heads we use names starting from `mask_` for all + variables that correspond to the mask head in the class's namespace. + """ + + def __init__(self, cfg, input_shape): + # TODO use explicit args style + super().__init__(cfg, input_shape) + self._init_mask_head(cfg, input_shape) + + def _init_mask_head(self, cfg, input_shape): + # fmt: off + self.mask_on = cfg.MODEL.MASK_ON + if not self.mask_on: + return + self.mask_coarse_in_features = cfg.MODEL.ROI_MASK_HEAD.IN_FEATURES + self.mask_coarse_side_size = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION + self._feature_scales = {k: 1.0 / v.stride for k, v in input_shape.items()} + # fmt: on + + in_channels = np.sum([input_shape[f].channels for f in self.mask_coarse_in_features]) + self.mask_coarse_head = build_mask_head( + cfg, + ShapeSpec( + channels=in_channels, + width=self.mask_coarse_side_size, + height=self.mask_coarse_side_size, + ), + ) + self._init_point_head(cfg, input_shape) + + def _init_point_head(self, cfg, input_shape): + # fmt: off + self.mask_point_on = cfg.MODEL.ROI_MASK_HEAD.POINT_HEAD_ON + if not self.mask_point_on: + return + assert cfg.MODEL.ROI_HEADS.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES + self.mask_point_in_features = cfg.MODEL.POINT_HEAD.IN_FEATURES + self.mask_point_train_num_points = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS + self.mask_point_oversample_ratio = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO + self.mask_point_importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO + # next two parameters are use in the adaptive subdivions inference procedure + self.mask_point_subdivision_steps = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS + self.mask_point_subdivision_num_points = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS + # fmt: on + + in_channels = np.sum([input_shape[f].channels for f in self.mask_point_in_features]) + self.mask_point_head = build_point_head( + cfg, ShapeSpec(channels=in_channels, width=1, height=1) + ) + + def _forward_mask(self, features, instances): + """ + Forward logic of the mask prediction branch. + + Args: + features (dict[str, Tensor]): #level input features for mask prediction + instances (list[Instances]): the per-image instances to train/predict masks. + In training, they can be the proposals. + In inference, they can be the predicted boxes. + + Returns: + In training, a dict of losses. + In inference, update `instances` with new fields "pred_masks" and return it. 
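+
+        Note: the mask branch here is two-stage. `_forward_mask_coarse` predicts a
+        coarse mask over a regular grid of points, and `_forward_mask_point` refines
+        it with the point head (on sampled uncertain points during training, by
+        adaptive subdivision during inference).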
+ """ + if not self.mask_on: + return {} if self.training else instances + + if self.training: + proposals, _ = select_foreground_proposals(instances, self.num_classes) + proposal_boxes = [x.proposal_boxes for x in proposals] + mask_coarse_logits = self._forward_mask_coarse(features, proposal_boxes) + + losses = {"loss_mask": mask_rcnn_loss(mask_coarse_logits, proposals)} + losses.update(self._forward_mask_point(features, mask_coarse_logits, proposals)) + return losses + else: + pred_boxes = [x.pred_boxes for x in instances] + mask_coarse_logits = self._forward_mask_coarse(features, pred_boxes) + + mask_logits = self._forward_mask_point(features, mask_coarse_logits, instances) + mask_rcnn_inference(mask_logits, instances) + return instances + + def _forward_mask_coarse(self, features, boxes): + """ + Forward logic of the coarse mask head. + """ + point_coords = generate_regular_grid_point_coords( + np.sum(len(x) for x in boxes), self.mask_coarse_side_size, boxes[0].device + ) + mask_coarse_features_list = [features[k] for k in self.mask_coarse_in_features] + features_scales = [self._feature_scales[k] for k in self.mask_coarse_in_features] + # For regular grids of points, this function is equivalent to `len(features_list)' calls + # of `ROIAlign` (with `SAMPLING_RATIO=2`), and concat the results. + mask_features, _ = point_sample_fine_grained_features( + mask_coarse_features_list, features_scales, boxes, point_coords + ) + return self.mask_coarse_head(mask_features) + + def _forward_mask_point(self, features, mask_coarse_logits, instances): + """ + Forward logic of the mask point head. + """ + if not self.mask_point_on: + return {} if self.training else mask_coarse_logits + + mask_features_list = [features[k] for k in self.mask_point_in_features] + features_scales = [self._feature_scales[k] for k in self.mask_point_in_features] + + if self.training: + proposal_boxes = [x.proposal_boxes for x in instances] + gt_classes = cat([x.gt_classes for x in instances]) + with torch.no_grad(): + point_coords = get_uncertain_point_coords_with_randomness( + mask_coarse_logits, + lambda logits: calculate_uncertainty(logits, gt_classes), + self.mask_point_train_num_points, + self.mask_point_oversample_ratio, + self.mask_point_importance_sample_ratio, + ) + + fine_grained_features, point_coords_wrt_image = point_sample_fine_grained_features( + mask_features_list, features_scales, proposal_boxes, point_coords + ) + coarse_features = point_sample(mask_coarse_logits, point_coords, align_corners=False) + point_logits = self.mask_point_head(fine_grained_features, coarse_features) + return { + "loss_mask_point": roi_mask_point_loss( + point_logits, instances, point_coords_wrt_image + ) + } + else: + pred_boxes = [x.pred_boxes for x in instances] + pred_classes = cat([x.pred_classes for x in instances]) + # The subdivision code will fail with the empty list of boxes + if len(pred_classes) == 0: + return mask_coarse_logits + + mask_logits = mask_coarse_logits.clone() + for subdivions_step in range(self.mask_point_subdivision_steps): + mask_logits = interpolate( + mask_logits, scale_factor=2, mode="bilinear", align_corners=False + ) + # If `mask_point_subdivision_num_points` is larger or equal to the + # resolution of the next step, then we can skip this step + H, W = mask_logits.shape[-2:] + if ( + self.mask_point_subdivision_num_points >= 4 * H * W + and subdivions_step < self.mask_point_subdivision_steps - 1 + ): + continue + uncertainty_map = calculate_uncertainty(mask_logits, pred_classes) + point_indices, 
point_coords = get_uncertain_point_coords_on_grid( + uncertainty_map, self.mask_point_subdivision_num_points + ) + fine_grained_features, _ = point_sample_fine_grained_features( + mask_features_list, features_scales, pred_boxes, point_coords + ) + coarse_features = point_sample( + mask_coarse_logits, point_coords, align_corners=False + ) + point_logits = self.mask_point_head(fine_grained_features, coarse_features) + + # put mask point predictions to the right places on the upsampled grid. + R, C, H, W = mask_logits.shape + point_indices = point_indices.unsqueeze(1).expand(-1, C, -1) + mask_logits = ( + mask_logits.reshape(R, C, H * W) + .scatter_(2, point_indices, point_logits) + .view(R, C, H, W) + ) + return mask_logits diff --git a/projects/PointRend/point_rend/semantic_seg.py b/projects/PointRend/point_rend/semantic_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..670a0ea201a6de82f3126171e6320d56f65e1ba7 --- /dev/null +++ b/projects/PointRend/point_rend/semantic_seg.py @@ -0,0 +1,134 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import numpy as np +from typing import Dict +import torch +from torch import nn +from torch.nn import functional as F + +from detectron2.layers import ShapeSpec, cat +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from .point_features import ( + get_uncertain_point_coords_on_grid, + get_uncertain_point_coords_with_randomness, + point_sample, +) +from .point_head import build_point_head + + +def calculate_uncertainty(sem_seg_logits): + """ + For each location of the prediction `sem_seg_logits` we estimate uncerainty as the + difference between top first and top second predicted logits. + + Args: + mask_logits (Tensor): A tensor of shape (N, C, ...), where N is the minibatch size and + C is the number of foreground classes. The values are logits. + + Returns: + scores (Tensor): A tensor of shape (N, 1, ...) that contains uncertainty scores with + the most uncertain locations having the highest uncertainty score. + """ + top2_scores = torch.topk(sem_seg_logits, k=2, dim=1)[0] + return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1) + + +@SEM_SEG_HEADS_REGISTRY.register() +class PointRendSemSegHead(nn.Module): + """ + A semantic segmentation head that combines a head set in `POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME` + and a point head set in `MODEL.POINT_HEAD.NAME`. 
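+
+    In training this head returns (None, losses): the coarse head's own losses plus a
+    point-level cross entropy logged as "loss_sem_seg_point". In inference it returns
+    (sem_seg_logits, {}), where the coarse prediction is repeatedly upsampled 2x and
+    only the most uncertain points are re-predicted by the point head (see `forward`).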
+ """ + + def __init__(self, cfg, input_shape: Dict[str, ShapeSpec]): + super().__init__() + + self.ignore_value = cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE + + self.coarse_sem_seg_head = SEM_SEG_HEADS_REGISTRY.get( + cfg.MODEL.POINT_HEAD.COARSE_SEM_SEG_HEAD_NAME + )(cfg, input_shape) + self._init_point_head(cfg, input_shape) + + def _init_point_head(self, cfg, input_shape: Dict[str, ShapeSpec]): + # fmt: off + assert cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES == cfg.MODEL.POINT_HEAD.NUM_CLASSES + feature_channels = {k: v.channels for k, v in input_shape.items()} + self.in_features = cfg.MODEL.POINT_HEAD.IN_FEATURES + self.train_num_points = cfg.MODEL.POINT_HEAD.TRAIN_NUM_POINTS + self.oversample_ratio = cfg.MODEL.POINT_HEAD.OVERSAMPLE_RATIO + self.importance_sample_ratio = cfg.MODEL.POINT_HEAD.IMPORTANCE_SAMPLE_RATIO + self.subdivision_steps = cfg.MODEL.POINT_HEAD.SUBDIVISION_STEPS + self.subdivision_num_points = cfg.MODEL.POINT_HEAD.SUBDIVISION_NUM_POINTS + # fmt: on + + in_channels = np.sum([feature_channels[f] for f in self.in_features]) + self.point_head = build_point_head(cfg, ShapeSpec(channels=in_channels, width=1, height=1)) + + def forward(self, features, targets=None): + coarse_sem_seg_logits = self.coarse_sem_seg_head.layers(features) + + if self.training: + losses = self.coarse_sem_seg_head.losses(coarse_sem_seg_logits, targets) + + with torch.no_grad(): + point_coords = get_uncertain_point_coords_with_randomness( + coarse_sem_seg_logits, + calculate_uncertainty, + self.train_num_points, + self.oversample_ratio, + self.importance_sample_ratio, + ) + coarse_features = point_sample(coarse_sem_seg_logits, point_coords, align_corners=False) + + fine_grained_features = cat( + [ + point_sample(features[in_feature], point_coords, align_corners=False) + for in_feature in self.in_features + ] + ) + point_logits = self.point_head(fine_grained_features, coarse_features) + point_targets = ( + point_sample( + targets.unsqueeze(1).to(torch.float), + point_coords, + mode="nearest", + align_corners=False, + ) + .squeeze(1) + .to(torch.long) + ) + losses["loss_sem_seg_point"] = F.cross_entropy( + point_logits, point_targets, reduction="mean", ignore_index=self.ignore_value + ) + return None, losses + else: + sem_seg_logits = coarse_sem_seg_logits.clone() + for _ in range(self.subdivision_steps): + sem_seg_logits = F.interpolate( + sem_seg_logits, scale_factor=2, mode="bilinear", align_corners=False + ) + uncertainty_map = calculate_uncertainty(sem_seg_logits) + point_indices, point_coords = get_uncertain_point_coords_on_grid( + uncertainty_map, self.subdivision_num_points + ) + fine_grained_features = cat( + [ + point_sample(features[in_feature], point_coords, align_corners=False) + for in_feature in self.in_features + ] + ) + coarse_features = point_sample( + coarse_sem_seg_logits, point_coords, align_corners=False + ) + point_logits = self.point_head(fine_grained_features, coarse_features) + + # put sem seg point predictions to the right places on the upsampled grid. 
+ N, C, H, W = sem_seg_logits.shape + point_indices = point_indices.unsqueeze(1).expand(-1, C, -1) + sem_seg_logits = ( + sem_seg_logits.reshape(N, C, H * W) + .scatter_(2, point_indices, point_logits) + .view(N, C, H, W) + ) + return sem_seg_logits, {} diff --git a/projects/PointRend/train_net.py b/projects/PointRend/train_net.py new file mode 100755 index 0000000000000000000000000000000000000000..7832867ec668c5715c4124c02b72909a318836e8 --- /dev/null +++ b/projects/PointRend/train_net.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +""" +PointRend Training Script. + +This script is a simplified version of the training script in detectron2/tools. +""" + +import os +import torch + +import detectron2.utils.comm as comm +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import MetadataCatalog, build_detection_train_loader +from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch +from detectron2.evaluation import ( + CityscapesInstanceEvaluator, + CityscapesSemSegEvaluator, + COCOEvaluator, + DatasetEvaluators, + LVISEvaluator, + SemSegEvaluator, + verify_results, +) + +from point_rend import SemSegDatasetMapper, add_pointrend_config + + +class Trainer(DefaultTrainer): + """ + We use the "DefaultTrainer" which contains a number pre-defined logic for + standard training workflow. They may not work for you, especially if you + are working on a new research project. In that case you can use the cleaner + "SimpleTrainer", or write your own training loop. + """ + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + """ + Create evaluator(s) for a given dataset. + This uses the special metadata "evaluator_type" associated with each builtin dataset. + For your own dataset, you can simply create an evaluator manually in your + script and do not have to worry about the hacky if-else logic here. + """ + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + evaluator_list = [] + evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type + if evaluator_type == "lvis": + return LVISEvaluator(dataset_name, cfg, True, output_folder) + if evaluator_type == "coco": + return COCOEvaluator(dataset_name, cfg, True, output_folder) + if evaluator_type == "sem_seg": + return SemSegEvaluator( + dataset_name, + distributed=True, + num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + output_dir=output_folder, + ) + if evaluator_type == "cityscapes_instance": + assert ( + torch.cuda.device_count() >= comm.get_rank() + ), "CityscapesEvaluator currently do not work with multiple machines." + return CityscapesInstanceEvaluator(dataset_name) + if evaluator_type == "cityscapes_sem_seg": + assert ( + torch.cuda.device_count() >= comm.get_rank() + ), "CityscapesEvaluator currently do not work with multiple machines." 
+ return CityscapesSemSegEvaluator(dataset_name) + if len(evaluator_list) == 0: + raise NotImplementedError( + "no Evaluator for the dataset {} with the type {}".format( + dataset_name, evaluator_type + ) + ) + if len(evaluator_list) == 1: + return evaluator_list[0] + return DatasetEvaluators(evaluator_list) + + @classmethod + def build_train_loader(cls, cfg): + if "SemanticSegmentor" in cfg.MODEL.META_ARCHITECTURE: + mapper = SemSegDatasetMapper(cfg, True) + else: + mapper = None + return build_detection_train_loader(cfg, mapper=mapper) + + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + add_pointrend_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + if comm.is_main_process(): + verify_results(cfg, res) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/projects/README.md b/projects/README.md new file mode 100644 index 0000000000000000000000000000000000000000..30a41f008776a8755ec4dc19f4be07c514cd3794 --- /dev/null +++ b/projects/README.md @@ -0,0 +1,31 @@ + +Here are a few projects that are built on detectron2. +They are examples of how to use detectron2 as a library, to make your projects more +maintainable. + +## Projects by Facebook + +Note that these are research projects, and therefore may not have the same level +of support or stability of detectron2. + ++ [DensePose: Dense Human Pose Estimation In The Wild](DensePose) ++ [Scale-Aware Trident Networks for Object Detection](TridentNet) ++ [TensorMask: A Foundation for Dense Object Segmentation](TensorMask) ++ [Mesh R-CNN](https://github.com/facebookresearch/meshrcnn) ++ [PointRend: Image Segmentation as Rendering](PointRend) ++ [Momentum Contrast for Unsupervised Visual Representation Learning](https://github.com/facebookresearch/moco/tree/master/detection) + + +## External Projects + +External projects in the community that use detectron2: + + + ++ [VoVNet backbones](https://github.com/youngwanLEE/vovnet-detectron2). ++ [AdelaiDet](https://github.com/aim-uofa/adet), a detection toolbox from the Universtiy of Adelaide. ++ [CenterMask : Real-Time Anchor-Free Instance Segmentation](https://github.com/youngwanLEE/centermask2) diff --git a/projects/TensorMask/README.md b/projects/TensorMask/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6831508b9aea37f0e88bec62c98f2bf2b64240ab --- /dev/null +++ b/projects/TensorMask/README.md @@ -0,0 +1,64 @@ + +# TensorMask in Detectron2 +**A Foundation for Dense Object Segmentation** + +Xinlei Chen, Ross Girshick, Kaiming He, Piotr Dollár + +[[`arXiv`](https://arxiv.org/abs/1903.12174)] [[`BibTeX`](#CitingTensorMask)] + +
+ +In this repository, we release code for TensorMask in Detectron2. +TensorMask is a dense sliding-window instance segmentation framework that, for the first time, achieves results close to the well-developed Mask R-CNN framework -- both qualitatively and quantitatively. It establishes a conceptually complementary direction for object instance segmentation research. + +## Installation +First install Detectron2 following the [documentation](https://detectron2.readthedocs.io/tutorials/install.html) and +[setup the dataset](../../datasets). Then compile the TensorMask-specific op (`swap_align2nat`): +```bash +cd /path/to/detectron2/projects/TensorMask +python setup.py build develop +``` + +## Training + +To train a model, run: +```bash +python /path/to/detectron2/projects/TensorMask/train_net.py --config-file +``` + +For example, to launch TensorMask BiPyramid training (1x schedule) with ResNet-50 backbone on 8 GPUs, +one should execute: +```bash +python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_1x.yaml --num-gpus 8 +``` + +## Evaluation + +Model evaluation can be done similarly (6x schedule with scale augmentation): +```bash +python /path/to/detectron2/projects/TensorMask/train_net.py --config-file configs/tensormask_R_50_FPN_6x.yaml --eval-only MODEL.WEIGHTS /path/to/model_checkpoint +``` + +# Pretrained Models + +| Backbone | lr sched | AP box | AP mask | download | +| -------- | -------- | -- | --- | -------- | +| R50 | 1x | 37.6 | 32.4 | model \|  metrics | +| R50 | 6x | 41.4 | 35.8 | model \|  metrics | + + +## Citing TensorMask + +If you use TensorMask, please use the following BibTeX entry. + +``` +@InProceedings{chen2019tensormask, + title={Tensormask: A Foundation for Dense Object Segmentation}, + author={Chen, Xinlei and Girshick, Ross and He, Kaiming and Doll{\'a}r, Piotr}, + journal={The International Conference on Computer Vision (ICCV)}, + year={2019} +} +``` + diff --git a/projects/TensorMask/configs/Base-TensorMask.yaml b/projects/TensorMask/configs/Base-TensorMask.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a7245349b4aa9cfa00f20074cc7cb5cdb02607f9 --- /dev/null +++ b/projects/TensorMask/configs/Base-TensorMask.yaml @@ -0,0 +1,25 @@ +MODEL: + META_ARCHITECTURE: "TensorMask" + MASK_ON: True + BACKBONE: + NAME: "build_retinanet_resnet_fpn_backbone" + RESNETS: + OUT_FEATURES: ["res2", "res3", "res4", "res5"] + ANCHOR_GENERATOR: + SIZES: [[44, 60], [88, 120], [176, 240], [352, 480], [704, 960], [1408, 1920]] + ASPECT_RATIOS: [[1.0]] + FPN: + IN_FEATURES: ["res2", "res3", "res4", "res5"] + FUSE_TYPE: "avg" + TENSOR_MASK: + ALIGNED_ON: True + BIPYRAMID_ON: True +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +VERSION: 2 diff --git a/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml b/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5d5eee135a93149a0c4b2148a47cee02e8aed8eb --- /dev/null +++ b/projects/TensorMask/configs/tensormask_R_50_FPN_1x.yaml @@ -0,0 +1,5 @@ +_BASE_: "Base-TensorMask.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 diff --git a/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml b/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..366a965c4adfdbba2482593c0c81f3e6af50dfd2 
--- /dev/null +++ b/projects/TensorMask/configs/tensormask_R_50_FPN_6x.yaml @@ -0,0 +1,11 @@ +_BASE_: "Base-TensorMask.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (480000, 520000) + MAX_ITER: 540000 +INPUT: + MIN_SIZE_TRAIN_SAMPLING: "range" + MIN_SIZE_TRAIN: (640, 800) diff --git a/projects/TensorMask/setup.py b/projects/TensorMask/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..0194e76608966b528ab32879edc40a8e4ac3225f --- /dev/null +++ b/projects/TensorMask/setup.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import glob +import os +from setuptools import find_packages, setup +import torch +from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "tensormask", "layers", "csrc") + + main_source = os.path.join(extensions_dir, "vision.cpp") + sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob( + os.path.join(extensions_dir, "*.cu") + ) + + sources = [main_source] + sources + + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if (torch.cuda.is_available() and CUDA_HOME is not None) or os.getenv("FORCE_CUDA", "0") == "1": + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + + # It's better if pytorch can do this by default .. + CC = os.environ.get("CC", None) + if CC is not None: + extra_compile_args["nvcc"].append("-ccbin={}".format(CC)) + + sources = [os.path.join(extensions_dir, s) for s in sources] + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "tensormask._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +setup( + name="tensormask", + version="0.1", + author="FAIR", + packages=find_packages(exclude=("configs", "tests")), + python_requires=">=3.6", + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/projects/TensorMask/tensormask/__init__.py b/projects/TensorMask/tensormask/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e3b642a55519867dc52ccc57a36c32c72c3d34da --- /dev/null +++ b/projects/TensorMask/tensormask/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .config import add_tensormask_config +from .arch import TensorMask diff --git a/projects/TensorMask/tensormask/arch.py b/projects/TensorMask/tensormask/arch.py new file mode 100644 index 0000000000000000000000000000000000000000..a3e89c6b4283b28fe8028300e146d7b7543f0da1 --- /dev/null +++ b/projects/TensorMask/tensormask/arch.py @@ -0,0 +1,904 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import copy +import logging +import math +from typing import List +import torch +import torch.nn.functional as F +from fvcore.nn import sigmoid_focal_loss_star_jit, smooth_l1_loss +from torch import nn + +from detectron2.layers import ShapeSpec, batched_nms, cat, paste_masks_in_image +from detectron2.modeling.anchor_generator import DefaultAnchorGenerator +from detectron2.modeling.backbone import build_backbone +from detectron2.modeling.box_regression import Box2BoxTransform +from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY +from detectron2.modeling.meta_arch.retinanet import ( + permute_all_cls_and_box_to_N_HWA_K_and_concat, + permute_to_N_HWA_K, +) +from detectron2.structures import Boxes, ImageList, Instances +from detectron2.utils.logger import log_first_n + +from tensormask.layers import SwapAlign2Nat + +__all__ = ["TensorMask"] + + +def _assignment_rule( + gt_boxes, + anchor_boxes, + unit_lengths, + min_anchor_size, + scale_thresh=2.0, + spatial_thresh=1.0, + uniqueness_on=True, +): + """ + Given two lists of boxes of N ground truth boxes and M anchor boxes, + compute the assignment between the two, following the assignment rules in + https://arxiv.org/abs/1903.12174. + The box order must be (xmin, ymin, xmax, ymax), so please make sure to convert + to BoxMode.XYXY_ABS before calling this function. + + Args: + gt_boxes, anchor_boxes (Boxes): two Boxes. Contains N & M boxes/anchors, respectively. + unit_lengths (Tensor): Contains the unit lengths of M anchor boxes. + min_anchor_size (float): Minimum size of the anchor, in pixels + scale_thresh (float): The `scale` threshold: the maximum size of the anchor + should not be greater than scale_thresh x max(h, w) of + the ground truth box. + spatial_thresh (float): The `spatial` threshold: the l2 distance between the + center of the anchor and the ground truth box should not + be greater than spatial_thresh x u where u is the unit length. 
+ + Returns: + matches (Tensor[int64]): a vector of length M, where matches[i] is a matched + ground-truth index in [0, N) + match_labels (Tensor[int8]): a vector of length M, where pred_labels[i] indicates + whether a prediction is a true or false positive or ignored + """ + gt_boxes, anchor_boxes = gt_boxes.tensor, anchor_boxes.tensor + N = gt_boxes.shape[0] + M = anchor_boxes.shape[0] + if N == 0 or M == 0: + return ( + gt_boxes.new_full((N,), 0, dtype=torch.int64), + gt_boxes.new_full((N,), -1, dtype=torch.int8), + ) + + # Containment rule + lt = torch.min(gt_boxes[:, None, :2], anchor_boxes[:, :2]) # [N,M,2] + rb = torch.max(gt_boxes[:, None, 2:], anchor_boxes[:, 2:]) # [N,M,2] + union = cat([lt, rb], dim=2) # [N,M,4] + + dummy_gt_boxes = torch.zeros_like(gt_boxes) + anchor = dummy_gt_boxes[:, None, :] + anchor_boxes[:, :] # [N,M,4] + + contain_matrix = torch.all(union == anchor, dim=2) # [N,M] + + # Centrality rule, scale + gt_size_lower = torch.max(gt_boxes[:, 2:] - gt_boxes[:, :2], dim=1)[0] # [N] + gt_size_upper = gt_size_lower * scale_thresh # [N] + # Fall back for small objects + gt_size_upper[gt_size_upper < min_anchor_size] = min_anchor_size + # Due to sampling of locations, the anchor sizes are deducted with sampling strides + anchor_size = ( + torch.max(anchor_boxes[:, 2:] - anchor_boxes[:, :2], dim=1)[0] - unit_lengths + ) # [M] + + size_diff_upper = gt_size_upper[:, None] - anchor_size # [N,M] + scale_matrix = size_diff_upper >= 0 # [N,M] + + # Centrality rule, spatial + gt_center = (gt_boxes[:, 2:] + gt_boxes[:, :2]) / 2 # [N,2] + anchor_center = (anchor_boxes[:, 2:] + anchor_boxes[:, :2]) / 2 # [M,2] + offset_center = gt_center[:, None, :] - anchor_center[:, :] # [N,M,2] + offset_center /= unit_lengths[:, None] # [N,M,2] + spatial_square = spatial_thresh * spatial_thresh + spatial_matrix = torch.sum(offset_center * offset_center, dim=2) <= spatial_square + + assign_matrix = (contain_matrix & scale_matrix & spatial_matrix).int() + + # assign_matrix is N (gt) x M (predicted) + # Max over gt elements (dim 0) to find best gt candidate for each prediction + matched_vals, matches = assign_matrix.max(dim=0) + match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8) + + match_labels[matched_vals == 0] = 0 + match_labels[matched_vals == 1] = 1 + + # find all the elements that match to ground truths multiple times + not_unique_idxs = assign_matrix.sum(dim=0) > 1 + if uniqueness_on: + match_labels[not_unique_idxs] = 0 + else: + match_labels[not_unique_idxs] = -1 + + return matches, match_labels + + +# TODO make the paste_mask function in d2 core support mask list +def _paste_mask_lists_in_image(masks, boxes, image_shape, threshold=0.5): + """ + Paste a list of masks that are of various resolutions (e.g., 28 x 28) into an image. + The location, height, and width for pasting each mask is determined by their + corresponding bounding boxes in boxes. + + Args: + masks (list(Tensor)): A list of Tensor of shape (1, Hmask_i, Wmask_i). + Values are in [0, 1]. The list length, Bimg, is the + number of detected object instances in the image. + boxes (Boxes): A Boxes of length Bimg. boxes.tensor[i] and masks[i] correspond + to the same object instance. + image_shape (tuple): height, width + threshold (float): A threshold in [0, 1] for converting the (soft) masks to + binary masks. + + Returns: + img_masks (Tensor): A tensor of shape (Bimg, Himage, Wimage), where Bimg is the + number of detected object instances and Himage, Wimage are the image width + and height. 
img_masks[i] is a binary mask for object instance i. + """ + if len(masks) == 0: + return torch.empty((0, 1) + image_shape, dtype=torch.uint8) + + # Loop over masks groups. Each group has the same mask prediction size. + img_masks = [] + ind_masks = [] + mask_sizes = torch.tensor([m.shape[-1] for m in masks]) + unique_sizes = torch.unique(mask_sizes) + for msize in unique_sizes.tolist(): + cur_ind = torch.where(mask_sizes == msize)[0] + ind_masks.append(cur_ind) + + cur_masks = cat([masks[i] for i in cur_ind]) + cur_boxes = boxes[cur_ind] + img_masks.append(paste_masks_in_image(cur_masks, cur_boxes, image_shape, threshold)) + + img_masks = cat(img_masks) + ind_masks = cat(ind_masks) + + img_masks_out = torch.empty_like(img_masks) + img_masks_out[ind_masks, :, :] = img_masks + + return img_masks_out + + +def _postprocess(results, result_mask_info, output_height, output_width, mask_threshold=0.5): + """ + Post-process the output boxes for TensorMask. + The input images are often resized when entering an object detector. + As a result, we often need the outputs of the detector in a different + resolution from its inputs. + + This function will postprocess the raw outputs of TensorMask + to produce outputs according to the desired output resolution. + + Args: + results (Instances): the raw outputs from the detector. + `results.image_size` contains the input image resolution the detector sees. + This object might be modified in-place. Note that it does not contain the field + `pred_masks`, which is provided by another input `result_masks`. + result_mask_info (list[Tensor], Boxes): a pair of two items for mask related results. + The first item is a list of #detection tensors, each is the predicted masks. + The second item is the anchors corresponding to the predicted masks. + output_height, output_width: the desired output resolution. + + Returns: + Instances: the postprocessed output from the model, based on the output resolution + """ + scale_x, scale_y = (output_width / results.image_size[1], output_height / results.image_size[0]) + results = Instances((output_height, output_width), **results.get_fields()) + + output_boxes = results.pred_boxes + output_boxes.tensor[:, 0::2] *= scale_x + output_boxes.tensor[:, 1::2] *= scale_y + output_boxes.clip(results.image_size) + + inds_nonempty = output_boxes.nonempty() + results = results[inds_nonempty] + result_masks, result_anchors = result_mask_info + if result_masks: + result_anchors.tensor[:, 0::2] *= scale_x + result_anchors.tensor[:, 1::2] *= scale_y + result_masks = [x for (i, x) in zip(inds_nonempty.tolist(), result_masks) if i] + results.pred_masks = _paste_mask_lists_in_image( + result_masks, + result_anchors[inds_nonempty], + results.image_size, + threshold=mask_threshold, + ) + return results + + +class TensorMaskAnchorGenerator(DefaultAnchorGenerator): + """ + For a set of image sizes and feature maps, computes a set of anchors for TensorMask. + It also computes the unit lengths and indexes for each anchor box. 
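+
+    Here "unit length" is the stride of the feature level an anchor comes from, and an
+    anchor's "index" is the 5-tuple (L, I, H, W, A) locating it by level, image, spatial
+    position and anchor slot; both are produced next to the anchor boxes in
+    `grid_anchors_with_unit_lengths_and_indexes` below.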
+ """ + + def grid_anchors_with_unit_lengths_and_indexes(self, grid_sizes): + anchors = [] + unit_lengths = [] + indexes = [] + for lvl, (size, stride, base_anchors) in enumerate( + zip(grid_sizes, self.strides, self.cell_anchors) + ): + grid_height, grid_width = size + device = base_anchors.device + shifts_x = torch.arange( + 0, grid_width * stride, step=stride, dtype=torch.float32, device=device + ) + shifts_y = torch.arange( + 0, grid_height * stride, step=stride, dtype=torch.float32, device=device + ) + shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) + shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=2) + # Stack anchors in shapes of (HWA, 4) + cur_anchor = (shifts[:, :, None, :] + base_anchors.view(1, 1, -1, 4)).view(-1, 4) + anchors.append(cur_anchor) + unit_lengths.append( + torch.full((cur_anchor.shape[0],), stride, dtype=torch.float32, device=device) + ) + # create mask indexes using mesh grid + shifts_l = torch.full((1,), lvl, dtype=torch.int64, device=device) + shifts_i = torch.zeros((1,), dtype=torch.int64, device=device) + shifts_h = torch.arange(0, grid_height, dtype=torch.int64, device=device) + shifts_w = torch.arange(0, grid_width, dtype=torch.int64, device=device) + shifts_a = torch.arange(0, base_anchors.shape[0], dtype=torch.int64, device=device) + grids = torch.meshgrid(shifts_l, shifts_i, shifts_h, shifts_w, shifts_a) + + indexes.append(torch.stack(grids, dim=5).view(-1, 5)) + + return anchors, unit_lengths, indexes + + def forward(self, features): + """ + Returns: + list[list[Boxes]]: a list of #image elements. Each is a list of #feature level Boxes. + The Boxes contains anchors of this image on the specific feature level. + list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors. + The tensor contains strides, or unit lengths for the anchors. + list[list[Tensor]]: a list of #image elements. Each is a list of #feature level tensors. + The Tensor contains indexes for the anchors, with the last dimension meaning + (L, N, H, W, A), where L is level, I is image (not set yet), H is height, + W is width, and A is anchor. + """ + num_images = len(features[0]) + grid_sizes = [feature_map.shape[-2:] for feature_map in features] + anchors_list, lengths_list, indexes_list = self.grid_anchors_with_unit_lengths_and_indexes( + grid_sizes + ) + + # Convert anchors from Tensor to Boxes + anchors_per_im = [Boxes(x) for x in anchors_list] + + # TODO it can be simplified to not return duplicated information for + # each image, just like detectron2's own AnchorGenerator + anchors = [copy.deepcopy(anchors_per_im) for _ in range(num_images)] + unit_lengths = [copy.deepcopy(lengths_list) for _ in range(num_images)] + indexes = [copy.deepcopy(indexes_list) for _ in range(num_images)] + + return anchors, unit_lengths, indexes + + +@META_ARCH_REGISTRY.register() +class TensorMask(nn.Module): + """ + TensorMask model. Creates FPN backbone, anchors and a head for classification + and box regression. Calculates and applies proper losses to class, box, and + masks. 
+ """ + + def __init__(self, cfg): + super().__init__() + + # fmt: off + self.num_classes = cfg.MODEL.TENSOR_MASK.NUM_CLASSES + self.in_features = cfg.MODEL.TENSOR_MASK.IN_FEATURES + self.anchor_sizes = cfg.MODEL.ANCHOR_GENERATOR.SIZES + self.num_levels = len(cfg.MODEL.ANCHOR_GENERATOR.SIZES) + # Loss parameters: + self.focal_loss_alpha = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA + self.focal_loss_gamma = cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA + # Inference parameters: + self.score_threshold = cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST + self.topk_candidates = cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST + self.nms_threshold = cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST + self.detections_im = cfg.TEST.DETECTIONS_PER_IMAGE + # Mask parameters: + self.mask_on = cfg.MODEL.MASK_ON + self.mask_loss_weight = cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT + self.mask_pos_weight = torch.tensor(cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT, + dtype=torch.float32) + self.bipyramid_on = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON + # fmt: on + + # build the backbone + self.backbone = build_backbone(cfg) + + backbone_shape = self.backbone.output_shape() + feature_shapes = [backbone_shape[f] for f in self.in_features] + feature_strides = [x.stride for x in feature_shapes] + # build anchors + self.anchor_generator = TensorMaskAnchorGenerator(cfg, feature_shapes) + self.num_anchors = self.anchor_generator.num_cell_anchors[0] + anchors_min_level = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0] + self.mask_sizes = [size // feature_strides[0] for size in anchors_min_level] + self.min_anchor_size = min(anchors_min_level) - feature_strides[0] + + # head of the TensorMask + self.head = TensorMaskHead( + cfg, self.num_levels, self.num_anchors, self.mask_sizes, feature_shapes + ) + # box transform + self.box2box_transform = Box2BoxTransform(weights=cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS) + self.register_buffer("pixel_mean", torch.Tensor(cfg.MODEL.PIXEL_MEAN).view(-1, 1, 1)) + self.register_buffer("pixel_std", torch.Tensor(cfg.MODEL.PIXEL_STD).view(-1, 1, 1)) + + @property + def device(self): + return self.pixel_mean.device + + def forward(self, batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DetectionTransform` . + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + image: Tensor, image in (C, H, W) format. + instances: Instances + Other information that's included in the original dicts, such as: + "height", "width" (int): the output resolution of the model, used in inference. + See :meth:`postprocess` for details. + Returns: + losses (dict[str: Tensor]): mapping from a named loss to a tensor + storing the loss. Used during training only. + """ + images = self.preprocess_image(batched_inputs) + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + elif "targets" in batched_inputs[0]: + log_first_n( + logging.WARN, "'targets' in the model inputs is now renamed to 'instances'!", n=10 + ) + gt_instances = [x["targets"].to(self.device) for x in batched_inputs] + else: + gt_instances = None + + features = self.backbone(images.tensor) + features = [features[f] for f in self.in_features] + # apply the TensorMask head + pred_logits, pred_deltas, pred_masks = self.head(features) + # generate anchors based on features, is it image specific? 
+ anchors, unit_lengths, indexes = self.anchor_generator(features) + + if self.training: + # get ground truths for class labels and box targets, it will label each anchor + gt_class_info, gt_delta_info, gt_mask_info, num_fg = self.get_ground_truth( + anchors, unit_lengths, indexes, gt_instances + ) + # compute the loss + return self.losses( + gt_class_info, + gt_delta_info, + gt_mask_info, + num_fg, + pred_logits, + pred_deltas, + pred_masks, + ) + else: + # do inference to get the output + results = self.inference(pred_logits, pred_deltas, pred_masks, anchors, indexes, images) + processed_results = [] + for results_im, input_im, image_size in zip( + results, batched_inputs, images.image_sizes + ): + height = input_im.get("height", image_size[0]) + width = input_im.get("width", image_size[1]) + # this is to do post-processing with the image size + result_box, result_mask = results_im + r = _postprocess(result_box, result_mask, height, width) + processed_results.append({"instances": r}) + return processed_results + + def losses( + self, + gt_class_info, + gt_delta_info, + gt_mask_info, + num_fg, + pred_logits, + pred_deltas, + pred_masks, + ): + """ + Args: + For `gt_class_info`, `gt_delta_info`, `gt_mask_info` and `num_fg` parameters, see + :meth:`TensorMask.get_ground_truth`. + For `pred_logits`, `pred_deltas` and `pred_masks`, see + :meth:`TensorMaskHead.forward`. + + Returns: + losses (dict[str: Tensor]): mapping from a named loss to a scalar tensor + storing the loss. Used during training only. The potential dict keys are: + "loss_cls", "loss_box_reg" and "loss_mask". + """ + gt_classes_target, gt_valid_inds = gt_class_info + gt_deltas, gt_fg_inds = gt_delta_info + gt_masks, gt_mask_inds = gt_mask_info + loss_normalizer = torch.tensor(max(1, num_fg), dtype=torch.float32, device=self.device) + + # classification and regression + pred_logits, pred_deltas = permute_all_cls_and_box_to_N_HWA_K_and_concat( + pred_logits, pred_deltas, self.num_classes + ) + loss_cls = ( + sigmoid_focal_loss_star_jit( + pred_logits[gt_valid_inds], + gt_classes_target[gt_valid_inds], + alpha=self.focal_loss_alpha, + gamma=self.focal_loss_gamma, + reduction="sum", + ) + / loss_normalizer + ) + + if num_fg == 0: + loss_box_reg = pred_deltas.sum() * 0 + else: + loss_box_reg = ( + smooth_l1_loss(pred_deltas[gt_fg_inds], gt_deltas, beta=0.0, reduction="sum") + / loss_normalizer + ) + losses = {"loss_cls": loss_cls, "loss_box_reg": loss_box_reg} + + # mask prediction + if self.mask_on: + loss_mask = 0 + for lvl in range(self.num_levels): + cur_level_factor = 2 ** lvl if self.bipyramid_on else 1 + for anc in range(self.num_anchors): + cur_gt_mask_inds = gt_mask_inds[lvl][anc] + if cur_gt_mask_inds is None: + loss_mask += pred_masks[lvl][anc][0, 0, 0, 0] * 0 + else: + cur_mask_size = self.mask_sizes[anc] * cur_level_factor + # TODO maybe there are numerical issues when mask sizes are large + cur_size_divider = torch.tensor( + self.mask_loss_weight / (cur_mask_size ** 2), + dtype=torch.float32, + device=self.device, + ) + + cur_pred_masks = pred_masks[lvl][anc][ + cur_gt_mask_inds[:, 0], # N + :, # V x U + cur_gt_mask_inds[:, 1], # H + cur_gt_mask_inds[:, 2], # W + ] + + loss_mask += F.binary_cross_entropy_with_logits( + cur_pred_masks.view(-1, cur_mask_size, cur_mask_size), # V, U + gt_masks[lvl][anc].to(dtype=torch.float32), + reduction="sum", + weight=cur_size_divider, + pos_weight=self.mask_pos_weight, + ) + losses["loss_mask"] = loss_mask / loss_normalizer + return losses + + @torch.no_grad() + def 
get_ground_truth(self, anchors, unit_lengths, indexes, targets): + """ + Args: + anchors (list[list[Boxes]]): a list of N=#image elements. Each is a + list of #feature level Boxes. The Boxes contains anchors of + this image on the specific feature level. + unit_lengths (list[list[Tensor]]): a list of N=#image elements. Each is a + list of #feature level Tensor. The tensor contains unit lengths for anchors of + this image on the specific feature level. + indexes (list[list[Tensor]]): a list of N=#image elements. Each is a + list of #feature level Tensor. The tensor contains the 5D index of + each anchor, the second dimension means (L, N, H, W, A), where L + is level, I is image, H is height, W is width, and A is anchor. + targets (list[Instances]): a list of N `Instances`s. The i-th + `Instances` contains the ground-truth per-instance annotations + for the i-th input image. Specify `targets` during training only. + + Returns: + gt_class_info (Tensor, Tensor): A pair of two tensors for classification. + The first one is an integer tensor of shape (R, #classes) storing ground-truth + labels for each anchor. R is the total number of anchors in the batch. + The second one is an integer tensor of shape (R,), to indicate which + anchors are valid for loss computation, which anchors are not. + gt_delta_info (Tensor, Tensor): A pair of two tensors for boxes. + The first one, of shape (F, 4). F=#foreground anchors. + The last dimension represents ground-truth box2box transform + targets (dx, dy, dw, dh) that map each anchor to its matched ground-truth box. + Only foreground anchors have values in this tensor. Could be `None` if F=0. + The second one, of shape (R,), is an integer tensor indicating which anchors + are foreground ones used for box regression. Could be `None` if F=0. + gt_mask_info (list[list[Tensor]], list[list[Tensor]]): A pair of two lists for masks. + The first one is a list of P=#feature level elements. Each is a + list of A=#anchor tensors. Each tensor contains the ground truth + masks of the same size and for the same feature level. Could be `None`. + The second one is a list of P=#feature level elements. Each is a + list of A=#anchor tensors. Each tensor contains the location of the ground truth + masks of the same size and for the same feature level. The second dimension means + (N, H, W), where N is image, H is height, and W is width. Could be `None`. + num_fg (int): F=#foreground anchors, used later for loss normalization. 
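+
+        The nested `gt_masks` / `gt_mask_inds` lists are indexed as [level][anchor]
+        (entries may be `None` when a level/anchor pair has no foreground match) and
+        are consumed in that order by `losses` when MASK_ON is set.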
+ """ + gt_classes = [] + gt_deltas = [] + gt_masks = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)] + gt_mask_inds = [[[] for _ in range(self.num_anchors)] for _ in range(self.num_levels)] + + anchors = [Boxes.cat(anchors_i) for anchors_i in anchors] + unit_lengths = [cat(unit_lengths_i) for unit_lengths_i in unit_lengths] + indexes = [cat(indexes_i) for indexes_i in indexes] + + num_fg = 0 + for i, (anchors_im, unit_lengths_im, indexes_im, targets_im) in enumerate( + zip(anchors, unit_lengths, indexes, targets) + ): + # Initialize all + gt_classes_i = torch.full_like( + unit_lengths_im, self.num_classes, dtype=torch.int64, device=self.device + ) + # Ground truth classes + has_gt = len(targets_im) > 0 + if has_gt: + # Compute the pairwise matrix + gt_matched_inds, anchor_labels = _assignment_rule( + targets_im.gt_boxes, anchors_im, unit_lengths_im, self.min_anchor_size + ) + # Find the foreground instances + fg_inds = anchor_labels == 1 + fg_anchors = anchors_im[fg_inds] + num_fg += len(fg_anchors) + # Find the ground truths for foreground instances + gt_fg_matched_inds = gt_matched_inds[fg_inds] + # Assign labels for foreground instances + gt_classes_i[fg_inds] = targets_im.gt_classes[gt_fg_matched_inds] + # Anchors with label -1 are ignored, others are left as negative + gt_classes_i[anchor_labels == -1] = -1 + + # Boxes + # Ground truth box regression, only for foregrounds + matched_gt_boxes = targets_im[gt_fg_matched_inds].gt_boxes + # Compute box regression offsets for foregrounds only + gt_deltas_i = self.box2box_transform.get_deltas( + fg_anchors.tensor, matched_gt_boxes.tensor + ) + gt_deltas.append(gt_deltas_i) + + # Masks + if self.mask_on: + # Compute masks for each level and each anchor + matched_indexes = indexes_im[fg_inds, :] + for lvl in range(self.num_levels): + ids_lvl = matched_indexes[:, 0] == lvl + if torch.any(ids_lvl): + cur_level_factor = 2 ** lvl if self.bipyramid_on else 1 + for anc in range(self.num_anchors): + ids_lvl_anchor = ids_lvl & (matched_indexes[:, 4] == anc) + if torch.any(ids_lvl_anchor): + gt_masks[lvl][anc].append( + targets_im[ + gt_fg_matched_inds[ids_lvl_anchor] + ].gt_masks.crop_and_resize( + fg_anchors[ids_lvl_anchor].tensor, + self.mask_sizes[anc] * cur_level_factor, + ) + ) + # Select (N, H, W) dimensions + gt_mask_inds_lvl_anc = matched_indexes[ids_lvl_anchor, 1:4] + # Set the image index to the current image + gt_mask_inds_lvl_anc[:, 0] = i + gt_mask_inds[lvl][anc].append(gt_mask_inds_lvl_anc) + gt_classes.append(gt_classes_i) + + # Classes and boxes + gt_classes = cat(gt_classes) + gt_valid_inds = gt_classes >= 0 + gt_fg_inds = gt_valid_inds & (gt_classes < self.num_classes) + gt_classes_target = torch.zeros( + (gt_classes.shape[0], self.num_classes), dtype=torch.float32, device=self.device + ) + gt_classes_target[gt_fg_inds, gt_classes[gt_fg_inds]] = 1 + gt_deltas = cat(gt_deltas) if gt_deltas else None + + # Masks + gt_masks = [[cat(mla) if mla else None for mla in ml] for ml in gt_masks] + gt_mask_inds = [[cat(ila) if ila else None for ila in il] for il in gt_mask_inds] + return ( + (gt_classes_target, gt_valid_inds), + (gt_deltas, gt_fg_inds), + (gt_masks, gt_mask_inds), + num_fg, + ) + + def inference(self, pred_logits, pred_deltas, pred_masks, anchors, indexes, images): + """ + Arguments: + pred_logits, pred_deltas, pred_masks: Same as the output of: + meth:`TensorMaskHead.forward` + anchors, indexes: Same as the input of meth:`TensorMask.get_ground_truth` + images (ImageList): the input images + + Returns: + 
results (List[Instances]): a list of #images elements. + """ + assert len(anchors) == len(images) + results = [] + + pred_logits = [permute_to_N_HWA_K(x, self.num_classes) for x in pred_logits] + pred_deltas = [permute_to_N_HWA_K(x, 4) for x in pred_deltas] + + pred_logits = cat(pred_logits, dim=1) + pred_deltas = cat(pred_deltas, dim=1) + + for img_idx, (anchors_im, indexes_im) in enumerate(zip(anchors, indexes)): + # Get the size of the current image + image_size = images.image_sizes[img_idx] + + logits_im = pred_logits[img_idx] + deltas_im = pred_deltas[img_idx] + + if self.mask_on: + masks_im = [[mla[img_idx] for mla in ml] for ml in pred_masks] + else: + masks_im = [None] * self.num_levels + results_im = self.inference_single_image( + logits_im, + deltas_im, + masks_im, + Boxes.cat(anchors_im), + cat(indexes_im), + tuple(image_size), + ) + results.append(results_im) + return results + + def inference_single_image( + self, pred_logits, pred_deltas, pred_masks, anchors, indexes, image_size + ): + """ + Single-image inference. Return bounding-box detection results by thresholding + on scores and applying non-maximum suppression (NMS). + + Arguments: + pred_logits (list[Tensor]): list of #feature levels. Each entry contains + tensor of size (AxHxW, K) + pred_deltas (list[Tensor]): Same shape as 'pred_logits' except that K becomes 4. + pred_masks (list[list[Tensor]]): List of #feature levels, each is a list of #anchors. + Each entry contains tensor of size (M_i*M_i, H, W). `None` if mask_on=False. + anchors (list[Boxes]): list of #feature levels. Each entry contains + a Boxes object, which contains all the anchors for that + image in that feature level. + image_size (tuple(H, W)): a tuple of the image height and width. + + Returns: + Same as `inference`, but for only one image. + """ + pred_logits = pred_logits.flatten().sigmoid_() + # We get top locations across all levels to accelerate the inference speed, + # which does not seem to affect the accuracy. 
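+        # Illustrative note: after flattening, a logit index i corresponds to class
+        # i % num_classes and to its anchor (HWA) position i // num_classes; that is
+        # how `cls_idxs` and `top_idxs` are recovered further below.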
+ # First select values above the threshold + logits_top_idxs = torch.where(pred_logits > self.score_threshold)[0] + # Then get the top values + num_topk = min(self.topk_candidates, logits_top_idxs.shape[0]) + pred_prob, topk_idxs = pred_logits[logits_top_idxs].sort(descending=True) + # Keep top k scoring values + pred_prob = pred_prob[:num_topk] + # Keep top k values + top_idxs = logits_top_idxs[topk_idxs[:num_topk]] + + # class index + cls_idxs = top_idxs % self.num_classes + # HWA index + top_idxs //= self.num_classes + # predict boxes + pred_boxes = self.box2box_transform.apply_deltas( + pred_deltas[top_idxs], anchors[top_idxs].tensor + ) + # apply nms + keep = batched_nms(pred_boxes, pred_prob, cls_idxs, self.nms_threshold) + # pick the top ones + keep = keep[: self.detections_im] + + results = Instances(image_size) + results.pred_boxes = Boxes(pred_boxes[keep]) + results.scores = pred_prob[keep] + results.pred_classes = cls_idxs[keep] + + # deal with masks + result_masks, result_anchors = [], None + if self.mask_on: + # index and anchors, useful for masks + top_indexes = indexes[top_idxs] + top_anchors = anchors[top_idxs] + result_indexes = top_indexes[keep] + result_anchors = top_anchors[keep] + # Get masks and do sigmoid + for lvl, _, h, w, anc in result_indexes.tolist(): + cur_size = self.mask_sizes[anc] * (2 ** lvl if self.bipyramid_on else 1) + result_masks.append( + torch.sigmoid(pred_masks[lvl][anc][:, h, w].view(1, cur_size, cur_size)) + ) + + return results, (result_masks, result_anchors) + + def preprocess_image(self, batched_inputs): + """ + Normalize, pad and batch the input images. + """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.backbone.size_divisibility) + return images + + +class TensorMaskHead(nn.Module): + def __init__(self, cfg, num_levels, num_anchors, mask_sizes, input_shape: List[ShapeSpec]): + """ + TensorMask head. 
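+
+        Builds a classification subnet, a box-regression subnet and, when MASK_ON is
+        set, a mask subnet with one 1x1 mask predictor per mask size; with ALIGNED_ON
+        the mask predictions are passed through SwapAlign2Nat in `forward` (one per
+        level when BIPYRAMID_ON, which also adds a fusing conv for upsampled mask
+        features).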
+ """ + super().__init__() + # fmt: off + self.in_features = cfg.MODEL.TENSOR_MASK.IN_FEATURES + in_channels = input_shape[0].channels + num_classes = cfg.MODEL.TENSOR_MASK.NUM_CLASSES + cls_channels = cfg.MODEL.TENSOR_MASK.CLS_CHANNELS + num_convs = cfg.MODEL.TENSOR_MASK.NUM_CONVS + # box parameters + bbox_channels = cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS + # mask parameters + self.mask_on = cfg.MODEL.MASK_ON + self.mask_sizes = mask_sizes + mask_channels = cfg.MODEL.TENSOR_MASK.MASK_CHANNELS + self.align_on = cfg.MODEL.TENSOR_MASK.ALIGNED_ON + self.bipyramid_on = cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON + # fmt: on + + # class subnet + cls_subnet = [] + cur_channels = in_channels + for _ in range(num_convs): + cls_subnet.append( + nn.Conv2d(cur_channels, cls_channels, kernel_size=3, stride=1, padding=1) + ) + cur_channels = cls_channels + cls_subnet.append(nn.ReLU()) + + self.cls_subnet = nn.Sequential(*cls_subnet) + self.cls_score = nn.Conv2d( + cur_channels, num_anchors * num_classes, kernel_size=3, stride=1, padding=1 + ) + modules_list = [self.cls_subnet, self.cls_score] + + # box subnet + bbox_subnet = [] + cur_channels = in_channels + for _ in range(num_convs): + bbox_subnet.append( + nn.Conv2d(cur_channels, bbox_channels, kernel_size=3, stride=1, padding=1) + ) + cur_channels = bbox_channels + bbox_subnet.append(nn.ReLU()) + + self.bbox_subnet = nn.Sequential(*bbox_subnet) + self.bbox_pred = nn.Conv2d( + cur_channels, num_anchors * 4, kernel_size=3, stride=1, padding=1 + ) + modules_list.extend([self.bbox_subnet, self.bbox_pred]) + + # mask subnet + if self.mask_on: + mask_subnet = [] + cur_channels = in_channels + for _ in range(num_convs): + mask_subnet.append( + nn.Conv2d(cur_channels, mask_channels, kernel_size=3, stride=1, padding=1) + ) + cur_channels = mask_channels + mask_subnet.append(nn.ReLU()) + + self.mask_subnet = nn.Sequential(*mask_subnet) + modules_list.append(self.mask_subnet) + for mask_size in self.mask_sizes: + cur_mask_module = "mask_pred_%02d" % mask_size + self.add_module( + cur_mask_module, + nn.Conv2d( + cur_channels, mask_size * mask_size, kernel_size=1, stride=1, padding=0 + ), + ) + modules_list.append(getattr(self, cur_mask_module)) + if self.align_on: + if self.bipyramid_on: + for lvl in range(num_levels): + cur_mask_module = "align2nat_%02d" % lvl + lambda_val = 2 ** lvl + setattr(self, cur_mask_module, SwapAlign2Nat(lambda_val)) + # Also the fusing layer, stay at the same channel size + mask_fuse = [ + nn.Conv2d(cur_channels, cur_channels, kernel_size=3, stride=1, padding=1), + nn.ReLU(), + ] + self.mask_fuse = nn.Sequential(*mask_fuse) + modules_list.append(self.mask_fuse) + else: + self.align2nat = SwapAlign2Nat(1) + + # Initialization + for modules in modules_list: + for layer in modules.modules(): + if isinstance(layer, nn.Conv2d): + torch.nn.init.normal_(layer.weight, mean=0, std=0.01) + torch.nn.init.constant_(layer.bias, 0) + + # Use prior in model initialization to improve stability + bias_value = -(math.log((1 - 0.01) / 0.01)) + torch.nn.init.constant_(self.cls_score.bias, bias_value) + + def forward(self, features): + """ + Arguments: + features (list[Tensor]): FPN feature map tensors in high to low resolution. + Each tensor in the list correspond to different feature levels. + + Returns: + pred_logits (list[Tensor]): #lvl tensors, each has shape (N, AxK, Hi, Wi). + The tensor predicts the classification probability + at each spatial position for each of the A anchors and K object + classes. 
+ pred_deltas (list[Tensor]): #lvl tensors, each has shape (N, Ax4, Hi, Wi). + The tensor predicts 4-vector (dx,dy,dw,dh) box + regression values for every anchor. These values are the + relative offset between the anchor and the ground truth box. + pred_masks (list(list[Tensor])): #lvl list of tensors, each is a list of + A tensors of shape (N, M_{i,a}, Hi, Wi). + The tensor predicts a dense set of M_ixM_i masks at every location. + """ + pred_logits = [self.cls_score(self.cls_subnet(x)) for x in features] + pred_deltas = [self.bbox_pred(self.bbox_subnet(x)) for x in features] + + pred_masks = None + if self.mask_on: + mask_feats = [self.mask_subnet(x) for x in features] + + if self.bipyramid_on: + mask_feat_high_res = mask_feats[0] + H, W = mask_feat_high_res.shape[-2:] + mask_feats_up = [] + for lvl, mask_feat in enumerate(mask_feats): + lambda_val = 2.0 ** lvl + mask_feat_up = mask_feat + if lvl > 0: + mask_feat_up = F.interpolate( + mask_feat, scale_factor=lambda_val, mode="bilinear", align_corners=False + ) + mask_feats_up.append( + self.mask_fuse(mask_feat_up[:, :, :H, :W] + mask_feat_high_res) + ) + mask_feats = mask_feats_up + + pred_masks = [] + for lvl, mask_feat in enumerate(mask_feats): + cur_masks = [] + for mask_size in self.mask_sizes: + cur_mask_module = getattr(self, "mask_pred_%02d" % mask_size) + cur_mask = cur_mask_module(mask_feat) + if self.align_on: + if self.bipyramid_on: + cur_mask_module = getattr(self, "align2nat_%02d" % lvl) + cur_mask = cur_mask_module(cur_mask) + else: + cur_mask = self.align2nat(cur_mask) + cur_masks.append(cur_mask) + pred_masks.append(cur_masks) + return pred_logits, pred_deltas, pred_masks diff --git a/projects/TensorMask/tensormask/config.py b/projects/TensorMask/tensormask/config.py new file mode 100644 index 0000000000000000000000000000000000000000..44479f211811bd4060c6afef9ed86791b0dcd0d4 --- /dev/null +++ b/projects/TensorMask/tensormask/config.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from detectron2.config import CfgNode as CN + + +def add_tensormask_config(cfg): + """ + Add config for TensorMask. + """ + cfg.MODEL.TENSOR_MASK = CN() + + # Anchor parameters + cfg.MODEL.TENSOR_MASK.IN_FEATURES = ["p2", "p3", "p4", "p5", "p6", "p7"] + + # Convolutions to use in the towers + cfg.MODEL.TENSOR_MASK.NUM_CONVS = 4 + + # Number of foreground classes. 
+ cfg.MODEL.TENSOR_MASK.NUM_CLASSES = 80 + # Channel size for the classification tower + cfg.MODEL.TENSOR_MASK.CLS_CHANNELS = 256 + + cfg.MODEL.TENSOR_MASK.SCORE_THRESH_TEST = 0.05 + # Only the top (1000 * #levels) candidate boxes across all levels are + # considered jointly during test (to improve speed) + cfg.MODEL.TENSOR_MASK.TOPK_CANDIDATES_TEST = 6000 + cfg.MODEL.TENSOR_MASK.NMS_THRESH_TEST = 0.5 + + # Box parameters + # Channel size for the box tower + cfg.MODEL.TENSOR_MASK.BBOX_CHANNELS = 128 + # Weights on (dx, dy, dw, dh) + cfg.MODEL.TENSOR_MASK.BBOX_REG_WEIGHTS = (1.5, 1.5, 0.75, 0.75) + + # Loss parameters + cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_GAMMA = 3.0 + cfg.MODEL.TENSOR_MASK.FOCAL_LOSS_ALPHA = 0.3 + + # Mask parameters + # Channel size for the mask tower + cfg.MODEL.TENSOR_MASK.MASK_CHANNELS = 128 + # Mask loss weight + cfg.MODEL.TENSOR_MASK.MASK_LOSS_WEIGHT = 2.0 + # weight on positive pixels within the mask + cfg.MODEL.TENSOR_MASK.POSITIVE_WEIGHT = 1.5 + # Whether to predict in the aligned representation + cfg.MODEL.TENSOR_MASK.ALIGNED_ON = False + # Whether to use the bipyramid architecture + cfg.MODEL.TENSOR_MASK.BIPYRAMID_ON = False diff --git a/projects/TensorMask/tensormask/layers/__init__.py b/projects/TensorMask/tensormask/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cbbac429a69ce7cb17872e27b868f5603de5dc64 --- /dev/null +++ b/projects/TensorMask/tensormask/layers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .swap_align2nat import SwapAlign2Nat, swap_align2nat + +__all__ = [k for k in globals().keys() if not k.startswith("_")] diff --git a/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h new file mode 100644 index 0000000000000000000000000000000000000000..2ec037391f1c5a40e69190bbdb50f71501d54825 --- /dev/null +++ b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat.h @@ -0,0 +1,54 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +#pragma once +#include + +namespace tensormask { + +#ifdef WITH_CUDA +at::Tensor SwapAlign2Nat_forward_cuda( + const at::Tensor& X, + const int lambda_val, + const float pad_val); + +at::Tensor SwapAlign2Nat_backward_cuda( + const at::Tensor& gY, + const int lambda_val, + const int batch_size, + const int channel, + const int height, + const int width); +#endif + +inline at::Tensor SwapAlign2Nat_forward( + const at::Tensor& X, + const int lambda_val, + const float pad_val) { + if (X.type().is_cuda()) { +#ifdef WITH_CUDA + return SwapAlign2Nat_forward_cuda(X, lambda_val, pad_val); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +inline at::Tensor SwapAlign2Nat_backward( + const at::Tensor& gY, + const int lambda_val, + const int batch_size, + const int channel, + const int height, + const int width) { + if (gY.type().is_cuda()) { +#ifdef WITH_CUDA + return SwapAlign2Nat_backward_cuda( + gY, lambda_val, batch_size, channel, height, width); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +} // namespace tensormask diff --git a/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu new file mode 100644 index 0000000000000000000000000000000000000000..06de4a4d046523be9959dee73dfc1c2c20852ce1 --- /dev/null +++ b/projects/TensorMask/tensormask/layers/csrc/SwapAlign2Nat/SwapAlign2Nat_cuda.cu @@ -0,0 +1,526 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +#include +#include +#include +#include + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + +template +__device__ inline T get_pixel_val( + const T* tensor, + const int idx, + const int H, + const int W, + const int y, + const int x, + const int V, + const int U, + const int v, + const int u, + const T pad_val) { + if ((y < 0) || (y >= H) || (x < 0) || (x >= W) || (v < 0) || (v >= V) || + (u < 0) || (u >= U)) { + return pad_val; + } else { + return tensor[(((idx * V + v) * U + u) * H + y) * W + x]; + } +} + +template +__device__ inline void add_pixel_val( + T* tensor, + const T val, + const int idx, + const int H, + const int W, + const int y, + const int x, + const int V, + const int U, + const int v, + const int u) { + if ((val == 0.) || (y < 0) || (y >= H) || (x < 0) || (x >= W) || (v < 0) || + (v >= V) || (u < 0) || (u >= U)) { + return; + } else { + atomicAdd(tensor + ((((idx * V + v) * U + u) * H + y) * W + x), val); + } +} + +template +__global__ void SwapAlign2NatForwardFeat( + const int nthreads, + const T* bottom_data, + const int Vout, + const int Uout, + const float hVout, + const float hUout, + const int Vin, + const int Uin, + const float lambda, + const int Hin, + const int Win, + const int Hout, + const int Wout, + const T pad_val, + T* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int idx = index; + const int x = idx % Wout; + idx /= Wout; + const int y = idx % Hout; + idx /= Hout; + const int u = idx % Uout; + idx /= Uout; + const int v = idx % Vout; + idx /= Vout; + + const float ox = x * lambda + u - hUout + 0.5; + const int xf = static_cast(floor(ox)); + const int xc = static_cast(ceil(ox)); + const float xwc = ox - xf; + const float xwf = 1. 
- xwc; + + const float oy = y * lambda + v - hVout + 0.5; + const int yf = static_cast(floor(oy)); + const int yc = static_cast(ceil(oy)); + const float ywc = oy - yf; + const float ywf = 1. - ywc; + + const float ou = (u + 0.5) / lambda - 0.5; + const int uf = static_cast(floor(ou)); + const int uc = static_cast(ceil(ou)); + const float uwc = ou - uf; + const float uwf = 1. - uwc; + + const float ov = (v + 0.5) / lambda - 0.5; + const int vf = static_cast(floor(ov)); + const int vc = static_cast(ceil(ov)); + const float vwc = ov - vf; + const float vwf = 1. - vwc; + + T val = ywf * xwf * vwf * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uf, pad_val) + + ywf * xwf * vwf * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vf, uc, pad_val) + + ywf * xwf * vwc * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uf, pad_val) + + ywf * xwf * vwc * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xf, Vin, Uin, vc, uc, pad_val) + + ywf * xwc * vwf * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uf, pad_val) + + ywf * xwc * vwf * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vf, uc, pad_val) + + ywf * xwc * vwc * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uf, pad_val) + + ywf * xwc * vwc * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yf, xc, Vin, Uin, vc, uc, pad_val) + + ywc * xwf * vwf * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uf, pad_val) + + ywc * xwf * vwf * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vf, uc, pad_val) + + ywc * xwf * vwc * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uf, pad_val) + + ywc * xwf * vwc * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xf, Vin, Uin, vc, uc, pad_val) + + ywc * xwc * vwf * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uf, pad_val) + + ywc * xwc * vwf * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vf, uc, pad_val) + + ywc * xwc * vwc * uwf * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uf, pad_val) + + ywc * xwc * vwc * uwc * + get_pixel_val( + bottom_data, idx, Hin, Win, yc, xc, Vin, Uin, vc, uc, pad_val); + + top_data[index] = val; + } +} + +template +__global__ void SwapAlign2NatBackwardFeat( + const int nthreads, + const T* top_diff, + const int Vout, + const int Uout, + const float hVout, + const float hUout, + const int Vin, + const int Uin, + const float lambda, + const int Hin, + const int Win, + const int Hout, + const int Wout, + T* bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int idx = index; + const int x = idx % Wout; + idx /= Wout; + const int y = idx % Hout; + idx /= Hout; + const int u = idx % Uout; + idx /= Uout; + const int v = idx % Vout; + idx /= Vout; + + const float ox = x * lambda + u - hUout + 0.5; + const int xf = static_cast(floor(ox)); + const int xc = static_cast(ceil(ox)); + const float xwc = ox - xf; + const float xwf = 1. - xwc; + + const float oy = y * lambda + v - hVout + 0.5; + const int yf = static_cast(floor(oy)); + const int yc = static_cast(ceil(oy)); + const float ywc = oy - yf; + const float ywf = 1. - ywc; + + const float ou = (u + 0.5) / lambda - 0.5; + const int uf = static_cast(floor(ou)); + const int uc = static_cast(ceil(ou)); + const float uwc = ou - uf; + const float uwf = 1. 
- uwc; + + const float ov = (v + 0.5) / lambda - 0.5; + const int vf = static_cast(floor(ov)); + const int vc = static_cast(ceil(ov)); + const float vwc = ov - vf; + const float vwf = 1. - vwc; + + const T grad = top_diff[index]; + + add_pixel_val( + bottom_diff, + ywf * xwf * vwf * uwf * grad, + idx, + Hin, + Win, + yf, + xf, + Vin, + Uin, + vf, + uf); + add_pixel_val( + bottom_diff, + ywf * xwf * vwf * uwc * grad, + idx, + Hin, + Win, + yf, + xf, + Vin, + Uin, + vf, + uc); + add_pixel_val( + bottom_diff, + ywf * xwf * vwc * uwf * grad, + idx, + Hin, + Win, + yf, + xf, + Vin, + Uin, + vc, + uf); + add_pixel_val( + bottom_diff, + ywf * xwf * vwc * uwc * grad, + idx, + Hin, + Win, + yf, + xf, + Vin, + Uin, + vc, + uc); + add_pixel_val( + bottom_diff, + ywf * xwc * vwf * uwf * grad, + idx, + Hin, + Win, + yf, + xc, + Vin, + Uin, + vf, + uf); + add_pixel_val( + bottom_diff, + ywf * xwc * vwf * uwc * grad, + idx, + Hin, + Win, + yf, + xc, + Vin, + Uin, + vf, + uc); + add_pixel_val( + bottom_diff, + ywf * xwc * vwc * uwf * grad, + idx, + Hin, + Win, + yf, + xc, + Vin, + Uin, + vc, + uf); + add_pixel_val( + bottom_diff, + ywf * xwc * vwc * uwc * grad, + idx, + Hin, + Win, + yf, + xc, + Vin, + Uin, + vc, + uc); + add_pixel_val( + bottom_diff, + ywc * xwf * vwf * uwf * grad, + idx, + Hin, + Win, + yc, + xf, + Vin, + Uin, + vf, + uf); + add_pixel_val( + bottom_diff, + ywc * xwf * vwf * uwc * grad, + idx, + Hin, + Win, + yc, + xf, + Vin, + Uin, + vf, + uc); + add_pixel_val( + bottom_diff, + ywc * xwf * vwc * uwf * grad, + idx, + Hin, + Win, + yc, + xf, + Vin, + Uin, + vc, + uf); + add_pixel_val( + bottom_diff, + ywc * xwf * vwc * uwc * grad, + idx, + Hin, + Win, + yc, + xf, + Vin, + Uin, + vc, + uc); + add_pixel_val( + bottom_diff, + ywc * xwc * vwf * uwf * grad, + idx, + Hin, + Win, + yc, + xc, + Vin, + Uin, + vf, + uf); + add_pixel_val( + bottom_diff, + ywc * xwc * vwf * uwc * grad, + idx, + Hin, + Win, + yc, + xc, + Vin, + Uin, + vf, + uc); + add_pixel_val( + bottom_diff, + ywc * xwc * vwc * uwf * grad, + idx, + Hin, + Win, + yc, + xc, + Vin, + Uin, + vc, + uf); + add_pixel_val( + bottom_diff, + ywc * xwc * vwc * uwc * grad, + idx, + Hin, + Win, + yc, + xc, + Vin, + Uin, + vc, + uc); + } +} + +namespace tensormask { + +at::Tensor SwapAlign2Nat_forward_cuda( + const at::Tensor& X, + const int lambda_val, + const float pad_val) { + AT_ASSERTM(X.device().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(X.ndimension() == 4, "input must be a 4D tensor"); + AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1"); + const int N = X.size(0); + const int C = X.size(1); + const int Vin = static_cast(sqrt(static_cast(C))); + const int Uin = C / Vin; + AT_ASSERTM( + C == Vin * Uin && Vin == Uin, "#channels should be a square number"); + const int Vout = lambda_val * Vin; + const int Uout = lambda_val * Uin; + const int Hin = X.size(2); + const int Win = X.size(3); + const float lambda = static_cast(lambda_val); + const int Hout = static_cast(ceil(Hin / lambda)); + const int Wout = static_cast(ceil(Win / lambda)); + const float hVout = Vout / 2.; + const float hUout = Uout / 2.; + + at::cuda::CUDAGuard device_guard(X.device()); + + at::Tensor Y = at::empty({N, Vout * Uout, Hout, Wout}, X.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(at::cuda::ATenCeilDiv(Y.numel(), 512L), 4096L)); + dim3 block(512); + + if (Y.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return Y; + } + + auto X_ = X.contiguous(); + 
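+  // The dispatch below launches the forward kernel: each output element
+  // Y[n, v * Uout + u, y, x] is a 16-tap (quadrilinear) sample of X, i.e.
+  // bilinear interpolation in the spatial (y, x) plane combined with bilinear
+  // interpolation in the (v, u) plane, which swaps the unit lengths of the
+  // (V, U) and (H, W) axes (aligned -> natural). Taps outside X read pad_val.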
AT_DISPATCH_FLOATING_TYPES(X.scalar_type(), "SwapAlign2Nat_forward", [&] { + SwapAlign2NatForwardFeat<<>>( + Y.numel(), + X_.data_ptr(), + Vout, + Uout, + hVout, + hUout, + Vin, + Uin, + lambda, + Hin, + Win, + Hout, + Wout, + pad_val, + Y.data_ptr()); + }); + cudaDeviceSynchronize(); + AT_CUDA_CHECK(cudaGetLastError()); + return Y; +} + +at::Tensor SwapAlign2Nat_backward_cuda( + const at::Tensor& gY, + const int lambda_val, + const int batch_size, + const int channel, + const int height, + const int width) { + AT_ASSERTM(gY.device().is_cuda(), "input gradient must be a CUDA tensor"); + AT_ASSERTM(gY.ndimension() == 4, "input gradient must be a 4D tensor"); + AT_ASSERTM(lambda_val >= 1, "lambda should be greater or equal to 1"); + const int Vin = static_cast(sqrt(static_cast(channel))); + const int Uin = channel / Vin; + const int Vout = lambda_val * Vin; + const int Uout = lambda_val * Uin; + const float hVout = Vout / 2.; + const float hUout = Uout / 2.; + const int Hout = gY.size(2); + const int Wout = gY.size(3); + + at::cuda::CUDAGuard device_guard(gY.device()); + + at::Tensor gX = at::zeros({batch_size, channel, height, width}, gY.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(at::cuda::ATenCeilDiv(gY.numel(), 512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (gY.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return gX; + } + + auto gY_ = gY.contiguous(); + AT_DISPATCH_FLOATING_TYPES(gY.scalar_type(), "SwapAlign2Nat_backward", [&] { + SwapAlign2NatBackwardFeat<<>>( + gY.numel(), + gY_.data_ptr(), + Vout, + Uout, + hVout, + hUout, + Vin, + Uin, + static_cast(lambda_val), + height, + width, + Hout, + Wout, + gX.data_ptr()); + }); + AT_CUDA_CHECK(cudaGetLastError()); + return gX; +} + +} // namespace tensormask diff --git a/projects/TensorMask/tensormask/layers/csrc/vision.cpp b/projects/TensorMask/tensormask/layers/csrc/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad8e472c2cfc7c10e00cd6b00fc22c0dd9384dd1 --- /dev/null +++ b/projects/TensorMask/tensormask/layers/csrc/vision.cpp @@ -0,0 +1,19 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +#include +#include "SwapAlign2Nat/SwapAlign2Nat.h" + +namespace tensormask { + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def( + "swap_align2nat_forward", + &SwapAlign2Nat_forward, + "SwapAlign2Nat_forward"); + m.def( + "swap_align2nat_backward", + &SwapAlign2Nat_backward, + "SwapAlign2Nat_backward"); +} + +} // namespace tensormask diff --git a/projects/TensorMask/tensormask/layers/swap_align2nat.py b/projects/TensorMask/tensormask/layers/swap_align2nat.py new file mode 100644 index 0000000000000000000000000000000000000000..a72c98a968577eff2302d75e4cb41620e4ecf582 --- /dev/null +++ b/projects/TensorMask/tensormask/layers/swap_align2nat.py @@ -0,0 +1,61 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from tensormask import _C + + +class _SwapAlign2Nat(Function): + @staticmethod + def forward(ctx, X, lambda_val, pad_val): + ctx.lambda_val = lambda_val + ctx.input_shape = X.size() + + Y = _C.swap_align2nat_forward(X, lambda_val, pad_val) + return Y + + @staticmethod + @once_differentiable + def backward(ctx, gY): + lambda_val = ctx.lambda_val + bs, ch, h, w = ctx.input_shape + + gX = _C.swap_align2nat_backward(gY, lambda_val, bs, ch, h, w) + + return gX, None, None + + +swap_align2nat = _SwapAlign2Nat.apply + + +class SwapAlign2Nat(nn.Module): + """ + The op `SwapAlign2Nat` described in https://arxiv.org/abs/1903.12174. + Given an input tensor that predicts masks of shape (N, C=VxU, H, W), + apply the op, it will return masks of shape (N, V'xU', H', W') where + the unit lengths of (V, U) and (H, W) are swapped, and the mask representation + is transformed from aligned to natural. + Args: + lambda_val (int): the relative unit length ratio between (V, U) and (H, W), + as we always have larger unit lengths for (V, U) than (H, W), + lambda_val is always >= 1. + pad_val (float): padding value for the values falling outside of the input + tensor, default set to -6 as sigmoid(-6) is ~0, indicating + that is no masks outside of the tensor. + """ + + def __init__(self, lambda_val, pad_val=-6.0): + super(SwapAlign2Nat, self).__init__() + self.lambda_val = lambda_val + self.pad_val = pad_val + + def forward(self, X): + return swap_align2nat(X, self.lambda_val, self.pad_val) + + def __repr__(self): + tmpstr = self.__class__.__name__ + "(" + tmpstr += "lambda_val=" + str(self.lambda_val) + tmpstr += ", pad_val=" + str(self.pad_val) + tmpstr += ")" + return tmpstr diff --git a/projects/TensorMask/tests/__init__.py b/projects/TensorMask/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec --- /dev/null +++ b/projects/TensorMask/tests/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved diff --git a/projects/TensorMask/tests/test_swap_align2nat.py b/projects/TensorMask/tests/test_swap_align2nat.py new file mode 100755 index 0000000000000000000000000000000000000000..b3d018ce199ddaa19af25e8304d969e8f59c747a --- /dev/null +++ b/projects/TensorMask/tests/test_swap_align2nat.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import unittest +import torch +from torch.autograd import gradcheck + +from tensormask.layers.swap_align2nat import SwapAlign2Nat + + +class SwapAlign2NatTest(unittest.TestCase): + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_swap_align2nat_gradcheck_cuda(self): + dtype = torch.float64 + device = torch.device("cuda") + m = SwapAlign2Nat(2).to(dtype=dtype, device=device) + x = torch.rand(2, 4, 10, 10, dtype=dtype, device=device, requires_grad=True) + + self.assertTrue(gradcheck(m, x), "gradcheck failed for SwapAlign2Nat CUDA") + + def _swap_align2nat(self, tensor, lambda_val): + """ + The basic setup for testing Swap_Align + """ + op = SwapAlign2Nat(lambda_val, pad_val=0.0) + input = torch.from_numpy(tensor[None, :, :, :].astype("float32")) + output = op.forward(input.cuda()).cpu().numpy() + return output[0] + + +if __name__ == "__main__": + unittest.main() diff --git a/projects/TensorMask/train_net.py b/projects/TensorMask/train_net.py new file mode 100755 index 0000000000000000000000000000000000000000..b898fc77b7f52cae6ff398ac5aec73c59ab928ab --- /dev/null +++ b/projects/TensorMask/train_net.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +""" +TensorMask Training Script. + +This script is a simplified version of the training script in detectron2/tools. +""" + +import os + +import detectron2.utils.comm as comm +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch +from detectron2.evaluation import COCOEvaluator, verify_results + +from tensormask import add_tensormask_config + + +class Trainer(DefaultTrainer): + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + return COCOEvaluator(dataset_name, cfg, True, output_folder) + + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + add_tensormask_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + if comm.is_main_process(): + verify_results(cfg, res) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/projects/TridentNet/README.md b/projects/TridentNet/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b7a90102d008a498e93dff595a09206be5269e7 --- /dev/null +++ b/projects/TridentNet/README.md @@ -0,0 +1,60 @@ + +# TridentNet in Detectron2 +**Scale-Aware Trident Networks for Object Detection** + +Yanghao Li\*, Yuntao Chen\*, Naiyan Wang, Zhaoxiang Zhang + +[[`TridentNet`](https://github.com/TuSimple/simpledet/tree/master/models/tridentnet)] [[`arXiv`](https://arxiv.org/abs/1901.01892)] [[`BibTeX`](#CitingTridentNet)] + +
+ +
+
+In this repository, we implement TridentNet-Fast in Detectron2.
+Trident Network (TridentNet) aims to generate scale-specific feature maps with a uniform representational power. We construct a parallel multi-branch architecture in which each branch shares the same transformation parameters but with different receptive fields. TridentNet-Fast is a fast approximation of TridentNet that achieves significant improvements without any additional parameters or computational cost.
+
+## Training
+
+To train a model, run
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file <config.yaml>
+```
+
+For example, to launch end-to-end TridentNet training with a ResNet-50 backbone on 8 GPUs,
+one should execute:
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --num-gpus 8
+```
+
+## Evaluation
+
+Model evaluation can be done similarly:
+```bash
+python /path/to/detectron2/projects/TridentNet/train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --eval-only MODEL.WEIGHTS model.pth
+```
+
+## Results on MS-COCO in Detectron2
+
+|Model|Backbone|Head|lr sched|AP|AP50|AP75|APs|APm|APl|download|
+|-----|--------|----|--------|--|----|----|---|---|---|--------|
+|Faster|R50-C4|C5-512ROI|1X|35.7|56.1|38.0|19.2|40.9|48.7|model \| metrics|
+|TridentFast|R50-C4|C5-128ROI|1X|38.0|58.1|40.8|19.5|42.2|54.6|model \| metrics|
+|Faster|R50-C4|C5-512ROI|3X|38.4|58.7|41.3|20.7|42.7|53.1|model \| metrics|
+|TridentFast|R50-C4|C5-128ROI|3X|40.6|60.8|43.6|23.4|44.7|57.1|model \| metrics|
+|Faster|R101-C4|C5-512ROI|3X|41.1|61.4|44.0|22.2|45.5|55.9|model \| metrics|
+|TridentFast|R101-C4|C5-128ROI|3X|43.6|63.4|47.0|24.3|47.8|60.0|model \| metrics|
+
+
+## Citing TridentNet
+
+If you use TridentNet, please use the following BibTeX entry.
+ +``` +@InProceedings{li2019scale, + title={Scale-Aware Trident Networks for Object Detection}, + author={Li, Yanghao and Chen, Yuntao and Wang, Naiyan and Zhang, Zhaoxiang}, + journal={The International Conference on Computer Vision (ICCV)}, + year={2019} +} +``` + diff --git a/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml b/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8c3d80797ba9ae63a5669ccbd74a0d2006fee3b7 --- /dev/null +++ b/projects/TridentNet/configs/Base-TridentNet-Fast-C4.yaml @@ -0,0 +1,29 @@ +MODEL: + META_ARCHITECTURE: "GeneralizedRCNN" + BACKBONE: + NAME: "build_trident_resnet_backbone" + ROI_HEADS: + NAME: "TridentRes5ROIHeads" + POSITIVE_FRACTION: 0.5 + BATCH_SIZE_PER_IMAGE: 128 + PROPOSAL_APPEND_GT: False + PROPOSAL_GENERATOR: + NAME: "TridentRPN" + RPN: + POST_NMS_TOPK_TRAIN: 500 + TRIDENT: + NUM_BRANCH: 3 + BRANCH_DILATIONS: [1, 2, 3] + TEST_BRANCH_IDX: 1 + TRIDENT_STAGE: "res4" +DATASETS: + TRAIN: ("coco_2017_train",) + TEST: ("coco_2017_val",) +SOLVER: + IMS_PER_BATCH: 16 + BASE_LR: 0.02 + STEPS: (60000, 80000) + MAX_ITER: 90000 +INPUT: + MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) +VERSION: 2 diff --git a/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml b/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..bc83c2f9e7b7653c8982e657b5f116abe6ad6e1f --- /dev/null +++ b/projects/TridentNet/configs/tridentnet_fast_R_101_C4_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "Base-TridentNet-Fast-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-101.pkl" + MASK_ON: False + RESNETS: + DEPTH: 101 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..fda2cb6622d732c0f70d74d567c26182a9a41c44 --- /dev/null +++ b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_1x.yaml @@ -0,0 +1,6 @@ +_BASE_: "Base-TridentNet-Fast-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 diff --git a/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ebf89d03ea043810b02e71ecc2c1711c250e161c --- /dev/null +++ b/projects/TridentNet/configs/tridentnet_fast_R_50_C4_3x.yaml @@ -0,0 +1,9 @@ +_BASE_: "Base-TridentNet-Fast-C4.yaml" +MODEL: + WEIGHTS: "detectron2://ImageNetPretrained/MSRA/R-50.pkl" + MASK_ON: False + RESNETS: + DEPTH: 50 +SOLVER: + STEPS: (210000, 250000) + MAX_ITER: 270000 diff --git a/projects/TridentNet/train_net.py b/projects/TridentNet/train_net.py new file mode 100755 index 0000000000000000000000000000000000000000..eac2ec5c39e4a3ce2221f354dcea288bffcb1fbb --- /dev/null +++ b/projects/TridentNet/train_net.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +""" +TridentNet Training Script. + +This script is a simplified version of the training script in detectron2/tools. 
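+
+Example (see the project README): to train TridentNet-Fast with a ResNet-50
+backbone on 8 GPUs, run
+    python train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml --num-gpus 8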
+""" + +import os + +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch +from detectron2.evaluation import COCOEvaluator + +from tridentnet import add_tridentnet_config + + +class Trainer(DefaultTrainer): + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + return COCOEvaluator(dataset_name, cfg, True, output_folder) + + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + add_tridentnet_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/projects/TridentNet/tridentnet/__init__.py b/projects/TridentNet/tridentnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2fcdeb45a03d3835b3c2498ca8021a11d8cb4758 --- /dev/null +++ b/projects/TridentNet/tridentnet/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from .config import add_tridentnet_config +from .trident_backbone import ( + TridentBottleneckBlock, + build_trident_resnet_backbone, + make_trident_stage, +) +from .trident_rpn import TridentRPN +from .trident_rcnn import TridentRes5ROIHeads, TridentStandardROIHeads diff --git a/projects/TridentNet/tridentnet/config.py b/projects/TridentNet/tridentnet/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f33f473cb32633d9ba6582f0406ffe0a929d23c6 --- /dev/null +++ b/projects/TridentNet/tridentnet/config.py @@ -0,0 +1,26 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +from detectron2.config import CfgNode as CN + + +def add_tridentnet_config(cfg): + """ + Add config for tridentnet. + """ + _C = cfg + + _C.MODEL.TRIDENT = CN() + + # Number of branches for TridentNet. + _C.MODEL.TRIDENT.NUM_BRANCH = 3 + # Specify the dilations for each branch. + _C.MODEL.TRIDENT.BRANCH_DILATIONS = [1, 2, 3] + # Specify the stage for applying trident blocks. Default stage is Res4 according to the + # TridentNet paper. + _C.MODEL.TRIDENT.TRIDENT_STAGE = "res4" + # Specify the test branch index TridentNet Fast inference: + # - use -1 to aggregate results of all branches during inference. + # - otherwise, only using specified branch for fast inference. Recommended setting is + # to use the middle branch. 
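+    # For example (usage sketch): to aggregate all branches at test time instead
+    # of only the middle branch, append the override to the evaluation command
+    # from the project README:
+    #   train_net.py --config-file configs/tridentnet_fast_R_50_C4_1x.yaml \
+    #       --eval-only MODEL.WEIGHTS model.pth MODEL.TRIDENT.TEST_BRANCH_IDX -1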
+ _C.MODEL.TRIDENT.TEST_BRANCH_IDX = 1 diff --git a/projects/TridentNet/tridentnet/trident_backbone.py b/projects/TridentNet/tridentnet/trident_backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..232dfaf1ca01c0395c0ceea544bfbdee0d45ce1a --- /dev/null +++ b/projects/TridentNet/tridentnet/trident_backbone.py @@ -0,0 +1,223 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import fvcore.nn.weight_init as weight_init +import torch +import torch.nn.functional as F + +from detectron2.layers import Conv2d, FrozenBatchNorm2d, get_norm +from detectron2.modeling import BACKBONE_REGISTRY, ResNet, ResNetBlockBase, make_stage +from detectron2.modeling.backbone.resnet import BasicStem, BottleneckBlock, DeformBottleneckBlock + +from .trident_conv import TridentConv + +__all__ = ["TridentBottleneckBlock", "make_trident_stage", "build_trident_resnet_backbone"] + + +class TridentBottleneckBlock(ResNetBlockBase): + def __init__( + self, + in_channels, + out_channels, + *, + bottleneck_channels, + stride=1, + num_groups=1, + norm="BN", + stride_in_1x1=False, + num_branch=3, + dilations=(1, 2, 3), + concat_output=False, + test_branch_idx=-1, + ): + """ + Args: + num_branch (int): the number of branches in TridentNet. + dilations (tuple): the dilations of multiple branches in TridentNet. + concat_output (bool): if concatenate outputs of multiple branches in TridentNet. + Use 'True' for the last trident block. + """ + super().__init__(in_channels, out_channels, stride) + + assert num_branch == len(dilations) + + self.num_branch = num_branch + self.concat_output = concat_output + self.test_branch_idx = test_branch_idx + + if in_channels != out_channels: + self.shortcut = Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=stride, + bias=False, + norm=get_norm(norm, out_channels), + ) + else: + self.shortcut = None + + stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride) + + self.conv1 = Conv2d( + in_channels, + bottleneck_channels, + kernel_size=1, + stride=stride_1x1, + bias=False, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv2 = TridentConv( + bottleneck_channels, + bottleneck_channels, + kernel_size=3, + stride=stride_3x3, + paddings=dilations, + bias=False, + groups=num_groups, + dilations=dilations, + num_branch=num_branch, + test_branch_idx=test_branch_idx, + norm=get_norm(norm, bottleneck_channels), + ) + + self.conv3 = Conv2d( + bottleneck_channels, + out_channels, + kernel_size=1, + bias=False, + norm=get_norm(norm, out_channels), + ) + + for layer in [self.conv1, self.conv2, self.conv3, self.shortcut]: + if layer is not None: # shortcut can be None + weight_init.c2_msra_fill(layer) + + def forward(self, x): + num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1 + if not isinstance(x, list): + x = [x] * num_branch + out = [self.conv1(b) for b in x] + out = [F.relu_(b) for b in out] + + out = self.conv2(out) + out = [F.relu_(b) for b in out] + + out = [self.conv3(b) for b in out] + + if self.shortcut is not None: + shortcut = [self.shortcut(b) for b in x] + else: + shortcut = x + + out = [out_b + shortcut_b for out_b, shortcut_b in zip(out, shortcut)] + out = [F.relu_(b) for b in out] + if self.concat_output: + out = torch.cat(out) + return out + + +def make_trident_stage(block_class, num_blocks, first_stride, **kwargs): + """ + Create a resnet stage by creating many blocks for TridentNet. 
+ """ + blocks = [] + for i in range(num_blocks - 1): + blocks.append(block_class(stride=first_stride if i == 0 else 1, **kwargs)) + kwargs["in_channels"] = kwargs["out_channels"] + blocks.append(block_class(stride=1, concat_output=True, **kwargs)) + return blocks + + +@BACKBONE_REGISTRY.register() +def build_trident_resnet_backbone(cfg, input_shape): + """ + Create a ResNet instance from config for TridentNet. + + Returns: + ResNet: a :class:`ResNet` instance. + """ + # need registration of new blocks/stems? + norm = cfg.MODEL.RESNETS.NORM + stem = BasicStem( + in_channels=input_shape.channels, + out_channels=cfg.MODEL.RESNETS.STEM_OUT_CHANNELS, + norm=norm, + ) + freeze_at = cfg.MODEL.BACKBONE.FREEZE_AT + + if freeze_at >= 1: + for p in stem.parameters(): + p.requires_grad = False + stem = FrozenBatchNorm2d.convert_frozen_batchnorm(stem) + + # fmt: off + out_features = cfg.MODEL.RESNETS.OUT_FEATURES + depth = cfg.MODEL.RESNETS.DEPTH + num_groups = cfg.MODEL.RESNETS.NUM_GROUPS + width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP + bottleneck_channels = num_groups * width_per_group + in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS + out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS + stride_in_1x1 = cfg.MODEL.RESNETS.STRIDE_IN_1X1 + res5_dilation = cfg.MODEL.RESNETS.RES5_DILATION + deform_on_per_stage = cfg.MODEL.RESNETS.DEFORM_ON_PER_STAGE + deform_modulated = cfg.MODEL.RESNETS.DEFORM_MODULATED + deform_num_groups = cfg.MODEL.RESNETS.DEFORM_NUM_GROUPS + num_branch = cfg.MODEL.TRIDENT.NUM_BRANCH + branch_dilations = cfg.MODEL.TRIDENT.BRANCH_DILATIONS + trident_stage = cfg.MODEL.TRIDENT.TRIDENT_STAGE + test_branch_idx = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX + # fmt: on + assert res5_dilation in {1, 2}, "res5_dilation cannot be {}.".format(res5_dilation) + + num_blocks_per_stage = {50: [3, 4, 6, 3], 101: [3, 4, 23, 3], 152: [3, 8, 36, 3]}[depth] + + stages = [] + + res_stage_idx = {"res2": 2, "res3": 3, "res4": 4, "res5": 5} + out_stage_idx = [res_stage_idx[f] for f in out_features] + trident_stage_idx = res_stage_idx[trident_stage] + max_stage_idx = max(out_stage_idx) + for idx, stage_idx in enumerate(range(2, max_stage_idx + 1)): + dilation = res5_dilation if stage_idx == 5 else 1 + first_stride = 1 if idx == 0 or (stage_idx == 5 and dilation == 2) else 2 + stage_kargs = { + "num_blocks": num_blocks_per_stage[idx], + "first_stride": first_stride, + "in_channels": in_channels, + "bottleneck_channels": bottleneck_channels, + "out_channels": out_channels, + "num_groups": num_groups, + "norm": norm, + "stride_in_1x1": stride_in_1x1, + "dilation": dilation, + } + if stage_idx == trident_stage_idx: + assert not deform_on_per_stage[ + idx + ], "Not support deformable conv in Trident blocks yet." 
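+            # For the trident stage, swap in TridentBottleneckBlock: the single
+            # "dilation" key is dropped and replaced by per-branch "dilations",
+            # since the weight-shared branches differ only in their dilation.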
+ stage_kargs["block_class"] = TridentBottleneckBlock + stage_kargs["num_branch"] = num_branch + stage_kargs["dilations"] = branch_dilations + stage_kargs["test_branch_idx"] = test_branch_idx + stage_kargs.pop("dilation") + elif deform_on_per_stage[idx]: + stage_kargs["block_class"] = DeformBottleneckBlock + stage_kargs["deform_modulated"] = deform_modulated + stage_kargs["deform_num_groups"] = deform_num_groups + else: + stage_kargs["block_class"] = BottleneckBlock + blocks = ( + make_trident_stage(**stage_kargs) + if stage_idx == trident_stage_idx + else make_stage(**stage_kargs) + ) + in_channels = out_channels + out_channels *= 2 + bottleneck_channels *= 2 + + if freeze_at >= stage_idx: + for block in blocks: + block.freeze() + stages.append(blocks) + return ResNet(stem, stages, out_features=out_features) diff --git a/projects/TridentNet/tridentnet/trident_conv.py b/projects/TridentNet/tridentnet/trident_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..7e2d5252bda5ebb2e9eee10af9c9a14fc72bb8fe --- /dev/null +++ b/projects/TridentNet/tridentnet/trident_conv.py @@ -0,0 +1,107 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.modules.utils import _pair + +from detectron2.layers.wrappers import _NewEmptyTensorOp + + +class TridentConv(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + paddings=0, + dilations=1, + groups=1, + num_branch=1, + test_branch_idx=-1, + bias=False, + norm=None, + activation=None, + ): + super(TridentConv, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.num_branch = num_branch + self.stride = _pair(stride) + self.groups = groups + self.with_bias = bias + if isinstance(paddings, int): + paddings = [paddings] * self.num_branch + if isinstance(dilations, int): + dilations = [dilations] * self.num_branch + self.paddings = [_pair(padding) for padding in paddings] + self.dilations = [_pair(dilation) for dilation in dilations] + self.test_branch_idx = test_branch_idx + self.norm = norm + self.activation = activation + + assert len({self.num_branch, len(self.paddings), len(self.dilations)}) == 1 + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, *self.kernel_size) + ) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.bias = None + + nn.init.kaiming_uniform_(self.weight, nonlinearity="relu") + if self.bias is not None: + nn.init.constant_(self.bias, 0) + + def forward(self, inputs): + num_branch = self.num_branch if self.training or self.test_branch_idx == -1 else 1 + assert len(inputs) == num_branch + + if inputs[0].numel() == 0: + output_shape = [ + (i + 2 * p - (di * (k - 1) + 1)) // s + 1 + for i, p, di, k, s in zip( + inputs[0].shape[-2:], self.padding, self.dilation, self.kernel_size, self.stride + ) + ] + output_shape = [input[0].shape[0], self.weight.shape[0]] + output_shape + return [_NewEmptyTensorOp.apply(input, output_shape) for input in inputs] + + if self.training or self.test_branch_idx == -1: + outputs = [ + F.conv2d(input, self.weight, self.bias, self.stride, padding, dilation, self.groups) + for input, dilation, padding in zip(inputs, self.dilations, self.paddings) + ] + else: + outputs = [ + F.conv2d( + inputs[0], + self.weight, + self.bias, + self.stride, + self.paddings[self.test_branch_idx], + 
self.dilations[self.test_branch_idx], + self.groups, + ) + ] + + if self.norm is not None: + outputs = [self.norm(x) for x in outputs] + if self.activation is not None: + outputs = [self.activation(x) for x in outputs] + return outputs + + def extra_repr(self): + tmpstr = "in_channels=" + str(self.in_channels) + tmpstr += ", out_channels=" + str(self.out_channels) + tmpstr += ", kernel_size=" + str(self.kernel_size) + tmpstr += ", num_branch=" + str(self.num_branch) + tmpstr += ", test_branch_idx=" + str(self.test_branch_idx) + tmpstr += ", stride=" + str(self.stride) + tmpstr += ", paddings=" + str(self.paddings) + tmpstr += ", dilations=" + str(self.dilations) + tmpstr += ", groups=" + str(self.groups) + tmpstr += ", bias=" + str(self.with_bias) + return tmpstr diff --git a/projects/TridentNet/tridentnet/trident_rcnn.py b/projects/TridentNet/tridentnet/trident_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..65deb90977c525f9e42ea9b2581944832a9af47e --- /dev/null +++ b/projects/TridentNet/tridentnet/trident_rcnn.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +from detectron2.layers import batched_nms +from detectron2.modeling import ROI_HEADS_REGISTRY, StandardROIHeads +from detectron2.modeling.roi_heads.roi_heads import Res5ROIHeads +from detectron2.structures import Instances + + +def merge_branch_instances(instances, num_branch, nms_thresh, topk_per_image): + """ + Merge detection results from different branches of TridentNet. + Return detection results by applying non-maximum suppression (NMS) on bounding boxes + and keep the unsuppressed boxes and other instances (e.g mask) if any. + + Args: + instances (list[Instances]): A list of N * num_branch instances that store detection + results. Contain N images and each image has num_branch instances. + num_branch (int): Number of branches used for merging detection results for each image. + nms_thresh (float): The threshold to use for box non-maximum suppression. Value in [0, 1]. + topk_per_image (int): The number of top scoring detections to return. Set < 0 to return + all detections. + + Returns: + results: (list[Instances]): A list of N instances, one for each image in the batch, + that stores the topk most confidence detections after merging results from multiple + branches. + """ + if num_branch == 1: + return instances + + batch_size = len(instances) // num_branch + results = [] + for i in range(batch_size): + instance = Instances.cat([instances[i + batch_size * j] for j in range(num_branch)]) + + # Apply per-class NMS + keep = batched_nms( + instance.pred_boxes.tensor, instance.scores, instance.pred_classes, nms_thresh + ) + keep = keep[:topk_per_image] + result = instance[keep] + + results.append(result) + + return results + + +@ROI_HEADS_REGISTRY.register() +class TridentRes5ROIHeads(Res5ROIHeads): + """ + The TridentNet ROIHeads in a typical "C4" R-CNN model. + See :class:`Res5ROIHeads`. + """ + + def __init__(self, cfg, input_shape): + super().__init__(cfg, input_shape) + + self.num_branch = cfg.MODEL.TRIDENT.NUM_BRANCH + self.trident_fast = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX != -1 + + def forward(self, images, features, proposals, targets=None): + """ + See :class:`Res5ROIHeads.forward`. 
+ """ + num_branch = self.num_branch if self.training or not self.trident_fast else 1 + all_targets = targets * num_branch if targets is not None else None + pred_instances, losses = super().forward(images, features, proposals, all_targets) + del images, all_targets, targets + + if self.training: + return pred_instances, losses + else: + pred_instances = merge_branch_instances( + pred_instances, + num_branch, + self.box_predictor.test_nms_thresh, + self.box_predictor.test_topk_per_image, + ) + + return pred_instances, {} + + +@ROI_HEADS_REGISTRY.register() +class TridentStandardROIHeads(StandardROIHeads): + """ + The `StandardROIHeads` for TridentNet. + See :class:`StandardROIHeads`. + """ + + def __init__(self, cfg, input_shape): + super(TridentStandardROIHeads, self).__init__(cfg, input_shape) + + self.num_branch = cfg.MODEL.TRIDENT.NUM_BRANCH + self.trident_fast = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX != -1 + + def forward(self, images, features, proposals, targets=None): + """ + See :class:`Res5ROIHeads.forward`. + """ + # Use 1 branch if using trident_fast during inference. + num_branch = self.num_branch if self.training or not self.trident_fast else 1 + # Duplicate targets for all branches in TridentNet. + all_targets = targets * num_branch if targets is not None else None + pred_instances, losses = super().forward(images, features, proposals, all_targets) + del images, all_targets, targets + + if self.training: + return pred_instances, losses + else: + pred_instances = merge_branch_instances( + pred_instances, + num_branch, + self.box_predictor.test_nms_thresh, + self.box_predictor.test_topk_per_image, + ) + + return pred_instances, {} diff --git a/projects/TridentNet/tridentnet/trident_rpn.py b/projects/TridentNet/tridentnet/trident_rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..c30137f312232ccccd86182108949fbe34b97231 --- /dev/null +++ b/projects/TridentNet/tridentnet/trident_rpn.py @@ -0,0 +1,32 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import torch + +from detectron2.modeling import PROPOSAL_GENERATOR_REGISTRY +from detectron2.modeling.proposal_generator.rpn import RPN +from detectron2.structures import ImageList + + +@PROPOSAL_GENERATOR_REGISTRY.register() +class TridentRPN(RPN): + """ + Trident RPN subnetwork. + """ + + def __init__(self, cfg, input_shape): + super(TridentRPN, self).__init__(cfg, input_shape) + + self.num_branch = cfg.MODEL.TRIDENT.NUM_BRANCH + self.trident_fast = cfg.MODEL.TRIDENT.TEST_BRANCH_IDX != -1 + + def forward(self, images, features, gt_instances=None): + """ + See :class:`RPN.forward`. + """ + num_branch = self.num_branch if self.training or not self.trident_fast else 1 + # Duplicate images and gt_instances for all branches in TridentNet. 
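+        # The branches are folded into the batch dimension, so the RPN produces a
+        # separate set of proposals per branch; with TridentNet-Fast at inference
+        # time num_branch is 1 and no duplication happens.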
+ all_images = ImageList( + torch.cat([images.tensor] * num_branch), images.image_sizes * num_branch + ) + all_gt_instances = gt_instances * num_branch if gt_instances is not None else None + + return super(TridentRPN, self).forward(all_images, features, all_gt_instances) diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..b09bba99ca88d5cc900d1cc7fb0947d0443522be --- /dev/null +++ b/setup.cfg @@ -0,0 +1,26 @@ +[isort] +line_length=100 +multi_line_output=3 +include_trailing_comma=True +known_standard_library=numpy,setuptools,mock +skip=./datasets,docs +skip_glob=*/__init__.py +known_myself=detectron2 +known_third_party=fvcore,matplotlib,cv2,torch,torchvision,PIL,pycocotools,yacs,termcolor,cityscapesscripts,tabulate,tqdm,scipy,lvis,psutil,pkg_resources,caffe2,onnx +no_lines_before=STDLIB,THIRDPARTY +sections=FUTURE,STDLIB,THIRDPARTY,myself,FIRSTPARTY,LOCALFOLDER +default_section=FIRSTPARTY + +[mypy] +python_version=3.6 +ignore_missing_imports = True +warn_unused_configs = True +disallow_untyped_defs = True +check_untyped_defs = True +warn_unused_ignores = True +warn_redundant_casts = True +show_column_numbers = True +follow_imports = silent +allow_redefinition = True +; Require all functions to be annotated +disallow_incomplete_defs = True diff --git a/setup.py b/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..a863fab1b7658a888df8623b57fe53673698cf60 --- /dev/null +++ b/setup.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import glob +import os +import shutil +from os import path +from setuptools import find_packages, setup +from typing import List +import torch +from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension + +torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] +assert torch_ver >= [1, 4], "Requires PyTorch >= 1.4" + + +def get_version(): + init_py_path = path.join(path.abspath(path.dirname(__file__)), "detectron2", "__init__.py") + init_py = open(init_py_path, "r").readlines() + version_line = [l.strip() for l in init_py if l.startswith("__version__")][0] + version = version_line.split("=")[-1].strip().strip("'\"") + + # The following is used to build release packages. + # Users should never use it. 
+ suffix = os.getenv("D2_VERSION_SUFFIX", "") + version = version + suffix + if os.getenv("BUILD_NIGHTLY", "0") == "1": + from datetime import datetime + + date_str = datetime.today().strftime("%y%m%d") + version = version + ".dev" + date_str + + new_init_py = [l for l in init_py if not l.startswith("__version__")] + new_init_py.append('__version__ = "{}"\n'.format(version)) + with open(init_py_path, "w") as f: + f.write("".join(new_init_py)) + return version + + +def get_extensions(): + this_dir = path.dirname(path.abspath(__file__)) + extensions_dir = path.join(this_dir, "detectron2", "layers", "csrc") + + main_source = path.join(extensions_dir, "vision.cpp") + sources = glob.glob(path.join(extensions_dir, "**", "*.cpp")) + source_cuda = glob.glob(path.join(extensions_dir, "**", "*.cu")) + glob.glob( + path.join(extensions_dir, "*.cu") + ) + + sources = [main_source] + sources + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if ( + torch.cuda.is_available() and CUDA_HOME is not None and os.path.isdir(CUDA_HOME) + ) or os.getenv("FORCE_CUDA", "0") == "1": + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + + # It's better if pytorch can do this by default .. + CC = os.environ.get("CC", None) + if CC is not None: + extra_compile_args["nvcc"].append("-ccbin={}".format(CC)) + + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "detectron2._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +def get_model_zoo_configs() -> List[str]: + """ + Return a list of configs to include in package for model zoo. Copy over these configs inside + detectron2/model_zoo. + """ + + # Use absolute paths while symlinking. + source_configs_dir = path.join(path.dirname(path.realpath(__file__)), "configs") + destination = path.join( + path.dirname(path.realpath(__file__)), "detectron2", "model_zoo", "configs" + ) + # Symlink the config directory inside package to have a cleaner pip install. + + # Remove stale symlink/directory from a previous build. + if path.exists(source_configs_dir): + if path.islink(destination): + os.unlink(destination) + elif path.isdir(destination): + shutil.rmtree(destination) + + if not path.exists(destination): + try: + os.symlink(source_configs_dir, destination) + except OSError: + # Fall back to copying if symlink fails: ex. on Windows. 
+ shutil.copytree(source_configs_dir, destination) + + config_paths = glob.glob("configs/**/*.yaml", recursive=True) + return config_paths + + +setup( + name="detectron2", + version=get_version(), + author="FAIR", + url="https://github.com/facebookresearch/detectron2", + description="Detectron2 is FAIR's next-generation research " + "platform for object detection and segmentation.", + packages=find_packages(exclude=("configs", "tests*")), + package_data={"detectron2.model_zoo": get_model_zoo_configs()}, + python_requires=">=3.6", + install_requires=[ + "termcolor>=1.1", + "Pillow", # you can also use pillow-simd for better performance + "yacs>=0.1.6", + "tabulate", + "cloudpickle", + "matplotlib", + "mock", + "tqdm>4.29.0", + "tensorboard", + "fvcore>=0.1.1", + "future", # used by caffe2 + "pydot", # used to save caffe2 SVGs + ], + extras_require={ + "all": ["shapely", "psutil"], + "dev": [ + "flake8==3.7.9", + "isort", + "black @ git+https://github.com/psf/black@673327449f86fce558adde153bb6cbe54bfebad2", + "flake8-bugbear", + "flake8-comprehensions", + ], + }, + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f560384045ab4f6bc2beabef1170308fca117eb3 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,9 @@ +## Unit Tests + +To run the unittests, do: +``` +cd detectron2 +python -m unittest discover -v -s ./tests +``` + +There are also end-to-end inference & training tests, in [dev/run_*_tests.sh](../dev). diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..168f9979a4623806934b0ff1102ac166704e7dec --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved diff --git a/tests/data/__init__.py b/tests/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/data/test_coco.py b/tests/data/test_coco.py new file mode 100644 index 0000000000000000000000000000000000000000..2cd807d0ae465ad2e060a373f2e75db2483771c7 --- /dev/null +++ b/tests/data/test_coco.py @@ -0,0 +1,77 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import json +import numpy as np +import os +import tempfile +import unittest +import pycocotools + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets.coco import convert_to_coco_dict, load_coco_json +from detectron2.structures import BoxMode + + +def make_mask(): + """ + Makes a donut shaped binary mask. + """ + H = 100 + W = 100 + mask = np.zeros([H, W], dtype=np.uint8) + for x in range(W): + for y in range(H): + d = np.linalg.norm(np.array([W, H]) / 2 - np.array([x, y])) + if d > 10 and d < 20: + mask[y, x] = 1 + return mask + + +def make_dataset_dicts(mask): + """ + Returns a list of dicts that represents a single COCO data point for + object detection. The single instance given by `mask` is represented by + RLE. 
+ """ + record = {} + record["file_name"] = "test" + record["image_id"] = 0 + record["height"] = mask.shape[0] + record["width"] = mask.shape[1] + + y, x = np.nonzero(mask) + segmentation = pycocotools.mask.encode(np.asarray(mask, order="F")) + min_x = np.min(x) + max_x = np.max(x) + min_y = np.min(y) + max_y = np.max(y) + obj = { + "bbox": [min_x, min_y, max_x, max_y], + "bbox_mode": BoxMode.XYXY_ABS, + "category_id": 0, + "iscrowd": 0, + "segmentation": segmentation, + } + record["annotations"] = [obj] + return [record] + + +class TestRLEToJson(unittest.TestCase): + def test(self): + # Make a dummy dataset. + mask = make_mask() + DatasetCatalog.register("test_dataset", lambda: make_dataset_dicts(mask)) + MetadataCatalog.get("test_dataset").set(thing_classes=["test_label"]) + + # Dump to json. + json_dict = convert_to_coco_dict("test_dataset") + with tempfile.TemporaryDirectory() as tmpdir: + json_file_name = os.path.join(tmpdir, "test.json") + with open(json_file_name, "w") as f: + json.dump(json_dict, f) + # Load from json. + dicts = load_coco_json(json_file_name, "") + + # Check the loaded mask matches the original. + anno = dicts[0]["annotations"][0] + loaded_mask = pycocotools.mask.decode(anno["segmentation"]) + self.assertTrue(np.array_equal(loaded_mask, mask)) diff --git a/tests/data/test_detection_utils.py b/tests/data/test_detection_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..bdd94dd92366418347cc74a58e807240fd795111 --- /dev/null +++ b/tests/data/test_detection_utils.py @@ -0,0 +1,116 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + +import copy +import numpy as np +import unittest +import pycocotools.mask as mask_util + +from detectron2.data import detection_utils +from detectron2.data import transforms as T +from detectron2.structures import BitMasks, BoxMode + + +class TestTransformAnnotations(unittest.TestCase): + def test_transform_simple_annotation(self): + transforms = T.TransformList([T.HFlipTransform(400)]) + anno = { + "bbox": np.asarray([10, 10, 200, 300]), + "bbox_mode": BoxMode.XYXY_ABS, + "category_id": 3, + "segmentation": [[10, 10, 100, 100, 100, 10], [150, 150, 200, 150, 200, 200]], + } + + output = detection_utils.transform_instance_annotations(anno, transforms, (400, 400)) + self.assertTrue(np.allclose(output["bbox"], [200, 10, 390, 300])) + self.assertEqual(len(output["segmentation"]), len(anno["segmentation"])) + self.assertTrue(np.allclose(output["segmentation"][0], [390, 10, 300, 100, 300, 10])) + + detection_utils.annotations_to_instances([output, output], (400, 400)) + + def test_flip_keypoints(self): + transforms = T.TransformList([T.HFlipTransform(400)]) + anno = { + "bbox": np.asarray([10, 10, 200, 300]), + "bbox_mode": BoxMode.XYXY_ABS, + "keypoints": np.random.rand(17, 3) * 50 + 15, + } + + output = detection_utils.transform_instance_annotations( + copy.deepcopy(anno), + transforms, + (400, 400), + keypoint_hflip_indices=detection_utils.create_keypoint_hflip_indices( + ["keypoints_coco_2017_train"] + ), + ) + # The first keypoint is nose + self.assertTrue(np.allclose(output["keypoints"][0, 0], 400 - anno["keypoints"][0, 0])) + # The last 16 keypoints are 8 left-right pairs + self.assertTrue( + np.allclose( + output["keypoints"][1:, 0].reshape(-1, 2)[:, ::-1], + 400 - anno["keypoints"][1:, 0].reshape(-1, 2), + ) + ) + self.assertTrue( + np.allclose( + output["keypoints"][1:, 1:].reshape(-1, 2, 2)[:, ::-1, :], + anno["keypoints"][1:, 1:].reshape(-1, 2, 2), + ) + ) + + def 
test_transform_RLE(self): + transforms = T.TransformList([T.HFlipTransform(400)]) + mask = np.zeros((300, 400), order="F").astype("uint8") + mask[:, :200] = 1 + + anno = { + "bbox": np.asarray([10, 10, 200, 300]), + "bbox_mode": BoxMode.XYXY_ABS, + "segmentation": mask_util.encode(mask[:, :, None])[0], + "category_id": 3, + } + output = detection_utils.transform_instance_annotations( + copy.deepcopy(anno), transforms, (300, 400) + ) + mask = output["segmentation"] + self.assertTrue((mask[:, 200:] == 1).all()) + self.assertTrue((mask[:, :200] == 0).all()) + + inst = detection_utils.annotations_to_instances( + [output, output], (400, 400), mask_format="bitmask" + ) + self.assertTrue(isinstance(inst.gt_masks, BitMasks)) + + def test_transform_RLE_resize(self): + transforms = T.TransformList( + [T.HFlipTransform(400), T.ScaleTransform(300, 400, 400, 400, "bilinear")] + ) + mask = np.zeros((300, 400), order="F").astype("uint8") + mask[:, :200] = 1 + + anno = { + "bbox": np.asarray([10, 10, 200, 300]), + "bbox_mode": BoxMode.XYXY_ABS, + "segmentation": mask_util.encode(mask[:, :, None])[0], + "category_id": 3, + } + output = detection_utils.transform_instance_annotations( + copy.deepcopy(anno), transforms, (400, 400) + ) + + inst = detection_utils.annotations_to_instances( + [output, output], (400, 400), mask_format="bitmask" + ) + self.assertTrue(isinstance(inst.gt_masks, BitMasks)) + + def test_gen_crop(self): + instance = {"bbox": [10, 10, 100, 100], "bbox_mode": BoxMode.XYXY_ABS} + t = detection_utils.gen_crop_transform_with_instance((10, 10), (150, 150), instance) + # the box center must fall into the cropped region + self.assertTrue(t.x0 <= 55 <= t.x0 + t.w) + + def test_gen_crop_outside_boxes(self): + instance = {"bbox": [10, 10, 100, 100], "bbox_mode": BoxMode.XYXY_ABS} + with self.assertRaises(AssertionError): + detection_utils.gen_crop_transform_with_instance((10, 10), (15, 15), instance) diff --git a/tests/data/test_rotation_transform.py b/tests/data/test_rotation_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..45faf7e25eb08d70e92e5f6be326083ed0d23c76 --- /dev/null +++ b/tests/data/test_rotation_transform.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import numpy as np +import unittest + +from detectron2.data.transforms.transform import RotationTransform + + +class TestRotationTransform(unittest.TestCase): + def assertEqualsArrays(self, a1, a2): + self.assertTrue(np.allclose(a1, a2)) + + def randomData(self, h=5, w=5): + image = np.random.rand(h, w) + coords = np.array([[i, j] for j in range(h + 1) for i in range(w + 1)], dtype=float) + return image, coords, h, w + + def test180(self): + image, coords, h, w = self.randomData(6, 6) + rot = RotationTransform(h, w, 180, expand=False, center=None) + self.assertEqualsArrays(rot.apply_image(image), image[::-1, ::-1]) + rotated_coords = [[w - c[0], h - c[1]] for c in coords] + self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords) + + def test45_coords(self): + _, coords, h, w = self.randomData(4, 6) + rot = RotationTransform(h, w, 45, expand=False, center=None) + rotated_coords = [ + [(x + y - (h + w) / 2) / np.sqrt(2) + w / 2, h / 2 + (y + (w - h) / 2 - x) / np.sqrt(2)] + for (x, y) in coords + ] + self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords) + + def test90(self): + image, coords, h, w = self.randomData() + rot = RotationTransform(h, w, 90, expand=False, center=None) + self.assertEqualsArrays(rot.apply_image(image), image.T[::-1]) + rotated_coords = [[c[1], w - c[0]] for c in coords] + self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords) + + def test90_expand(self): # non-square image + image, coords, h, w = self.randomData(h=5, w=8) + rot = RotationTransform(h, w, 90, expand=True, center=None) + self.assertEqualsArrays(rot.apply_image(image), image.T[::-1]) + rotated_coords = [[c[1], w - c[0]] for c in coords] + self.assertEqualsArrays(rot.apply_coords(coords), rotated_coords) + + def test_center_expand(self): + # center has no effect if expand=True because it only affects shifting + image, coords, h, w = self.randomData(h=5, w=8) + angle = np.random.randint(360) + rot1 = RotationTransform(h, w, angle, expand=True, center=None) + rot2 = RotationTransform(h, w, angle, expand=True, center=(0, 0)) + rot3 = RotationTransform(h, w, angle, expand=True, center=(h, w)) + rot4 = RotationTransform(h, w, angle, expand=True, center=(2, 5)) + for r1 in [rot1, rot2, rot3, rot4]: + for r2 in [rot1, rot2, rot3, rot4]: + self.assertEqualsArrays(r1.apply_image(image), r2.apply_image(image)) + self.assertEqualsArrays(r1.apply_coords(coords), r2.apply_coords(coords)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/data/test_sampler.py b/tests/data/test_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1256a87a9cc3405ac20bb6b2cf1ee0b22b8f180f --- /dev/null +++ b/tests/data/test_sampler.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
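+# The tests below assume GroupedBatchSampler (from detectron2.data.samplers) yields
+# fixed-size batches whose elements all share a group id: the batch size is checked
+# directly, and group homogeneity is checked via index parity when group ids alternate.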
+import unittest +from torch.utils.data.sampler import SequentialSampler + +from detectron2.data.samplers import GroupedBatchSampler + + +class TestGroupedBatchSampler(unittest.TestCase): + def test_missing_group_id(self): + sampler = SequentialSampler(list(range(100))) + group_ids = [1] * 100 + samples = GroupedBatchSampler(sampler, group_ids, 2) + + for mini_batch in samples: + self.assertEqual(len(mini_batch), 2) + + def test_groups(self): + sampler = SequentialSampler(list(range(100))) + group_ids = [1, 0] * 50 + samples = GroupedBatchSampler(sampler, group_ids, 2) + + for mini_batch in samples: + self.assertEqual((mini_batch[0] + mini_batch[1]) % 2, 0) diff --git a/tests/data/test_transforms.py b/tests/data/test_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..6d8551887aca5d5fa773d33227cb1685f4e2a8c8 --- /dev/null +++ b/tests/data/test_transforms.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import logging +import numpy as np +import unittest +from unittest import mock + +from detectron2.config import get_cfg +from detectron2.data import detection_utils +from detectron2.data import transforms as T +from detectron2.utils.logger import setup_logger + +logger = logging.getLogger(__name__) + + +class TestTransforms(unittest.TestCase): + def setUp(self): + setup_logger() + + def test_apply_rotated_boxes(self): + np.random.seed(125) + cfg = get_cfg() + is_train = True + transform_gen = detection_utils.build_transform_gen(cfg, is_train) + image = np.random.rand(200, 300) + image, transforms = T.apply_transform_gens(transform_gen, image) + image_shape = image.shape[:2] # h, w + assert image_shape == (800, 1200) + annotation = {"bbox": [179, 97, 62, 40, -56]} + + boxes = np.array([annotation["bbox"]], dtype=np.float64) # boxes.shape = (1, 5) + transformed_bbox = transforms.apply_rotated_box(boxes)[0] + + expected_bbox = np.array([484, 388, 248, 160, 56], dtype=np.float64) + err_msg = "transformed_bbox = {}, expected {}".format(transformed_bbox, expected_bbox) + assert np.allclose(transformed_bbox, expected_bbox), err_msg + + def test_apply_rotated_boxes_unequal_scaling_factor(self): + np.random.seed(125) + h, w = 400, 200 + newh, neww = 800, 800 + image = np.random.rand(h, w) + transform_gen = [] + transform_gen.append(T.Resize(shape=(newh, neww))) + image, transforms = T.apply_transform_gens(transform_gen, image) + image_shape = image.shape[:2] # h, w + assert image_shape == (newh, neww) + + boxes = np.array( + [ + [150, 100, 40, 20, 0], + [150, 100, 40, 20, 30], + [150, 100, 40, 20, 90], + [150, 100, 40, 20, -90], + ], + dtype=np.float64, + ) + transformed_boxes = transforms.apply_rotated_box(boxes) + + expected_bboxes = np.array( + [ + [600, 200, 160, 40, 0], + [600, 200, 144.22205102, 52.91502622, 49.10660535], + [600, 200, 80, 80, 90], + [600, 200, 80, 80, -90], + ], + dtype=np.float64, + ) + err_msg = "transformed_boxes = {}, expected {}".format(transformed_boxes, expected_bboxes) + assert np.allclose(transformed_boxes, expected_bboxes), err_msg + + def test_print_transform_gen(self): + t = T.RandomCrop("relative", (100, 100)) + self.assertTrue(str(t) == "RandomCrop(crop_type='relative', crop_size=(100, 100))") + + t = T.RandomFlip(prob=0.5) + self.assertTrue(str(t) == "RandomFlip(prob=0.5)") + + t = T.RandomFlip() + self.assertTrue(str(t) == "RandomFlip()") + + def test_random_apply_prob_out_of_range_check(self): + # GIVEN + test_probabilities = {0.0: True, 0.5: True, 1.0: True, 
-0.01: False, 1.01: False} + + # WHEN + for given_probability, is_valid in test_probabilities.items(): + # THEN + if not is_valid: + self.assertRaises(AssertionError, T.RandomApply, None, prob=given_probability) + else: + T.RandomApply(T.NoOpTransform(), prob=given_probability) + + def test_random_apply_wrapping_transform_gen_probability_occured_evaluation(self): + # GIVEN + transform_mock = mock.MagicMock(name="MockTransform", spec=T.TransformGen) + image_mock = mock.MagicMock(name="MockImage") + random_apply = T.RandomApply(transform_mock, prob=0.001) + + # WHEN + with mock.patch.object(random_apply, "_rand_range", return_value=0.0001): + transform = random_apply.get_transform(image_mock) + + # THEN + transform_mock.get_transform.assert_called_once_with(image_mock) + self.assertIsNot(transform, transform_mock) + + def test_random_apply_wrapping_std_transform_probability_occured_evaluation(self): + # GIVEN + transform_mock = mock.MagicMock(name="MockTransform", spec=T.Transform) + image_mock = mock.MagicMock(name="MockImage") + random_apply = T.RandomApply(transform_mock, prob=0.001) + + # WHEN + with mock.patch.object(random_apply, "_rand_range", return_value=0.0001): + transform = random_apply.get_transform(image_mock) + + # THEN + self.assertIs(transform, transform_mock) + + def test_random_apply_probability_not_occured_evaluation(self): + # GIVEN + transform_mock = mock.MagicMock(name="MockTransform", spec=T.TransformGen) + image_mock = mock.MagicMock(name="MockImage") + random_apply = T.RandomApply(transform_mock, prob=0.001) + + # WHEN + with mock.patch.object(random_apply, "_rand_range", return_value=0.9): + transform = random_apply.get_transform(image_mock) + + # THEN + transform_mock.get_transform.assert_not_called() + self.assertIsInstance(transform, T.NoOpTransform) diff --git a/tests/layers/__init__.py b/tests/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/layers/test_mask_ops.py b/tests/layers/test_mask_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..d180627354b6b9d8e0776d70f78e91ee5e530210 --- /dev/null +++ b/tests/layers/test_mask_ops.py @@ -0,0 +1,190 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import contextlib +import io +import numpy as np +import unittest +from collections import defaultdict +import torch +import tqdm +from fvcore.common.benchmark import benchmark +from fvcore.common.file_io import PathManager +from pycocotools.coco import COCO +from tabulate import tabulate +from torch.nn import functional as F + +from detectron2.data import MetadataCatalog +from detectron2.layers.mask_ops import ( + pad_masks, + paste_mask_in_image_old, + paste_masks_in_image, + scale_boxes, +) +from detectron2.structures import BitMasks, Boxes, BoxMode, PolygonMasks +from detectron2.structures.masks import polygons_to_bitmask + + +def iou_between_full_image_bit_masks(a, b): + intersect = (a & b).sum() + union = (a | b).sum() + return intersect / union + + +def rasterize_polygons_with_grid_sample(full_image_bit_mask, box, mask_size, threshold=0.5): + x0, y0, x1, y1 = box[0], box[1], box[2], box[3] + + img_h, img_w = full_image_bit_mask.shape + + mask_y = np.arange(0.0, mask_size) + 0.5 # mask y sample coords in [0.5, mask_size - 0.5] + mask_x = np.arange(0.0, mask_size) + 0.5 # mask x sample coords in [0.5, mask_size - 0.5] + mask_y = mask_y / mask_size * (y1 - y0) + y0 + mask_x = mask_x / mask_size * (x1 - x0) + x0 + + mask_x = (mask_x - 0.5) / (img_w - 1) * 2 + -1 + mask_y = (mask_y - 0.5) / (img_h - 1) * 2 + -1 + gy, gx = torch.meshgrid(torch.from_numpy(mask_y), torch.from_numpy(mask_x)) + ind = torch.stack([gx, gy], dim=-1).to(dtype=torch.float32) + + full_image_bit_mask = torch.from_numpy(full_image_bit_mask) + mask = F.grid_sample( + full_image_bit_mask[None, None, :, :].to(dtype=torch.float32), + ind[None, :, :, :], + align_corners=True, + ) + + return mask[0, 0] >= threshold + + +class TestMaskCropPaste(unittest.TestCase): + def setUp(self): + json_file = MetadataCatalog.get("coco_2017_val_100").json_file + if not PathManager.isfile(json_file): + raise unittest.SkipTest("{} not found".format(json_file)) + with contextlib.redirect_stdout(io.StringIO()): + json_file = PathManager.get_local_path(json_file) + self.coco = COCO(json_file) + + def test_crop_paste_consistency(self): + """ + rasterize_polygons_within_box (used in training) + and + paste_masks_in_image (used in inference) + should be inverse operations to each other. + + This function runs several implementation of the above two operations and prints + the reconstruction error. 
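+        A reconstruction is considered good when the mean IoU of the "aligned" paste
+        stays above roughly 0.94 for polygon rasterization and 0.95 for roialign
+        rasterization, which is what the assertions below check.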
+ """ + + anns = self.coco.loadAnns(self.coco.getAnnIds(iscrowd=False)) # avoid crowd annotations + + selected_anns = anns[:100] + + ious = [] + for ann in tqdm.tqdm(selected_anns): + results = self.process_annotation(ann) + ious.append([k[2] for k in results]) + + ious = np.array(ious) + mean_ious = ious.mean(axis=0) + table = [] + res_dic = defaultdict(dict) + for row, iou in zip(results, mean_ious): + table.append((row[0], row[1], iou)) + res_dic[row[0]][row[1]] = iou + print(tabulate(table, headers=["rasterize", "paste", "iou"], tablefmt="simple")) + # assert that the reconstruction is good: + self.assertTrue(res_dic["polygon"]["aligned"] > 0.94) + self.assertTrue(res_dic["roialign"]["aligned"] > 0.95) + + def process_annotation(self, ann, mask_side_len=28): + # Parse annotation data + img_info = self.coco.loadImgs(ids=[ann["image_id"]])[0] + height, width = img_info["height"], img_info["width"] + gt_polygons = [np.array(p, dtype=np.float64) for p in ann["segmentation"]] + gt_bbox = BoxMode.convert(ann["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) + gt_bit_mask = polygons_to_bitmask(gt_polygons, height, width) + + # Run rasterize .. + torch_gt_bbox = torch.tensor(gt_bbox).to(dtype=torch.float32).reshape(-1, 4) + box_bitmasks = { + "polygon": PolygonMasks([gt_polygons]).crop_and_resize(torch_gt_bbox, mask_side_len)[0], + "gridsample": rasterize_polygons_with_grid_sample(gt_bit_mask, gt_bbox, mask_side_len), + "roialign": BitMasks(torch.from_numpy(gt_bit_mask[None, :, :])).crop_and_resize( + torch_gt_bbox, mask_side_len + )[0], + } + + # Run paste .. + results = defaultdict(dict) + for k, box_bitmask in box_bitmasks.items(): + padded_bitmask, scale = pad_masks(box_bitmask[None, :, :], 1) + scaled_boxes = scale_boxes(torch_gt_bbox, scale) + + r = results[k] + r["old"] = paste_mask_in_image_old( + padded_bitmask[0], scaled_boxes[0], height, width, threshold=0.5 + ) + r["aligned"] = paste_masks_in_image( + box_bitmask[None, :, :], Boxes(torch_gt_bbox), (height, width) + )[0] + + table = [] + for rasterize_method, r in results.items(): + for paste_method, mask in r.items(): + mask = np.asarray(mask) + iou = iou_between_full_image_bit_masks(gt_bit_mask.astype("uint8"), mask) + table.append((rasterize_method, paste_method, iou)) + return table + + def test_polygon_area(self): + # Draw polygon boxes + for d in [5.0, 10.0, 1000.0]: + polygon = PolygonMasks([[[0, 0, 0, d, d, d, d, 0]]]) + area = polygon.area()[0] + target = d ** 2 + self.assertEqual(area, target) + + # Draw polygon triangles + for d in [5.0, 10.0, 1000.0]: + polygon = PolygonMasks([[[0, 0, 0, d, d, d]]]) + area = polygon.area()[0] + target = d ** 2 / 2 + self.assertEqual(area, target) + + +def benchmark_paste(): + S = 800 + H, W = image_shape = (S, S) + N = 64 + torch.manual_seed(42) + masks = torch.rand(N, 28, 28) + + center = torch.rand(N, 2) * 600 + 100 + wh = torch.clamp(torch.randn(N, 2) * 40 + 200, min=50) + x0y0 = torch.clamp(center - wh * 0.5, min=0.0) + x1y1 = torch.clamp(center + wh * 0.5, max=S) + boxes = Boxes(torch.cat([x0y0, x1y1], axis=1)) + + def func(device, n=3): + m = masks.to(device=device) + b = boxes.to(device=device) + + def bench(): + for _ in range(n): + paste_masks_in_image(m, b, image_shape) + if device.type == "cuda": + torch.cuda.synchronize() + + return bench + + specs = [{"device": torch.device("cpu"), "n": 3}] + if torch.cuda.is_available(): + specs.append({"device": torch.device("cuda"), "n": 3}) + + benchmark(func, "paste_masks", specs, num_iters=10, warmup_iters=2) + + +if __name__ == "__main__": 
+    benchmark_paste()
+    unittest.main()
diff --git a/tests/layers/test_nms_rotated.py b/tests/layers/test_nms_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..94b346c524d2c372273dfe992df045962b9605cd
--- /dev/null
+++ b/tests/layers/test_nms_rotated.py
@@ -0,0 +1,188 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+from __future__ import absolute_import, division, print_function, unicode_literals
+import numpy as np
+import unittest
+import torch
+from torchvision import ops
+
+from detectron2.layers import batched_nms, batched_nms_rotated, nms_rotated
+
+
+def nms_edit_distance(keep1, keep2):
+    """
+    Compare the "keep" results of two nms calls.
+    They are allowed to be different in terms of edit distance
+    due to floating point precision issues, e.g.,
+    if a box happens to have an IoU of exactly 0.5 with another box,
+    one implementation may choose to keep it while another may discard it.
+    """
+    if torch.equal(keep1, keep2):
+        # they should be equal most of the time
+        return 0
+    keep1, keep2 = tuple(keep1.cpu()), tuple(keep2.cpu())
+    m, n = len(keep1), len(keep2)
+
+    # edit distance with DP
+    f = [np.arange(n + 1), np.arange(n + 1)]
+    for i in range(m):
+        cur_row = i % 2
+        other_row = (i + 1) % 2
+        f[other_row][0] = i + 1
+        for j in range(n):
+            f[other_row][j + 1] = (
+                f[cur_row][j]
+                if keep1[i] == keep2[j]
+                else min(min(f[cur_row][j], f[cur_row][j + 1]), f[other_row][j]) + 1
+            )
+    return f[m % 2][n]
+
+
+class TestNMSRotated(unittest.TestCase):
+    def reference_horizontal_nms(self, boxes, scores, iou_threshold):
+        """
+        Args:
+            boxes (N, 4): boxes in corner-form (x1, y1, x2, y2).
+            scores (N): probabilities of the boxes.
+            iou_threshold: intersection over union threshold.
+ Returns: + picked: a list of indexes of the kept boxes + """ + picked = [] + _, indexes = scores.sort(descending=True) + while len(indexes) > 0: + current = indexes[0] + picked.append(current.item()) + if len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[1:] + rest_boxes = boxes[indexes, :] + iou = ops.box_iou(rest_boxes, current_box.unsqueeze(0)).squeeze(1) + indexes = indexes[iou <= iou_threshold] + + return torch.as_tensor(picked) + + def _create_tensors(self, N): + boxes = torch.rand(N, 4) * 100 + # Note: the implementation of this function in torchvision is: + # boxes[:, 2:] += torch.rand(N, 2) * 100 + # but it does not guarantee non-negative widths/heights constraints: + # boxes[:, 2] >= boxes[:, 0] and boxes[:, 3] >= boxes[:, 1]: + boxes[:, 2:] += boxes[:, :2] + scores = torch.rand(N) + return boxes, scores + + def test_batched_nms_rotated_0_degree_cpu(self): + N = 2000 + num_classes = 50 + boxes, scores = self._create_tensors(N) + idxs = torch.randint(0, num_classes, (N,)) + rotated_boxes = torch.zeros(N, 5) + rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 + rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 + rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0] + rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1] + err_msg = "Rotated NMS with 0 degree is incompatible with horizontal NMS for IoU={}" + for iou in [0.2, 0.5, 0.8]: + backup = boxes.clone() + keep_ref = batched_nms(boxes, scores, idxs, iou) + assert torch.allclose(boxes, backup), "boxes modified by batched_nms" + backup = rotated_boxes.clone() + keep = batched_nms_rotated(rotated_boxes, scores, idxs, iou) + assert torch.allclose( + rotated_boxes, backup + ), "rotated_boxes modified by batched_nms_rotated" + self.assertLessEqual(nms_edit_distance(keep, keep_ref), 1, err_msg.format(iou)) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_batched_nms_rotated_0_degree_cuda(self): + N = 2000 + num_classes = 50 + boxes, scores = self._create_tensors(N) + idxs = torch.randint(0, num_classes, (N,)) + rotated_boxes = torch.zeros(N, 5) + rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 + rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 + rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0] + rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1] + err_msg = "Rotated NMS with 0 degree is incompatible with horizontal NMS for IoU={}" + for iou in [0.2, 0.5, 0.8]: + backup = boxes.clone() + keep_ref = batched_nms(boxes.cuda(), scores.cuda(), idxs, iou) + self.assertTrue(torch.allclose(boxes, backup), "boxes modified by batched_nms") + backup = rotated_boxes.clone() + keep = batched_nms_rotated(rotated_boxes.cuda(), scores.cuda(), idxs, iou) + self.assertTrue( + torch.allclose(rotated_boxes, backup), + "rotated_boxes modified by batched_nms_rotated", + ) + self.assertLessEqual(nms_edit_distance(keep, keep_ref), 1, err_msg.format(iou)) + + def test_nms_rotated_0_degree_cpu(self): + N = 1000 + boxes, scores = self._create_tensors(N) + rotated_boxes = torch.zeros(N, 5) + rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 + rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 + rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0] + rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1] + err_msg = "Rotated NMS incompatible between CPU and reference implementation for IoU={}" + for iou in [0.5]: + keep_ref = self.reference_horizontal_nms(boxes, scores, iou) + keep = nms_rotated(rotated_boxes, scores, iou) + self.assertLessEqual(nms_edit_distance(keep, keep_ref), 1, 
err_msg.format(iou)) + + def test_nms_rotated_90_degrees_cpu(self): + N = 1000 + boxes, scores = self._create_tensors(N) + rotated_boxes = torch.zeros(N, 5) + rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 + rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 + # Note for rotated_boxes[:, 2] and rotated_boxes[:, 3]: + # widths and heights are intentionally swapped here for 90 degrees case + # so that the reference horizontal nms could be used + rotated_boxes[:, 2] = boxes[:, 3] - boxes[:, 1] + rotated_boxes[:, 3] = boxes[:, 2] - boxes[:, 0] + + rotated_boxes[:, 4] = torch.ones(N) * 90 + err_msg = "Rotated NMS incompatible between CPU and reference implementation for IoU={}" + for iou in [0.2, 0.5, 0.8]: + keep_ref = self.reference_horizontal_nms(boxes, scores, iou) + keep = nms_rotated(rotated_boxes, scores, iou) + assert torch.equal(keep, keep_ref), err_msg.format(iou) + + def test_nms_rotated_180_degrees_cpu(self): + N = 1000 + boxes, scores = self._create_tensors(N) + rotated_boxes = torch.zeros(N, 5) + rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 + rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 + rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0] + rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1] + rotated_boxes[:, 4] = torch.ones(N) * 180 + err_msg = "Rotated NMS incompatible between CPU and reference implementation for IoU={}" + for iou in [0.2, 0.5, 0.8]: + keep_ref = self.reference_horizontal_nms(boxes, scores, iou) + keep = nms_rotated(rotated_boxes, scores, iou) + assert torch.equal(keep, keep_ref), err_msg.format(iou) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_nms_rotated_0_degree_cuda(self): + N = 1000 + boxes, scores = self._create_tensors(N) + rotated_boxes = torch.zeros(N, 5) + rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 + rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 + rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0] + rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1] + err_msg = "Rotated NMS incompatible between CPU and CUDA for IoU={}" + + for iou in [0.2, 0.5, 0.8]: + r_cpu = nms_rotated(rotated_boxes, scores, iou) + r_cuda = nms_rotated(rotated_boxes.cuda(), scores.cuda(), iou) + + assert torch.equal(r_cpu, r_cuda.cpu()), err_msg.format(iou) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/layers/test_roi_align.py b/tests/layers/test_roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..633d7c29c41b94b8a57c15aff728f23a71b535d1 --- /dev/null +++ b/tests/layers/test_roi_align.py @@ -0,0 +1,152 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
+import numpy as np
+import unittest
+import cv2
+import torch
+from fvcore.common.benchmark import benchmark
+
+from detectron2.layers.roi_align import ROIAlign
+
+
+class ROIAlignTest(unittest.TestCase):
+    def test_forward_output(self):
+        input = np.arange(25).reshape(5, 5).astype("float32")
+        """
+        0 1 2 3 4
+        5 6 7 8 9
+        10 11 12 13 14
+        15 16 17 18 19
+        20 21 22 23 24
+        """
+
+        output = self._simple_roialign(input, [1, 1, 3, 3], (4, 4), aligned=False)
+        output_correct = self._simple_roialign(input, [1, 1, 3, 3], (4, 4), aligned=True)
+
+        # without correction:
+        old_results = [
+            [7.5, 8, 8.5, 9],
+            [10, 10.5, 11, 11.5],
+            [12.5, 13, 13.5, 14],
+            [15, 15.5, 16, 16.5],
+        ]
+
+        # with 0.5 correction:
+        correct_results = [
+            [4.5, 5.0, 5.5, 6.0],
+            [7.0, 7.5, 8.0, 8.5],
+            [9.5, 10.0, 10.5, 11.0],
+            [12.0, 12.5, 13.0, 13.5],
+        ]
+        # This is an upsampled version of [[6, 7], [11, 12]]
+
+        self.assertTrue(np.allclose(output.flatten(), np.asarray(old_results).flatten()))
+        self.assertTrue(
+            np.allclose(output_correct.flatten(), np.asarray(correct_results).flatten())
+        )
+
+        # Also see similar issues in tensorflow at
+        # https://github.com/tensorflow/tensorflow/issues/26278
+
+    def test_resize(self):
+        H, W = 30, 30
+        input = np.random.rand(H, W).astype("float32") * 100
+        box = [10, 10, 20, 20]
+        output = self._simple_roialign(input, box, (5, 5), aligned=True)
+
+        input2x = cv2.resize(input, (W // 2, H // 2), interpolation=cv2.INTER_LINEAR)
+        box2x = [x / 2 for x in box]
+        output2x = self._simple_roialign(input2x, box2x, (5, 5), aligned=True)
+        diff = np.abs(output2x - output)
+        self.assertTrue(diff.max() < 1e-4)
+
+    def _simple_roialign(self, img, box, resolution, aligned=True):
+        """
+        ROIAlign with scale 1.0 and sampling ratio 0.
+        """
+        if isinstance(resolution, int):
+            resolution = (resolution, resolution)
+        op = ROIAlign(resolution, 1.0, 0, aligned=aligned)
+        input = torch.from_numpy(img[None, None, :, :].astype("float32"))
+
+        rois = [0] + list(box)
+        rois = torch.from_numpy(np.asarray(rois)[None, :].astype("float32"))
+        output = op.forward(input, rois)
+        if torch.cuda.is_available():
+            output_cuda = op.forward(input.cuda(), rois.cuda()).cpu()
+            self.assertTrue(torch.allclose(output, output_cuda))
+        return output[0, 0]
+
+    def _simple_roialign_with_grad(self, img, box, resolution, device):
+        if isinstance(resolution, int):
+            resolution = (resolution, resolution)
+
+        op = ROIAlign(resolution, 1.0, 0, aligned=True)
+        input = torch.from_numpy(img[None, None, :, :].astype("float32"))
+
+        rois = [0] + list(box)
+        rois = torch.from_numpy(np.asarray(rois)[None, :].astype("float32"))
+        input = input.to(device=device)
+        rois = rois.to(device=device)
+        input.requires_grad = True
+        output = op.forward(input, rois)
+        return input, output
+
+    def test_empty_box(self):
+        img = np.random.rand(5, 5)
+        box = [3, 4, 5, 4]
+        o = self._simple_roialign(img, box, 7)
+        self.assertTrue(o.shape == (7, 7))
+        self.assertTrue((o == 0).all())
+
+        for dev in ["cpu"] + (["cuda"] if torch.cuda.is_available() else []):  # cpu always, cuda if available
+            input, output = self._simple_roialign_with_grad(img, box, 7, torch.device(dev))
+            output.sum().backward()
+            self.assertTrue(torch.allclose(input.grad, torch.zeros_like(input)))
+
+    def test_empty_batch(self):
+        input = torch.zeros(0, 3, 10, 10, dtype=torch.float32)
+        rois = torch.zeros(0, 5, dtype=torch.float32)
+        op = ROIAlign((7, 7), 1.0, 0, aligned=True)
+        output = op.forward(input, rois)
+        self.assertTrue(output.shape == (0, 3, 7, 7))
+
+
+def benchmark_roi_align():
+    from
detectron2 import _C + + def random_boxes(mean_box, stdev, N, maxsize): + ret = torch.rand(N, 4) * stdev + torch.tensor(mean_box, dtype=torch.float) + ret.clamp_(min=0, max=maxsize) + return ret + + def func(N, C, H, W, nboxes_per_img): + input = torch.rand(N, C, H, W) + boxes = [] + batch_idx = [] + for k in range(N): + b = random_boxes([80, 80, 130, 130], 24, nboxes_per_img, H) + # try smaller boxes: + # b = random_boxes([100, 100, 110, 110], 4, nboxes_per_img, H) + boxes.append(b) + batch_idx.append(torch.zeros(nboxes_per_img, 1, dtype=torch.float32) + k) + boxes = torch.cat(boxes, axis=0) + batch_idx = torch.cat(batch_idx, axis=0) + boxes = torch.cat([batch_idx, boxes], axis=1) + + input = input.cuda() + boxes = boxes.cuda() + + def bench(): + _C.roi_align_forward(input, boxes, 1.0, 7, 7, 0, True) + torch.cuda.synchronize() + + return bench + + args = [dict(N=2, C=512, H=256, W=256, nboxes_per_img=500)] + benchmark(func, "cuda_roialign", args, num_iters=20, warmup_iters=1) + + +if __name__ == "__main__": + if torch.cuda.is_available(): + benchmark_roi_align() + unittest.main() diff --git a/tests/layers/test_roi_align_rotated.py b/tests/layers/test_roi_align_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..1915b59ff6774a54ee0e5dbfdbe0ecf89f2e2235 --- /dev/null +++ b/tests/layers/test_roi_align_rotated.py @@ -0,0 +1,176 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import unittest +import cv2 +import torch +from torch.autograd import Variable, gradcheck + +from detectron2.layers.roi_align import ROIAlign +from detectron2.layers.roi_align_rotated import ROIAlignRotated + +logger = logging.getLogger(__name__) + + +class ROIAlignRotatedTest(unittest.TestCase): + def _box_to_rotated_box(self, box, angle): + return [ + (box[0] + box[2]) / 2.0, + (box[1] + box[3]) / 2.0, + box[2] - box[0], + box[3] - box[1], + angle, + ] + + def _rot90(self, img, num): + num = num % 4 # note: -1 % 4 == 3 + for _ in range(num): + img = img.transpose(0, 1).flip(0) + return img + + def test_forward_output_0_90_180_270(self): + for i in range(4): + # i = 0, 1, 2, 3 corresponding to 0, 90, 180, 270 degrees + img = torch.arange(25, dtype=torch.float32).reshape(5, 5) + """ + 0 1 2 3 4 + 5 6 7 8 9 + 10 11 12 13 14 + 15 16 17 18 19 + 20 21 22 23 24 + """ + box = [1, 1, 3, 3] + rotated_box = self._box_to_rotated_box(box=box, angle=90 * i) + + result = self._simple_roi_align_rotated(img=img, box=rotated_box, resolution=(4, 4)) + + # Here's an explanation for 0 degree case: + # point 0 in the original input lies at [0.5, 0.5] + # (the center of bin [0, 1] x [0, 1]) + # point 1 in the original input lies at [1.5, 0.5], etc. 
+ # since the resolution is (4, 4) that divides [1, 3] x [1, 3] + # into 4 x 4 equal bins, + # the top-left bin is [1, 1.5] x [1, 1.5], and its center + # (1.25, 1.25) lies at the 3/4 position + # between point 0 and point 1, point 5 and point 6, + # point 0 and point 5, point 1 and point 6, so it can be calculated as + # 0.25*(0*0.25+1*0.75)+(5*0.25+6*0.75)*0.75 = 4.5 + result_expected = torch.tensor( + [ + [4.5, 5.0, 5.5, 6.0], + [7.0, 7.5, 8.0, 8.5], + [9.5, 10.0, 10.5, 11.0], + [12.0, 12.5, 13.0, 13.5], + ] + ) + # This is also an upsampled version of [[6, 7], [11, 12]] + + # When the box is rotated by 90 degrees CCW, + # the result would be rotated by 90 degrees CW, thus it's -i here + result_expected = self._rot90(result_expected, -i) + + assert torch.allclose(result, result_expected) + + def test_resize(self): + H, W = 30, 30 + input = torch.rand(H, W) * 100 + box = [10, 10, 20, 20] + rotated_box = self._box_to_rotated_box(box, angle=0) + output = self._simple_roi_align_rotated(img=input, box=rotated_box, resolution=(5, 5)) + + input2x = cv2.resize(input.numpy(), (W // 2, H // 2), interpolation=cv2.INTER_LINEAR) + input2x = torch.from_numpy(input2x) + box2x = [x / 2 for x in box] + rotated_box2x = self._box_to_rotated_box(box2x, angle=0) + output2x = self._simple_roi_align_rotated(img=input2x, box=rotated_box2x, resolution=(5, 5)) + assert torch.allclose(output2x, output) + + def _simple_roi_align_rotated(self, img, box, resolution): + """ + RoiAlignRotated with scale 1.0 and 0 sample ratio. + """ + op = ROIAlignRotated(output_size=resolution, spatial_scale=1.0, sampling_ratio=0) + input = img[None, None, :, :] + + rois = [0] + list(box) + rois = torch.tensor(rois, dtype=torch.float32)[None, :] + result_cpu = op.forward(input, rois) + if torch.cuda.is_available(): + result_cuda = op.forward(input.cuda(), rois.cuda()) + assert torch.allclose(result_cpu, result_cuda.cpu()) + return result_cpu[0, 0] + + def test_empty_box(self): + img = torch.rand(5, 5) + out = self._simple_roi_align_rotated(img, [2, 3, 0, 0, 0], (7, 7)) + self.assertTrue((out == 0).all()) + + def test_roi_align_rotated_gradcheck_cpu(self): + dtype = torch.float64 + device = torch.device("cpu") + roi_align_rotated_op = ROIAlignRotated( + output_size=(5, 5), spatial_scale=0.5, sampling_ratio=1 + ).to(dtype=dtype, device=device) + x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True) + # roi format is (batch index, x_center, y_center, width, height, angle) + rois = torch.tensor( + [[0, 4.5, 4.5, 9, 9, 0], [0, 2, 7, 4, 4, 0], [0, 7, 7, 4, 4, 0]], + dtype=dtype, + device=device, + ) + + def func(input): + return roi_align_rotated_op(input, rois) + + assert gradcheck(func, (x,)), "gradcheck failed for RoIAlignRotated CPU" + assert gradcheck(func, (x.transpose(2, 3),)), "gradcheck failed for RoIAlignRotated CPU" + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_roi_align_rotated_gradient_cuda(self): + """ + Compute gradients for ROIAlignRotated with multiple bounding boxes on the GPU, + and compare the result with ROIAlign + """ + # torch.manual_seed(123) + dtype = torch.float64 + device = torch.device("cuda") + pool_h, pool_w = (5, 5) + + roi_align = ROIAlign(output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2).to( + device=device + ) + + roi_align_rotated = ROIAlignRotated( + output_size=(pool_h, pool_w), spatial_scale=1, sampling_ratio=2 + ).to(device=device) + + x = torch.rand(1, 1, 10, 10, dtype=dtype, device=device, requires_grad=True) + # x_rotated 
= x.clone() won't work (will lead to grad_fun=CloneBackward)! + x_rotated = Variable(x.data.clone(), requires_grad=True) + + # roi_rotated format is (batch index, x_center, y_center, width, height, angle) + rois_rotated = torch.tensor( + [[0, 4.5, 4.5, 9, 9, 0], [0, 2, 7, 4, 4, 0], [0, 7, 7, 4, 4, 0]], + dtype=dtype, + device=device, + ) + + y_rotated = roi_align_rotated(x_rotated, rois_rotated) + s_rotated = y_rotated.sum() + s_rotated.backward() + + # roi format is (batch index, x1, y1, x2, y2) + rois = torch.tensor( + [[0, 0, 0, 9, 9], [0, 0, 5, 4, 9], [0, 5, 5, 9, 9]], dtype=dtype, device=device + ) + + y = roi_align(x, rois) + s = y.sum() + s.backward() + + assert torch.allclose( + x.grad, x_rotated.grad + ), "gradients for ROIAlign and ROIAlignRotated mismatch on CUDA" + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/modeling/__init__.py b/tests/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/modeling/test_anchor_generator.py b/tests/modeling/test_anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..bc14f0279ee682040082e51f96a41a267269d6ce --- /dev/null +++ b/tests/modeling/test_anchor_generator.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import unittest +import torch + +from detectron2.config import get_cfg +from detectron2.layers import ShapeSpec +from detectron2.modeling.anchor_generator import DefaultAnchorGenerator, RotatedAnchorGenerator + +logger = logging.getLogger(__name__) + + +class TestAnchorGenerator(unittest.TestCase): + def test_default_anchor_generator(self): + cfg = get_cfg() + cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]] + cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]] + + anchor_generator = DefaultAnchorGenerator(cfg, [ShapeSpec(stride=4)]) + + # only the last two dimensions of features matter here + num_images = 2 + features = {"stage3": torch.rand(num_images, 96, 1, 2)} + anchors = anchor_generator([features["stage3"]]) + expected_anchor_tensor = torch.tensor( + [ + [-32.0, -8.0, 32.0, 8.0], + [-16.0, -16.0, 16.0, 16.0], + [-8.0, -32.0, 8.0, 32.0], + [-64.0, -16.0, 64.0, 16.0], + [-32.0, -32.0, 32.0, 32.0], + [-16.0, -64.0, 16.0, 64.0], + [-28.0, -8.0, 36.0, 8.0], # -28.0 == -32.0 + STRIDE (4) + [-12.0, -16.0, 20.0, 16.0], + [-4.0, -32.0, 12.0, 32.0], + [-60.0, -16.0, 68.0, 16.0], + [-28.0, -32.0, 36.0, 32.0], + [-12.0, -64.0, 20.0, 64.0], + ] + ) + + assert torch.allclose(anchors[0].tensor, expected_anchor_tensor) + + def test_default_anchor_generator_centered(self): + # test explicit args + anchor_generator = DefaultAnchorGenerator( + sizes=[32, 64], aspect_ratios=[0.25, 1, 4], strides=[4] + ) + + # only the last two dimensions of features matter here + num_images = 2 + features = {"stage3": torch.rand(num_images, 96, 1, 2)} + expected_anchor_tensor = torch.tensor( + [ + [-30.0, -6.0, 34.0, 10.0], + [-14.0, -14.0, 18.0, 18.0], + [-6.0, -30.0, 10.0, 34.0], + [-62.0, -14.0, 66.0, 18.0], + [-30.0, -30.0, 34.0, 34.0], + [-14.0, -62.0, 18.0, 66.0], + [-26.0, -6.0, 38.0, 10.0], + [-10.0, -14.0, 22.0, 18.0], + [-2.0, -30.0, 14.0, 34.0], + [-58.0, -14.0, 70.0, 18.0], + [-26.0, -30.0, 38.0, 34.0], + [-10.0, -62.0, 22.0, 66.0], + ] + ) + + anchors = anchor_generator([features["stage3"]]) + assert torch.allclose(anchors[0].tensor, expected_anchor_tensor) + + # doesn't work yet + # anchors = 
torch.jit.script(anchor_generator)([features["stage3"]]) + # assert torch.allclose(anchors[0].tensor, expected_anchor_tensor) + + def test_rrpn_anchor_generator(self): + cfg = get_cfg() + cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]] + cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1, 4]] + cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [0, 45] # test single list[float] + anchor_generator = RotatedAnchorGenerator(cfg, [ShapeSpec(stride=4)]) + + # only the last two dimensions of features matter here + num_images = 2 + features = {"stage3": torch.rand(num_images, 96, 1, 2)} + anchors = anchor_generator([features["stage3"]]) + expected_anchor_tensor = torch.tensor( + [ + [0.0, 0.0, 64.0, 16.0, 0.0], + [0.0, 0.0, 64.0, 16.0, 45.0], + [0.0, 0.0, 32.0, 32.0, 0.0], + [0.0, 0.0, 32.0, 32.0, 45.0], + [0.0, 0.0, 16.0, 64.0, 0.0], + [0.0, 0.0, 16.0, 64.0, 45.0], + [0.0, 0.0, 128.0, 32.0, 0.0], + [0.0, 0.0, 128.0, 32.0, 45.0], + [0.0, 0.0, 64.0, 64.0, 0.0], + [0.0, 0.0, 64.0, 64.0, 45.0], + [0.0, 0.0, 32.0, 128.0, 0.0], + [0.0, 0.0, 32.0, 128.0, 45.0], + [4.0, 0.0, 64.0, 16.0, 0.0], # 4.0 == 0.0 + STRIDE (4) + [4.0, 0.0, 64.0, 16.0, 45.0], + [4.0, 0.0, 32.0, 32.0, 0.0], + [4.0, 0.0, 32.0, 32.0, 45.0], + [4.0, 0.0, 16.0, 64.0, 0.0], + [4.0, 0.0, 16.0, 64.0, 45.0], + [4.0, 0.0, 128.0, 32.0, 0.0], + [4.0, 0.0, 128.0, 32.0, 45.0], + [4.0, 0.0, 64.0, 64.0, 0.0], + [4.0, 0.0, 64.0, 64.0, 45.0], + [4.0, 0.0, 32.0, 128.0, 0.0], + [4.0, 0.0, 32.0, 128.0, 45.0], + ] + ) + + assert torch.allclose(anchors[0].tensor, expected_anchor_tensor) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/modeling/test_box2box_transform.py b/tests/modeling/test_box2box_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..9d124d79fc0e17f268f6b5b50fcb8f8dfad59368 --- /dev/null +++ b/tests/modeling/test_box2box_transform.py @@ -0,0 +1,64 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import unittest +import torch + +from detectron2.modeling.box_regression import Box2BoxTransform, Box2BoxTransformRotated + +logger = logging.getLogger(__name__) + + +def random_boxes(mean_box, stdev, N): + return torch.rand(N, 4) * stdev + torch.tensor(mean_box, dtype=torch.float) + + +class TestBox2BoxTransform(unittest.TestCase): + def test_reconstruction(self): + weights = (5, 5, 10, 10) + b2b_tfm = Box2BoxTransform(weights=weights) + src_boxes = random_boxes([10, 10, 20, 20], 1, 10) + dst_boxes = random_boxes([10, 10, 20, 20], 1, 10) + + devices = [torch.device("cpu")] + if torch.cuda.is_available(): + devices.append(torch.device("cuda")) + for device in devices: + src_boxes = src_boxes.to(device=device) + dst_boxes = dst_boxes.to(device=device) + deltas = b2b_tfm.get_deltas(src_boxes, dst_boxes) + dst_boxes_reconstructed = b2b_tfm.apply_deltas(deltas, src_boxes) + assert torch.allclose(dst_boxes, dst_boxes_reconstructed) + + +def random_rotated_boxes(mean_box, std_length, std_angle, N): + return torch.cat( + [torch.rand(N, 4) * std_length, torch.rand(N, 1) * std_angle], dim=1 + ) + torch.tensor(mean_box, dtype=torch.float) + + +class TestBox2BoxTransformRotated(unittest.TestCase): + def test_reconstruction(self): + weights = (5, 5, 10, 10, 1) + b2b_transform = Box2BoxTransformRotated(weights=weights) + src_boxes = random_rotated_boxes([10, 10, 20, 20, -30], 5, 60.0, 10) + dst_boxes = random_rotated_boxes([10, 10, 20, 20, -30], 5, 60.0, 10) + + devices = [torch.device("cpu")] + if torch.cuda.is_available(): + devices.append(torch.device("cuda")) + for device in devices: + src_boxes = src_boxes.to(device=device) + dst_boxes = dst_boxes.to(device=device) + deltas = b2b_transform.get_deltas(src_boxes, dst_boxes) + dst_boxes_reconstructed = b2b_transform.apply_deltas(deltas, src_boxes) + assert torch.allclose(dst_boxes[:, :4], dst_boxes_reconstructed[:, :4], atol=1e-5) + # angle difference has to be normalized + assert torch.allclose( + (dst_boxes[:, 4] - dst_boxes_reconstructed[:, 4] + 180.0) % 360.0 - 180.0, + torch.zeros_like(dst_boxes[:, 4]), + atol=1e-4, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/modeling/test_fast_rcnn.py b/tests/modeling/test_fast_rcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..70b64d3db497bac52e127d02a543b14d2e37e8eb --- /dev/null +++ b/tests/modeling/test_fast_rcnn.py @@ -0,0 +1,106 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import unittest +import torch + +from detectron2.layers import ShapeSpec +from detectron2.modeling.box_regression import Box2BoxTransform, Box2BoxTransformRotated +from detectron2.modeling.roi_heads.fast_rcnn import FastRCNNOutputLayers +from detectron2.modeling.roi_heads.rotated_fast_rcnn import RotatedFastRCNNOutputLayers +from detectron2.structures import Boxes, Instances, RotatedBoxes +from detectron2.utils.events import EventStorage + +logger = logging.getLogger(__name__) + + +class FastRCNNTest(unittest.TestCase): + def test_fast_rcnn(self): + torch.manual_seed(132) + + box_head_output_size = 8 + + box_predictor = FastRCNNOutputLayers( + ShapeSpec(channels=box_head_output_size), + box2box_transform=Box2BoxTransform(weights=(10, 10, 5, 5)), + num_classes=5, + ) + feature_pooled = torch.rand(2, box_head_output_size) + predictions = box_predictor(feature_pooled) + + proposal_boxes = torch.tensor([[0.8, 1.1, 3.2, 2.8], [2.3, 2.5, 7, 8]], dtype=torch.float32) + gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) + proposal = Instances((10, 10)) + proposal.proposal_boxes = Boxes(proposal_boxes) + proposal.gt_boxes = Boxes(gt_boxes) + proposal.gt_classes = torch.tensor([1, 2]) + + with EventStorage(): # capture events in a new storage to discard them + losses = box_predictor.losses(predictions, [proposal]) + + expected_losses = { + "loss_cls": torch.tensor(1.7951188087), + "loss_box_reg": torch.tensor(4.0357131958), + } + for name in expected_losses.keys(): + assert torch.allclose(losses[name], expected_losses[name]) + + def test_fast_rcnn_empty_batch(self, device="cpu"): + box_predictor = FastRCNNOutputLayers( + ShapeSpec(channels=10), + box2box_transform=Box2BoxTransform(weights=(10, 10, 5, 5)), + num_classes=8, + ).to(device=device) + + logits = torch.randn(0, 100, requires_grad=True, device=device) + deltas = torch.randn(0, 4, requires_grad=True, device=device) + losses = box_predictor.losses([logits, deltas], []) + for value in losses.values(): + self.assertTrue(torch.allclose(value, torch.zeros_like(value))) + sum(losses.values()).backward() + self.assertTrue(logits.grad is not None) + self.assertTrue(deltas.grad is not None) + + predictions, _ = box_predictor.inference([logits, deltas], []) + self.assertEqual(len(predictions), 0) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_fast_rcnn_empty_batch_cuda(self): + self.test_fast_rcnn_empty_batch(device=torch.device("cuda")) + + def test_fast_rcnn_rotated(self): + torch.manual_seed(132) + box_head_output_size = 8 + + box_predictor = RotatedFastRCNNOutputLayers( + ShapeSpec(channels=box_head_output_size), + box2box_transform=Box2BoxTransformRotated(weights=(10, 10, 5, 5, 1)), + num_classes=5, + ) + feature_pooled = torch.rand(2, box_head_output_size) + predictions = box_predictor(feature_pooled) + proposal_boxes = torch.tensor( + [[2, 1.95, 2.4, 1.7, 0], [4.65, 5.25, 4.7, 5.5, 0]], dtype=torch.float32 + ) + gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32) + proposal = Instances((10, 10)) + proposal.proposal_boxes = RotatedBoxes(proposal_boxes) + proposal.gt_boxes = RotatedBoxes(gt_boxes) + proposal.gt_classes = torch.tensor([1, 2]) + + with EventStorage(): # capture events in a new storage to discard them + losses = box_predictor.losses(predictions, [proposal]) + + # Note: the expected losses are slightly different even if + # the boxes are essentially the same as in the FastRCNNOutput test, because + # 
bbox_pred in FastRCNNOutputLayers have different Linear layers/initialization + # between the two cases. + expected_losses = { + "loss_cls": torch.tensor(1.7920907736), + "loss_box_reg": torch.tensor(4.0410838127), + } + for name in expected_losses.keys(): + assert torch.allclose(losses[name], expected_losses[name]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/modeling/test_model_e2e.py b/tests/modeling/test_model_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..95fe6a09fd15f877544392ddeccd9906025b0fdd --- /dev/null +++ b/tests/modeling/test_model_e2e.py @@ -0,0 +1,154 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + + +import unittest +import torch + +import detectron2.model_zoo as model_zoo +from detectron2.config import get_cfg +from detectron2.modeling import build_model +from detectron2.structures import BitMasks, Boxes, ImageList, Instances +from detectron2.utils.events import EventStorage + + +def get_model_zoo(config_path): + """ + Like model_zoo.get, but do not load any weights (even pretrained) + """ + cfg_file = model_zoo.get_config_file(config_path) + cfg = get_cfg() + cfg.merge_from_file(cfg_file) + if not torch.cuda.is_available(): + cfg.MODEL.DEVICE = "cpu" + return build_model(cfg) + + +def create_model_input(img, inst=None): + if inst is not None: + return {"image": img, "instances": inst} + else: + return {"image": img} + + +def get_empty_instance(h, w): + inst = Instances((h, w)) + inst.gt_boxes = Boxes(torch.rand(0, 4)) + inst.gt_classes = torch.tensor([]).to(dtype=torch.int64) + inst.gt_masks = BitMasks(torch.rand(0, h, w)) + return inst + + +def get_regular_bitmask_instances(h, w): + inst = Instances((h, w)) + inst.gt_boxes = Boxes(torch.rand(3, 4)) + inst.gt_boxes.tensor[:, 2:] += inst.gt_boxes.tensor[:, :2] + inst.gt_classes = torch.tensor([3, 4, 5]).to(dtype=torch.int64) + inst.gt_masks = BitMasks((torch.rand(3, h, w) > 0.5)) + return inst + + +class ModelE2ETest: + def setUp(self): + torch.manual_seed(43) + self.model = get_model_zoo(self.CONFIG_PATH) + + def _test_eval(self, input_sizes): + inputs = [create_model_input(torch.rand(3, s[0], s[1])) for s in input_sizes] + self.model.eval() + self.model(inputs) + + def _test_train(self, input_sizes, instances): + assert len(input_sizes) == len(instances) + inputs = [ + create_model_input(torch.rand(3, s[0], s[1]), inst) + for s, inst in zip(input_sizes, instances) + ] + self.model.train() + with EventStorage(): + losses = self.model(inputs) + sum(losses.values()).backward() + del losses + + def _inf_tensor(self, *shape): + return 1.0 / torch.zeros(*shape, device=self.model.device) + + def _nan_tensor(self, *shape): + return torch.zeros(*shape, device=self.model.device).fill_(float("nan")) + + def test_empty_data(self): + instances = [get_empty_instance(200, 250), get_empty_instance(200, 249)] + self._test_eval([(200, 250), (200, 249)]) + self._test_train([(200, 250), (200, 249)], instances) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA unavailable") + def test_eval_tocpu(self): + model = get_model_zoo(self.CONFIG_PATH).cpu() + model.eval() + input_sizes = [(200, 250), (200, 249)] + inputs = [create_model_input(torch.rand(3, s[0], s[1])) for s in input_sizes] + model(inputs) + + +class MaskRCNNE2ETest(ModelE2ETest, unittest.TestCase): + CONFIG_PATH = "COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml" + + def test_half_empty_data(self): + instances = [get_empty_instance(200, 250), get_regular_bitmask_instances(200, 249)] + 
self._test_train([(200, 250), (200, 249)], instances) + + # This test is flaky because in some environment the output features are zero due to relu + # def test_rpn_inf_nan_data(self): + # self.model.eval() + # for tensor in [self._inf_tensor, self._nan_tensor]: + # images = ImageList(tensor(1, 3, 512, 512), [(510, 510)]) + # features = { + # "p2": tensor(1, 256, 256, 256), + # "p3": tensor(1, 256, 128, 128), + # "p4": tensor(1, 256, 64, 64), + # "p5": tensor(1, 256, 32, 32), + # "p6": tensor(1, 256, 16, 16), + # } + # props, _ = self.model.proposal_generator(images, features) + # self.assertEqual(len(props[0]), 0) + + def test_roiheads_inf_nan_data(self): + self.model.eval() + for tensor in [self._inf_tensor, self._nan_tensor]: + images = ImageList(tensor(1, 3, 512, 512), [(510, 510)]) + features = { + "p2": tensor(1, 256, 256, 256), + "p3": tensor(1, 256, 128, 128), + "p4": tensor(1, 256, 64, 64), + "p5": tensor(1, 256, 32, 32), + "p6": tensor(1, 256, 16, 16), + } + props = [Instances((510, 510))] + props[0].proposal_boxes = Boxes([[10, 10, 20, 20]]).to(device=self.model.device) + props[0].objectness_logits = torch.tensor([1.0]).reshape(1, 1) + det, _ = self.model.roi_heads(images, features, props) + self.assertEqual(len(det[0]), 0) + + +class RetinaNetE2ETest(ModelE2ETest, unittest.TestCase): + CONFIG_PATH = "COCO-Detection/retinanet_R_50_FPN_1x.yaml" + + def test_inf_nan_data(self): + self.model.eval() + self.model.score_threshold = -999999999 + for tensor in [self._inf_tensor, self._nan_tensor]: + images = ImageList(tensor(1, 3, 512, 512), [(510, 510)]) + features = [ + tensor(1, 256, 128, 128), + tensor(1, 256, 64, 64), + tensor(1, 256, 32, 32), + tensor(1, 256, 16, 16), + tensor(1, 256, 8, 8), + ] + anchors = self.model.anchor_generator(features) + box_cls, box_delta = self.model.head(features) + box_cls = [tensor(*k.shape) for k in box_cls] + box_delta = [tensor(*k.shape) for k in box_delta] + det = self.model.inference(box_cls, box_delta, anchors, images.image_sizes) + # all predictions (if any) are infinite or nan + if len(det[0]): + self.assertTrue(torch.isfinite(det[0].pred_boxes.tensor).sum() == 0) diff --git a/tests/modeling/test_roi_heads.py b/tests/modeling/test_roi_heads.py new file mode 100644 index 0000000000000000000000000000000000000000..5a0630353ca1c2fbb33d2dee7ddb922d57cad3cd --- /dev/null +++ b/tests/modeling/test_roi_heads.py @@ -0,0 +1,108 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import unittest +import torch + +from detectron2.config import get_cfg +from detectron2.modeling.backbone import build_backbone +from detectron2.modeling.proposal_generator.build import build_proposal_generator +from detectron2.modeling.roi_heads import build_roi_heads +from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes +from detectron2.utils.events import EventStorage + +logger = logging.getLogger(__name__) + + +class ROIHeadsTest(unittest.TestCase): + def test_roi_heads(self): + torch.manual_seed(121) + cfg = get_cfg() + cfg.MODEL.ROI_HEADS.NAME = "StandardROIHeads" + cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead" + cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2 + cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignV2" + cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5) + backbone = build_backbone(cfg) + num_images = 2 + images_tensor = torch.rand(num_images, 20, 30) + image_sizes = [(10, 10), (20, 30)] + images = ImageList(images_tensor, image_sizes) + num_channels = 1024 + features = {"res4": torch.rand(num_images, num_channels, 1, 2)} + + image_shape = (15, 15) + gt_boxes0 = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) + gt_instance0 = Instances(image_shape) + gt_instance0.gt_boxes = Boxes(gt_boxes0) + gt_instance0.gt_classes = torch.tensor([2, 1]) + gt_boxes1 = torch.tensor([[1, 5, 2, 8], [7, 3, 10, 5]], dtype=torch.float32) + gt_instance1 = Instances(image_shape) + gt_instance1.gt_boxes = Boxes(gt_boxes1) + gt_instance1.gt_classes = torch.tensor([1, 2]) + gt_instances = [gt_instance0, gt_instance1] + + proposal_generator = build_proposal_generator(cfg, backbone.output_shape()) + roi_heads = build_roi_heads(cfg, backbone.output_shape()) + + with EventStorage(): # capture events in a new storage to discard them + proposals, proposal_losses = proposal_generator(images, features, gt_instances) + _, detector_losses = roi_heads(images, features, proposals, gt_instances) + + expected_losses = { + "loss_cls": torch.tensor(4.4236516953), + "loss_box_reg": torch.tensor(0.0091214813), + } + for name in expected_losses.keys(): + self.assertTrue(torch.allclose(detector_losses[name], expected_losses[name])) + + def test_rroi_heads(self): + torch.manual_seed(121) + cfg = get_cfg() + cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN" + cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator" + cfg.MODEL.ROI_HEADS.NAME = "RROIHeads" + cfg.MODEL.ROI_BOX_HEAD.NAME = "FastRCNNConvFCHead" + cfg.MODEL.ROI_BOX_HEAD.NUM_FC = 2 + cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1) + cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead" + cfg.MODEL.ROI_BOX_HEAD.POOLER_TYPE = "ROIAlignRotated" + cfg.MODEL.ROI_BOX_HEAD.BBOX_REG_WEIGHTS = (10, 10, 5, 5, 1) + backbone = build_backbone(cfg) + num_images = 2 + images_tensor = torch.rand(num_images, 20, 30) + image_sizes = [(10, 10), (20, 30)] + images = ImageList(images_tensor, image_sizes) + num_channels = 1024 + features = {"res4": torch.rand(num_images, num_channels, 1, 2)} + + image_shape = (15, 15) + gt_boxes0 = torch.tensor([[2, 2, 2, 2, 30], [4, 4, 4, 4, 0]], dtype=torch.float32) + gt_instance0 = Instances(image_shape) + gt_instance0.gt_boxes = RotatedBoxes(gt_boxes0) + gt_instance0.gt_classes = torch.tensor([2, 1]) + gt_boxes1 = torch.tensor([[1.5, 5.5, 1, 3, 0], [8.5, 4, 3, 2, -50]], dtype=torch.float32) + gt_instance1 = Instances(image_shape) + gt_instance1.gt_boxes = RotatedBoxes(gt_boxes1) + gt_instance1.gt_classes = torch.tensor([1, 2]) + gt_instances = [gt_instance0, gt_instance1] + + 
proposal_generator = build_proposal_generator(cfg, backbone.output_shape()) + roi_heads = build_roi_heads(cfg, backbone.output_shape()) + + with EventStorage(): # capture events in a new storage to discard them + proposals, proposal_losses = proposal_generator(images, features, gt_instances) + _, detector_losses = roi_heads(images, features, proposals, gt_instances) + + expected_losses = { + "loss_cls": torch.tensor(4.381618499755859), + "loss_box_reg": torch.tensor(0.0011829272843897343), + } + for name in expected_losses.keys(): + err_msg = "detector_losses[{}] = {}, expected losses = {}".format( + name, detector_losses[name], expected_losses[name] + ) + self.assertTrue(torch.allclose(detector_losses[name], expected_losses[name]), err_msg) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/modeling/test_roi_pooler.py b/tests/modeling/test_roi_pooler.py new file mode 100644 index 0000000000000000000000000000000000000000..9aa3825c0196e4a6d89162e3d7c797e3d77b23bd --- /dev/null +++ b/tests/modeling/test_roi_pooler.py @@ -0,0 +1,85 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import unittest +import torch + +from detectron2.modeling.poolers import ROIPooler +from detectron2.structures import Boxes, RotatedBoxes + +logger = logging.getLogger(__name__) + + +class TestROIPooler(unittest.TestCase): + def _rand_boxes(self, num_boxes, x_max, y_max): + coords = torch.rand(num_boxes, 4) + coords[:, 0] *= x_max + coords[:, 1] *= y_max + coords[:, 2] *= x_max + coords[:, 3] *= y_max + boxes = torch.zeros(num_boxes, 4) + boxes[:, 0] = torch.min(coords[:, 0], coords[:, 2]) + boxes[:, 1] = torch.min(coords[:, 1], coords[:, 3]) + boxes[:, 2] = torch.max(coords[:, 0], coords[:, 2]) + boxes[:, 3] = torch.max(coords[:, 1], coords[:, 3]) + return boxes + + def _test_roialignv2_roialignrotated_match(self, device): + pooler_resolution = 14 + canonical_level = 4 + canonical_scale_factor = 2 ** canonical_level + pooler_scales = (1.0 / canonical_scale_factor,) + sampling_ratio = 0 + + N, C, H, W = 2, 4, 10, 8 + N_rois = 10 + std = 11 + mean = 0 + feature = (torch.rand(N, C, H, W) - 0.5) * 2 * std + mean + + features = [feature.to(device)] + + rois = [] + rois_rotated = [] + for _ in range(N): + boxes = self._rand_boxes( + num_boxes=N_rois, x_max=W * canonical_scale_factor, y_max=H * canonical_scale_factor + ) + + rotated_boxes = torch.zeros(N_rois, 5) + rotated_boxes[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2.0 + rotated_boxes[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2.0 + rotated_boxes[:, 2] = boxes[:, 2] - boxes[:, 0] + rotated_boxes[:, 3] = boxes[:, 3] - boxes[:, 1] + rois.append(Boxes(boxes).to(device)) + rois_rotated.append(RotatedBoxes(rotated_boxes).to(device)) + + roialignv2_pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type="ROIAlignV2", + ) + + roialignv2_out = roialignv2_pooler(features, rois) + + roialignrotated_pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=sampling_ratio, + pooler_type="ROIAlignRotated", + ) + + roialignrotated_out = roialignrotated_pooler(features, rois_rotated) + + self.assertTrue(torch.allclose(roialignv2_out, roialignrotated_out, atol=1e-4)) + + def test_roialignv2_roialignrotated_match_cpu(self): + self._test_roialignv2_roialignrotated_match(device="cpu") + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_roialignv2_roialignrotated_match_cuda(self): + 
self._test_roialignv2_roialignrotated_match(device="cuda") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/modeling/test_rpn.py b/tests/modeling/test_rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..967d2102b85f2d66e3f0b32b31805c4ac01afa0c --- /dev/null +++ b/tests/modeling/test_rpn.py @@ -0,0 +1,234 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import logging +import unittest +import torch + +from detectron2.config import get_cfg +from detectron2.modeling.backbone import build_backbone +from detectron2.modeling.proposal_generator.build import build_proposal_generator +from detectron2.modeling.proposal_generator.rpn_outputs import find_top_rpn_proposals +from detectron2.structures import Boxes, ImageList, Instances, RotatedBoxes +from detectron2.utils.events import EventStorage + +logger = logging.getLogger(__name__) + + +class RPNTest(unittest.TestCase): + def test_rpn(self): + torch.manual_seed(121) + cfg = get_cfg() + cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RPN" + cfg.MODEL.ANCHOR_GENERATOR.NAME = "DefaultAnchorGenerator" + cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1) + backbone = build_backbone(cfg) + proposal_generator = build_proposal_generator(cfg, backbone.output_shape()) + num_images = 2 + images_tensor = torch.rand(num_images, 20, 30) + image_sizes = [(10, 10), (20, 30)] + images = ImageList(images_tensor, image_sizes) + image_shape = (15, 15) + num_channels = 1024 + features = {"res4": torch.rand(num_images, num_channels, 1, 2)} + gt_boxes = torch.tensor([[1, 1, 3, 3], [2, 2, 6, 6]], dtype=torch.float32) + gt_instances = Instances(image_shape) + gt_instances.gt_boxes = Boxes(gt_boxes) + with EventStorage(): # capture events in a new storage to discard them + proposals, proposal_losses = proposal_generator( + images, features, [gt_instances[0], gt_instances[1]] + ) + + expected_losses = { + "loss_rpn_cls": torch.tensor(0.0804563984), + "loss_rpn_loc": torch.tensor(0.0990132466), + } + for name in expected_losses.keys(): + err_msg = "proposal_losses[{}] = {}, expected losses = {}".format( + name, proposal_losses[name], expected_losses[name] + ) + self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]), err_msg) + + expected_proposal_boxes = [ + Boxes(torch.tensor([[0, 0, 10, 10], [7.3365392685, 0, 10, 10]])), + Boxes( + torch.tensor( + [ + [0, 0, 30, 20], + [0, 0, 16.7862777710, 13.1362524033], + [0, 0, 30, 13.3173446655], + [0, 0, 10.8602609634, 20], + [7.7165775299, 0, 27.3875980377, 20], + ] + ) + ), + ] + + expected_objectness_logits = [ + torch.tensor([0.1225359365, -0.0133192837]), + torch.tensor([0.1415634006, 0.0989848152, 0.0565387346, -0.0072308783, -0.0428492837]), + ] + + for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip( + proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits + ): + self.assertEqual(len(proposal), len(expected_proposal_box)) + self.assertEqual(proposal.image_size, im_size) + self.assertTrue( + torch.allclose(proposal.proposal_boxes.tensor, expected_proposal_box.tensor) + ) + self.assertTrue(torch.allclose(proposal.objectness_logits, expected_objectness_logit)) + + def test_rrpn(self): + torch.manual_seed(121) + cfg = get_cfg() + cfg.MODEL.PROPOSAL_GENERATOR.NAME = "RRPN" + cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator" + cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[32, 64]] + cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[0.25, 1]] + cfg.MODEL.ANCHOR_GENERATOR.ANGLES = [[0, 60]] + 
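+        # Rotated boxes are parameterized as (x_ctr, y_ctr, w, h, angle), so RRPN regresses five deltas
+        # and therefore takes five box-regression weights below.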
cfg.MODEL.RPN.BBOX_REG_WEIGHTS = (1, 1, 1, 1, 1) + cfg.MODEL.RPN.HEAD_NAME = "StandardRPNHead" + backbone = build_backbone(cfg) + proposal_generator = build_proposal_generator(cfg, backbone.output_shape()) + num_images = 2 + images_tensor = torch.rand(num_images, 20, 30) + image_sizes = [(10, 10), (20, 30)] + images = ImageList(images_tensor, image_sizes) + image_shape = (15, 15) + num_channels = 1024 + features = {"res4": torch.rand(num_images, num_channels, 1, 2)} + gt_boxes = torch.tensor([[2, 2, 2, 2, 0], [4, 4, 4, 4, 0]], dtype=torch.float32) + gt_instances = Instances(image_shape) + gt_instances.gt_boxes = RotatedBoxes(gt_boxes) + with EventStorage(): # capture events in a new storage to discard them + proposals, proposal_losses = proposal_generator( + images, features, [gt_instances[0], gt_instances[1]] + ) + + expected_losses = { + "loss_rpn_cls": torch.tensor(0.043263837695121765), + "loss_rpn_loc": torch.tensor(0.14432406425476074), + } + for name in expected_losses.keys(): + err_msg = "proposal_losses[{}] = {}, expected losses = {}".format( + name, proposal_losses[name], expected_losses[name] + ) + self.assertTrue(torch.allclose(proposal_losses[name], expected_losses[name]), err_msg) + + expected_proposal_boxes = [ + RotatedBoxes( + torch.tensor( + [ + [0.60189795, 1.24095452, 61.98131943, 18.03621292, -4.07244873], + [15.64940453, 1.69624567, 59.59749603, 16.34339333, 2.62692475], + [-3.02982378, -2.69752932, 67.90952301, 59.62455750, 59.97010040], + [16.71863365, 1.98309708, 35.61507797, 32.81484985, 62.92267227], + [0.49432933, -7.92979717, 67.77606201, 62.93098450, -1.85656738], + [8.00880814, 1.36017394, 121.81007385, 32.74150467, 50.44297409], + [16.44299889, -4.82221127, 63.39775848, 61.22503662, 54.12270737], + [5.00000000, 5.00000000, 10.00000000, 10.00000000, -0.76943970], + [17.64130402, -0.98095351, 61.40377808, 16.28918839, 55.53118134], + [0.13016054, 4.60568953, 35.80157471, 32.30180359, 62.52872086], + [-4.26460743, 0.39604485, 124.30079651, 31.84611320, -1.58203125], + [7.52815342, -0.91636634, 62.39784622, 15.45565224, 60.79549789], + ] + ) + ), + RotatedBoxes( + torch.tensor( + [ + [0.07734215, 0.81635046, 65.33510590, 17.34688377, -1.51821899], + [-3.41833067, -3.11320257, 64.17595673, 60.55617905, 58.27033234], + [20.67383385, -6.16561556, 63.60531998, 62.52315903, 54.85546494], + [15.00000000, 10.00000000, 30.00000000, 20.00000000, -0.18218994], + [9.22646523, -6.84775209, 62.09895706, 65.46472931, -2.74307251], + [15.00000000, 4.93451595, 30.00000000, 9.86903191, -0.60272217], + [8.88342094, 2.65560246, 120.95362854, 32.45022202, 55.75970078], + [16.39088631, 2.33887148, 34.78761292, 35.61492920, 60.81977463], + [9.78298569, 10.00000000, 19.56597137, 20.00000000, -0.86660767], + [1.28576660, 5.49873352, 34.93610382, 33.22600174, 60.51599884], + [17.58912468, -1.63270092, 62.96052551, 16.45713997, 52.91245270], + [5.64749718, -1.90428460, 62.37649155, 16.19474792, 61.09543991], + [0.82255805, 2.34931135, 118.83985901, 32.83671188, 56.50753784], + [-5.33874989, 1.64404404, 125.28501892, 33.35424042, -2.80731201], + ] + ) + ), + ] + + expected_objectness_logits = [ + torch.tensor( + [ + 0.10111768, + 0.09112845, + 0.08466332, + 0.07589971, + 0.06650183, + 0.06350251, + 0.04299347, + 0.01864817, + 0.00986163, + 0.00078543, + -0.04573630, + -0.04799230, + ] + ), + torch.tensor( + [ + 0.11373727, + 0.09377633, + 0.05281663, + 0.05143715, + 0.04040275, + 0.03250912, + 0.01307789, + 0.01177734, + 0.00038105, + -0.00540255, + -0.01194804, + -0.01461012, + 
-0.03061717, + -0.03599222, + ] + ), + ] + + torch.set_printoptions(precision=8, sci_mode=False) + + for proposal, expected_proposal_box, im_size, expected_objectness_logit in zip( + proposals, expected_proposal_boxes, image_sizes, expected_objectness_logits + ): + self.assertEqual(len(proposal), len(expected_proposal_box)) + self.assertEqual(proposal.image_size, im_size) + # It seems that there's some randomness in the result across different machines: + # This test can be run on a local machine for 100 times with exactly the same result, + # However, a different machine might produce slightly different results, + # thus the atol here. + err_msg = "computed proposal boxes = {}, expected {}".format( + proposal.proposal_boxes.tensor, expected_proposal_box.tensor + ) + self.assertTrue( + torch.allclose( + proposal.proposal_boxes.tensor, expected_proposal_box.tensor, atol=1e-5 + ), + err_msg, + ) + + err_msg = "computed objectness logits = {}, expected {}".format( + proposal.objectness_logits, expected_objectness_logit + ) + self.assertTrue( + torch.allclose(proposal.objectness_logits, expected_objectness_logit, atol=1e-5), + err_msg, + ) + + def test_rpn_proposals_inf(self): + N, Hi, Wi, A = 3, 3, 3, 3 + proposals = [torch.rand(N, Hi * Wi * A, 4)] + pred_logits = [torch.rand(N, Hi * Wi * A)] + pred_logits[0][1][3:5].fill_(float("inf")) + images = ImageList.from_tensors([torch.rand(3, 10, 10)] * 3) + find_top_rpn_proposals(proposals, pred_logits, images, 0.5, 1000, 1000, 0, False) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/structures/__init__.py b/tests/structures/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tests/structures/test_boxes.py b/tests/structures/test_boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..4d33c3bf9b7471c7e4382bc9e66c26e1fb60e29f --- /dev/null +++ b/tests/structures/test_boxes.py @@ -0,0 +1,182 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import json +import math +import numpy as np +import unittest +import torch + +from detectron2.structures import Boxes, BoxMode, pairwise_iou + + +class TestBoxMode(unittest.TestCase): + def _convert_xy_to_wh(self, x): + return BoxMode.convert(x, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + def _convert_xywha_to_xyxy(self, x): + return BoxMode.convert(x, BoxMode.XYWHA_ABS, BoxMode.XYXY_ABS) + + def _convert_xywh_to_xywha(self, x): + return BoxMode.convert(x, BoxMode.XYWH_ABS, BoxMode.XYWHA_ABS) + + def test_box_convert_list(self): + for tp in [list, tuple]: + box = tp([5.0, 5.0, 10.0, 10.0]) + output = self._convert_xy_to_wh(box) + self.assertIsInstance(output, tp) + self.assertIsInstance(output[0], float) + self.assertEqual(output, tp([5.0, 5.0, 5.0, 5.0])) + + with self.assertRaises(Exception): + self._convert_xy_to_wh([box]) + + def test_box_convert_array(self): + box = np.asarray([[5, 5, 10, 10], [1, 1, 2, 3]]) + output = self._convert_xy_to_wh(box) + self.assertEqual(output.dtype, box.dtype) + self.assertEqual(output.shape, box.shape) + self.assertTrue((output[0] == [5, 5, 5, 5]).all()) + self.assertTrue((output[1] == [1, 1, 1, 2]).all()) + + def test_box_convert_cpu_tensor(self): + box = torch.tensor([[5, 5, 10, 10], [1, 1, 2, 3]]) + output = self._convert_xy_to_wh(box) + self.assertEqual(output.dtype, box.dtype) + self.assertEqual(output.shape, box.shape) + output = output.numpy() + self.assertTrue((output[0] == [5, 5, 5, 5]).all()) + self.assertTrue((output[1] == [1, 1, 1, 2]).all()) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_box_convert_cuda_tensor(self): + box = torch.tensor([[5, 5, 10, 10], [1, 1, 2, 3]]).cuda() + output = self._convert_xy_to_wh(box) + self.assertEqual(output.dtype, box.dtype) + self.assertEqual(output.shape, box.shape) + self.assertEqual(output.device, box.device) + output = output.cpu().numpy() + self.assertTrue((output[0] == [5, 5, 5, 5]).all()) + self.assertTrue((output[1] == [1, 1, 1, 2]).all()) + + def test_box_convert_xywha_to_xyxy_list(self): + for tp in [list, tuple]: + box = tp([50, 50, 30, 20, 0]) + output = self._convert_xywha_to_xyxy(box) + self.assertIsInstance(output, tp) + self.assertEqual(output, tp([35, 40, 65, 60])) + + with self.assertRaises(Exception): + self._convert_xywha_to_xyxy([box]) + + def test_box_convert_xywha_to_xyxy_array(self): + for dtype in [np.float64, np.float32]: + box = np.asarray( + [ + [50, 50, 30, 20, 0], + [50, 50, 30, 20, 90], + [1, 1, math.sqrt(2), math.sqrt(2), -45], + ], + dtype=dtype, + ) + output = self._convert_xywha_to_xyxy(box) + self.assertEqual(output.dtype, box.dtype) + expected = np.asarray([[35, 40, 65, 60], [40, 35, 60, 65], [0, 0, 2, 2]], dtype=dtype) + self.assertTrue(np.allclose(output, expected, atol=1e-6), "output={}".format(output)) + + def test_box_convert_xywha_to_xyxy_tensor(self): + for dtype in [torch.float32, torch.float64]: + box = torch.tensor( + [ + [50, 50, 30, 20, 0], + [50, 50, 30, 20, 90], + [1, 1, math.sqrt(2), math.sqrt(2), -45], + ], + dtype=dtype, + ) + output = self._convert_xywha_to_xyxy(box) + self.assertEqual(output.dtype, box.dtype) + expected = torch.tensor([[35, 40, 65, 60], [40, 35, 60, 65], [0, 0, 2, 2]], dtype=dtype) + + self.assertTrue(torch.allclose(output, expected, atol=1e-6), "output={}".format(output)) + + def test_box_convert_xywh_to_xywha_list(self): + for tp in [list, tuple]: + box = tp([50, 50, 30, 20]) + output = self._convert_xywh_to_xywha(box) + self.assertIsInstance(output, tp) + 
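+            # XYWH (x0, y0, w, h) = (50, 50, 30, 20) maps to XYWHA with center (50 + 30/2, 50 + 20/2) = (65, 60),
+            # the same width/height, and angle 0.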
self.assertEqual(output, tp([65, 60, 30, 20, 0])) + + with self.assertRaises(Exception): + self._convert_xywh_to_xywha([box]) + + def test_box_convert_xywh_to_xywha_array(self): + for dtype in [np.float64, np.float32]: + box = np.asarray([[30, 40, 70, 60], [30, 40, 60, 70], [-1, -1, 2, 2]], dtype=dtype) + output = self._convert_xywh_to_xywha(box) + self.assertEqual(output.dtype, box.dtype) + expected = np.asarray( + [[65, 70, 70, 60, 0], [60, 75, 60, 70, 0], [0, 0, 2, 2, 0]], dtype=dtype + ) + self.assertTrue(np.allclose(output, expected, atol=1e-6), "output={}".format(output)) + + def test_box_convert_xywh_to_xywha_tensor(self): + for dtype in [torch.float32, torch.float64]: + box = torch.tensor([[30, 40, 70, 60], [30, 40, 60, 70], [-1, -1, 2, 2]], dtype=dtype) + output = self._convert_xywh_to_xywha(box) + self.assertEqual(output.dtype, box.dtype) + expected = torch.tensor( + [[65, 70, 70, 60, 0], [60, 75, 60, 70, 0], [0, 0, 2, 2, 0]], dtype=dtype + ) + + self.assertTrue(torch.allclose(output, expected, atol=1e-6), "output={}".format(output)) + + def test_json_serializable(self): + payload = {"box_mode": BoxMode.XYWH_REL} + try: + json.dumps(payload) + except Exception: + self.fail("JSON serialization failed") + + def test_json_deserializable(self): + payload = '{"box_mode": 2}' + obj = json.loads(payload) + try: + obj["box_mode"] = BoxMode(obj["box_mode"]) + except Exception: + self.fail("JSON deserialization failed") + + +class TestBoxIOU(unittest.TestCase): + def test_pairwise_iou(self): + boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]]) + + boxes2 = torch.tensor( + [ + [0.0, 0.0, 1.0, 1.0], + [0.0, 0.0, 0.5, 1.0], + [0.0, 0.0, 1.0, 0.5], + [0.0, 0.0, 0.5, 0.5], + [0.5, 0.5, 1.0, 1.0], + [0.5, 0.5, 1.5, 1.5], + ] + ) + + expected_ious = torch.tensor( + [ + [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)], + [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)], + ] + ) + + ious = pairwise_iou(Boxes(boxes1), Boxes(boxes2)) + + self.assertTrue(torch.allclose(ious, expected_ious)) + + +class TestBoxes(unittest.TestCase): + def test_empty_cat(self): + x = Boxes.cat([]) + self.assertTrue(x.tensor.shape, (0, 4)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/structures/test_imagelist.py b/tests/structures/test_imagelist.py new file mode 100644 index 0000000000000000000000000000000000000000..abeb35569ddc34a618735f4989dfbfae23d47bc1 --- /dev/null +++ b/tests/structures/test_imagelist.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved + +import unittest +from typing import Sequence +import torch + +from detectron2.structures import ImageList + + +class TestImageList(unittest.TestCase): + def test_imagelist_padding_shape(self): + class TensorToImageList(torch.nn.Module): + def forward(self, tensors: Sequence[torch.Tensor]): + return ImageList.from_tensors(tensors, 4).tensor + + func = torch.jit.trace( + TensorToImageList(), ([torch.ones((3, 10, 10), dtype=torch.float32)],) + ) + ret = func([torch.ones((3, 15, 20), dtype=torch.float32)]) + self.assertEqual(list(ret.shape), [1, 3, 16, 20], str(ret.shape)) + + func = torch.jit.trace( + TensorToImageList(), + ( + [ + torch.ones((3, 16, 10), dtype=torch.float32), + torch.ones((3, 13, 11), dtype=torch.float32), + ], + ), + ) + ret = func( + [ + torch.ones((3, 25, 20), dtype=torch.float32), + torch.ones((3, 10, 10), dtype=torch.float32), + ] + ) + # does not support calling with different #images + self.assertEqual(list(ret.shape), [2, 3, 28, 20], str(ret.shape)) diff --git a/tests/structures/test_instances.py b/tests/structures/test_instances.py new file mode 100644 index 0000000000000000000000000000000000000000..79c5249217633d3f144d02f14d11f32d1d4be7c9 --- /dev/null +++ b/tests/structures/test_instances.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import unittest +import torch + +from detectron2.structures import Instances + + +class TestInstancesIndexing(unittest.TestCase): + def test_int_indexing(self): + attr1 = torch.tensor([[0.0, 0.0, 1.0], [0.0, 0.0, 0.5], [0.0, 0.0, 1.0], [0.0, 0.5, 0.5]]) + attr2 = torch.tensor([0.1, 0.2, 0.3, 0.4]) + instances = Instances((100, 100)) + instances.attr1 = attr1 + instances.attr2 = attr2 + for i in range(-len(instances), len(instances)): + inst = instances[i] + self.assertEqual((inst.attr1 == attr1[i]).all(), True) + self.assertEqual((inst.attr2 == attr2[i]).all(), True) + + self.assertRaises(IndexError, lambda: instances[len(instances)]) + self.assertRaises(IndexError, lambda: instances[-len(instances) - 1]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/structures/test_rotated_boxes.py b/tests/structures/test_rotated_boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..575ac480e39d7406e55f4ff45b867e6f5c3796a0 --- /dev/null +++ b/tests/structures/test_rotated_boxes.py @@ -0,0 +1,357 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +from __future__ import absolute_import, division, print_function, unicode_literals +import logging +import math +import random +import unittest +import torch +from fvcore.common.benchmark import benchmark + +from detectron2.layers.rotated_boxes import pairwise_iou_rotated +from detectron2.structures.boxes import Boxes +from detectron2.structures.rotated_boxes import RotatedBoxes, pairwise_iou + +logger = logging.getLogger(__name__) + + +class TestRotatedBoxesLayer(unittest.TestCase): + def test_iou_0_dim_cpu(self): + boxes1 = torch.rand(0, 5, dtype=torch.float32) + boxes2 = torch.rand(10, 5, dtype=torch.float32) + expected_ious = torch.zeros(0, 10, dtype=torch.float32) + ious = pairwise_iou_rotated(boxes1, boxes2) + self.assertTrue(torch.allclose(ious, expected_ious)) + + boxes1 = torch.rand(10, 5, dtype=torch.float32) + boxes2 = torch.rand(0, 5, dtype=torch.float32) + expected_ious = torch.zeros(10, 0, dtype=torch.float32) + ious = pairwise_iou_rotated(boxes1, boxes2) + self.assertTrue(torch.allclose(ious, expected_ious)) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_iou_0_dim_cuda(self): + boxes1 = torch.rand(0, 5, dtype=torch.float32) + boxes2 = torch.rand(10, 5, dtype=torch.float32) + expected_ious = torch.zeros(0, 10, dtype=torch.float32) + ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda()) + self.assertTrue(torch.allclose(ious_cuda.cpu(), expected_ious)) + + boxes1 = torch.rand(10, 5, dtype=torch.float32) + boxes2 = torch.rand(0, 5, dtype=torch.float32) + expected_ious = torch.zeros(10, 0, dtype=torch.float32) + ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda()) + self.assertTrue(torch.allclose(ious_cuda.cpu(), expected_ious)) + + def test_iou_half_overlap_cpu(self): + boxes1 = torch.tensor([[0.5, 0.5, 1.0, 1.0, 0.0]], dtype=torch.float32) + boxes2 = torch.tensor([[0.25, 0.5, 0.5, 1.0, 0.0]], dtype=torch.float32) + expected_ious = torch.tensor([[0.5]], dtype=torch.float32) + ious = pairwise_iou_rotated(boxes1, boxes2) + self.assertTrue(torch.allclose(ious, expected_ious)) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_iou_half_overlap_cuda(self): + boxes1 = torch.tensor([[0.5, 0.5, 1.0, 1.0, 0.0]], dtype=torch.float32) + boxes2 = torch.tensor([[0.25, 0.5, 0.5, 1.0, 0.0]], dtype=torch.float32) + expected_ious = torch.tensor([[0.5]], dtype=torch.float32) + ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda()) + self.assertTrue(torch.allclose(ious_cuda.cpu(), expected_ious)) + + def test_iou_precision(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + boxes1 = torch.tensor([[565, 565, 10, 10.0, 0]], dtype=torch.float32, device=device) + boxes2 = torch.tensor([[565, 565, 10, 8.3, 0]], dtype=torch.float32, device=device) + iou = 8.3 / 10.0 + expected_ious = torch.tensor([[iou]], dtype=torch.float32) + ious = pairwise_iou_rotated(boxes1, boxes2) + self.assertTrue(torch.allclose(ious.cpu(), expected_ious)) + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def test_iou_too_many_boxes_cuda(self): + s1, s2 = 5, 1289035 + boxes1 = torch.zeros(s1, 5) + boxes2 = torch.zeros(s2, 5) + ious_cuda = pairwise_iou_rotated(boxes1.cuda(), boxes2.cuda()) + self.assertTupleEqual(tuple(ious_cuda.shape), (s1, s2)) + + def test_iou_extreme(self): + # Cause floating point issues in cuda kernels (#1266) + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + boxes1 = torch.tensor([[160.0, 153.0, 230.0, 23.0, 
-37.0]], device=device) + boxes2 = torch.tensor( + [ + [ + -1.117407639806935e17, + 1.3858420478349148e18, + 1000.0000610351562, + 1000.0000610351562, + 1612.0, + ] + ], + device=device, + ) + ious = pairwise_iou_rotated(boxes1, boxes2) + self.assertTrue(ious.min() >= 0, ious) + + +class TestRotatedBoxesStructure(unittest.TestCase): + def test_clip_area_0_degree(self): + for _ in range(50): + num_boxes = 100 + boxes_5d = torch.zeros(num_boxes, 5) + boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500) + boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500) + boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500) + boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500) + # Convert from (x_ctr, y_ctr, w, h, 0) to (x1, y1, x2, y2) + boxes_4d = torch.zeros(num_boxes, 4) + boxes_4d[:, 0] = boxes_5d[:, 0] - boxes_5d[:, 2] / 2.0 + boxes_4d[:, 1] = boxes_5d[:, 1] - boxes_5d[:, 3] / 2.0 + boxes_4d[:, 2] = boxes_5d[:, 0] + boxes_5d[:, 2] / 2.0 + boxes_4d[:, 3] = boxes_5d[:, 1] + boxes_5d[:, 3] / 2.0 + + image_size = (500, 600) + test_boxes_4d = Boxes(boxes_4d) + test_boxes_5d = RotatedBoxes(boxes_5d) + # Before clip + areas_4d = test_boxes_4d.area() + areas_5d = test_boxes_5d.area() + self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5)) + # After clip + test_boxes_4d.clip(image_size) + test_boxes_5d.clip(image_size) + areas_4d = test_boxes_4d.area() + areas_5d = test_boxes_5d.area() + self.assertTrue(torch.allclose(areas_4d, areas_5d, atol=1e-1, rtol=1e-5)) + + def test_clip_area_arbitrary_angle(self): + num_boxes = 100 + boxes_5d = torch.zeros(num_boxes, 5) + boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500) + boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500) + boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500) + boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500) + boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800) + clip_angle_threshold = random.uniform(0, 180) + + image_size = (500, 600) + test_boxes_5d = RotatedBoxes(boxes_5d) + # Before clip + areas_before = test_boxes_5d.area() + # After clip + test_boxes_5d.clip(image_size, clip_angle_threshold) + areas_diff = test_boxes_5d.area() - areas_before + + # the areas should only decrease after clipping + self.assertTrue(torch.all(areas_diff <= 0)) + # whenever the box is clipped (thus the area shrinks), + # the angle for the box must be within the clip_angle_threshold + # Note that the clip function will normalize the angle range + # to be within (-180, 180] + self.assertTrue( + torch.all(torch.abs(boxes_5d[:, 4][torch.where(areas_diff < 0)]) < clip_angle_threshold) + ) + + def test_normalize_angles(self): + # torch.manual_seed(0) + for _ in range(50): + num_boxes = 100 + boxes_5d = torch.zeros(num_boxes, 5) + boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-100, 500) + boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-100, 500) + boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, 500) + boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, 500) + boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800) + rotated_boxes = RotatedBoxes(boxes_5d) + normalized_boxes = rotated_boxes.clone() + normalized_boxes.normalize_angles() + self.assertTrue(torch.all(normalized_boxes.tensor[:, 4] >= -180)) + self.assertTrue(torch.all(normalized_boxes.tensor[:, 4] < 180)) + # x, y, w, h should not change + self.assertTrue(torch.allclose(boxes_5d[:, :4], normalized_boxes.tensor[:, :4])) + # the 
cos/sin values of the angles should stay the same + + self.assertTrue( + torch.allclose( + torch.cos(boxes_5d[:, 4] * math.pi / 180), + torch.cos(normalized_boxes.tensor[:, 4] * math.pi / 180), + atol=1e-5, + ) + ) + + self.assertTrue( + torch.allclose( + torch.sin(boxes_5d[:, 4] * math.pi / 180), + torch.sin(normalized_boxes.tensor[:, 4] * math.pi / 180), + atol=1e-5, + ) + ) + + def test_pairwise_iou_0_degree(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + boxes1 = torch.tensor( + [[0.5, 0.5, 1.0, 1.0, 0.0], [0.5, 0.5, 1.0, 1.0, 0.0]], + dtype=torch.float32, + device=device, + ) + boxes2 = torch.tensor( + [ + [0.5, 0.5, 1.0, 1.0, 0.0], + [0.25, 0.5, 0.5, 1.0, 0.0], + [0.5, 0.25, 1.0, 0.5, 0.0], + [0.25, 0.25, 0.5, 0.5, 0.0], + [0.75, 0.75, 0.5, 0.5, 0.0], + [1.0, 1.0, 1.0, 1.0, 0.0], + ], + dtype=torch.float32, + device=device, + ) + expected_ious = torch.tensor( + [ + [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)], + [1.0, 0.5, 0.5, 0.25, 0.25, 0.25 / (2 - 0.25)], + ], + dtype=torch.float32, + device=device, + ) + ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2)) + self.assertTrue(torch.allclose(ious, expected_ious)) + + def test_pairwise_iou_45_degrees(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + boxes1 = torch.tensor( + [ + [1, 1, math.sqrt(2), math.sqrt(2), 45], + [1, 1, 2 * math.sqrt(2), 2 * math.sqrt(2), -45], + ], + dtype=torch.float32, + device=device, + ) + boxes2 = torch.tensor([[1, 1, 2, 2, 0]], dtype=torch.float32, device=device) + expected_ious = torch.tensor([[0.5], [0.5]], dtype=torch.float32, device=device) + ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2)) + self.assertTrue(torch.allclose(ious, expected_ious)) + + def test_pairwise_iou_orthogonal(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + boxes1 = torch.tensor([[5, 5, 10, 6, 55]], dtype=torch.float32, device=device) + boxes2 = torch.tensor([[5, 5, 10, 6, -35]], dtype=torch.float32, device=device) + iou = (6.0 * 6.0) / (6.0 * 6.0 + 4.0 * 6.0 + 4.0 * 6.0) + expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device) + ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2)) + self.assertTrue(torch.allclose(ious, expected_ious)) + + def test_pairwise_iou_large_close_boxes(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + boxes1 = torch.tensor( + [[299.500000, 417.370422, 600.000000, 364.259186, 27.1828]], + dtype=torch.float32, + device=device, + ) + boxes2 = torch.tensor( + [[299.500000, 417.370422, 600.000000, 364.259155, 27.1828]], + dtype=torch.float32, + device=device, + ) + iou = 364.259155 / 364.259186 + expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device) + ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2)) + self.assertTrue(torch.allclose(ious, expected_ious)) + + def test_pairwise_iou_many_boxes(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + num_boxes1 = 100 + num_boxes2 = 200 + boxes1 = torch.stack( + [ + torch.tensor( + [5 + 20 * i, 5 + 20 * i, 10, 10, 0], dtype=torch.float32, device=device + ) + for i in range(num_boxes1) + ] + ) + boxes2 = torch.stack( + [ + torch.tensor( + [5 + 20 * i, 5 + 20 * i, 10, 1 + 9 * i / num_boxes2, 0], + dtype=torch.float32, + device=device, + ) + for i in range(num_boxes2) + ] + ) + expected_ious = torch.zeros(num_boxes1, num_boxes2, dtype=torch.float32, device=device) + for i in range(min(num_boxes1, num_boxes2)): + 
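+                # Box i in the two sets shares the same center and width, and boxes2[i] is shorter,
+                # so it lies entirely inside boxes1[i]; the IoU is the height ratio (1 + 9 * i / num_boxes2) / 10.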
expected_ious[i][i] = (1 + 9 * i / num_boxes2) / 10.0 + ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2)) + self.assertTrue(torch.allclose(ious, expected_ious)) + + def test_pairwise_iou_issue1207_simplified(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + # Simplified test case of D2-issue-1207 + boxes1 = torch.tensor([[3, 3, 8, 2, -45.0]], device=device) + boxes2 = torch.tensor([[6, 0, 8, 2, -45.0]], device=device) + iou = 0.0 + expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device) + + ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2)) + self.assertTrue(torch.allclose(ious, expected_ious)) + + def test_pairwise_iou_issue1207(self): + for device in ["cpu"] + ["cuda"] if torch.cuda.is_available() else []: + # The original test case in D2-issue-1207 + boxes1 = torch.tensor([[160.0, 153.0, 230.0, 23.0, -37.0]], device=device) + boxes2 = torch.tensor([[190.0, 127.0, 80.0, 21.0, -46.0]], device=device) + + iou = 0.0 + expected_ious = torch.tensor([[iou]], dtype=torch.float32, device=device) + + ious = pairwise_iou(RotatedBoxes(boxes1), RotatedBoxes(boxes2)) + self.assertTrue(torch.allclose(ious, expected_ious)) + + def test_empty_cat(self): + x = RotatedBoxes.cat([]) + self.assertTrue(x.tensor.shape, (0, 5)) + + +def benchmark_rotated_iou(): + num_boxes1 = 200 + num_boxes2 = 500 + boxes1 = torch.stack( + [ + torch.tensor([5 + 20 * i, 5 + 20 * i, 10, 10, 0], dtype=torch.float32) + for i in range(num_boxes1) + ] + ) + boxes2 = torch.stack( + [ + torch.tensor( + [5 + 20 * i, 5 + 20 * i, 10, 1 + 9 * i / num_boxes2, 0], dtype=torch.float32 + ) + for i in range(num_boxes2) + ] + ) + + def func(dev, n=1): + b1 = boxes1.to(device=dev) + b2 = boxes2.to(device=dev) + + def bench(): + for _ in range(n): + pairwise_iou_rotated(b1, b2) + if dev.type == "cuda": + torch.cuda.synchronize() + + return bench + + # only run it once per timed loop, since it's slow + args = [{"dev": torch.device("cpu"), "n": 1}] + if torch.cuda.is_available(): + args.append({"dev": torch.device("cuda"), "n": 10}) + + benchmark(func, "rotated_iou", args, warmup_iters=3) + + +if __name__ == "__main__": + unittest.main() + benchmark_rotated_iou() diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..725b488fdaec5d2b3a5c6d11c11d2c362453a2a4 --- /dev/null +++ b/tests/test_checkpoint.py @@ -0,0 +1,48 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import unittest +from collections import OrderedDict +import torch +from torch import nn + +from detectron2.checkpoint.c2_model_loading import align_and_update_state_dicts +from detectron2.utils.logger import setup_logger + + +class TestCheckpointer(unittest.TestCase): + def setUp(self): + setup_logger() + + def create_complex_model(self): + m = nn.Module() + m.block1 = nn.Module() + m.block1.layer1 = nn.Linear(2, 3) + m.layer2 = nn.Linear(3, 2) + m.res = nn.Module() + m.res.layer2 = nn.Linear(3, 2) + + state_dict = OrderedDict() + state_dict["layer1.weight"] = torch.rand(3, 2) + state_dict["layer1.bias"] = torch.rand(3) + state_dict["layer2.weight"] = torch.rand(2, 3) + state_dict["layer2.bias"] = torch.rand(2) + state_dict["res.layer2.weight"] = torch.rand(2, 3) + state_dict["res.layer2.bias"] = torch.rand(2) + return m, state_dict + + def test_complex_model_loaded(self): + for add_data_parallel in [False, True]: + model, state_dict = self.create_complex_model() + if add_data_parallel: + model = nn.DataParallel(model) + model_sd = model.state_dict() + + align_and_update_state_dicts(model_sd, state_dict) + for loaded, stored in zip(model_sd.values(), state_dict.values()): + # different tensor references + self.assertFalse(id(loaded) == id(stored)) + # same content + self.assertTrue(loaded.equal(stored)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000000000000000000000000000000000000..650bdf2c42107c7031709653783cb2f3043e1bdf --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + + +import os +import tempfile +import unittest +import torch + +from detectron2.config import configurable, downgrade_config, get_cfg, upgrade_config +from detectron2.layers import ShapeSpec + +_V0_CFG = """ +MODEL: + RPN_HEAD: + NAME: "TEST" +VERSION: 0 +""" + +_V1_CFG = """ +MODEL: + WEIGHT: "/path/to/weight" +""" + + +class TestConfigVersioning(unittest.TestCase): + def test_upgrade_downgrade_consistency(self): + cfg = get_cfg() + # check that custom is preserved + cfg.USER_CUSTOM = 1 + + down = downgrade_config(cfg, to_version=0) + up = upgrade_config(down) + self.assertTrue(up == cfg) + + def _merge_cfg_str(self, cfg, merge_str): + f = tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) + try: + f.write(merge_str) + f.close() + cfg.merge_from_file(f.name) + finally: + os.remove(f.name) + return cfg + + def test_auto_upgrade(self): + cfg = get_cfg() + latest_ver = cfg.VERSION + cfg.USER_CUSTOM = 1 + + self._merge_cfg_str(cfg, _V0_CFG) + + self.assertEqual(cfg.MODEL.RPN.HEAD_NAME, "TEST") + self.assertEqual(cfg.VERSION, latest_ver) + + def test_guess_v1(self): + cfg = get_cfg() + latest_ver = cfg.VERSION + self._merge_cfg_str(cfg, _V1_CFG) + self.assertEqual(cfg.VERSION, latest_ver) + + +class _TestClassA(torch.nn.Module): + @configurable + def __init__(self, arg1, arg2, arg3=3): + super().__init__() + self.arg1 = arg1 + self.arg2 = arg2 + self.arg3 = arg3 + assert arg1 == 1 + assert arg2 == 2 + assert arg3 == 3 + + @classmethod + def from_config(cls, cfg): + args = {"arg1": cfg.ARG1, "arg2": cfg.ARG2} + return args + + +class _TestClassB(_TestClassA): + @configurable + def __init__(self, input_shape, arg1, arg2, arg3=3): + """ + Doc of _TestClassB + """ + assert input_shape == "shape" + super().__init__(arg1, arg2, arg3) + + @classmethod + def from_config(cls, cfg, input_shape): # 
test extra positional arg in from_config + args = {"arg1": cfg.ARG1, "arg2": cfg.ARG2} + args["input_shape"] = input_shape + return args + + +class _LegacySubClass(_TestClassB): + # an old subclass written in cfg style + def __init__(self, cfg, input_shape, arg4=4): + super().__init__(cfg, input_shape) + assert self.arg1 == 1 + assert self.arg2 == 2 + assert self.arg3 == 3 + + +class _NewSubClassNewInit(_TestClassB): + # test new subclass with a new __init__ + @configurable + def __init__(self, input_shape, arg4=4, **kwargs): + super().__init__(input_shape, **kwargs) + assert self.arg1 == 1 + assert self.arg2 == 2 + assert self.arg3 == 3 + + +class _LegacySubClassNotCfg(_TestClassB): + # an old subclass written in cfg style, but argument is not called "cfg" + def __init__(self, config, input_shape): + super().__init__(config, input_shape) + assert self.arg1 == 1 + assert self.arg2 == 2 + assert self.arg3 == 3 + + +class _TestClassC(_TestClassB): + @classmethod + def from_config(cls, cfg, input_shape, **kwargs): # test extra kwarg overwrite + args = {"arg1": cfg.ARG1, "arg2": cfg.ARG2} + args["input_shape"] = input_shape + args.update(kwargs) + return args + + +class _TestClassD(_TestClassA): + @configurable + def __init__(self, input_shape: ShapeSpec, arg1: int, arg2, arg3=3): + assert input_shape == "shape" + super().__init__(arg1, arg2, arg3) + + # _TestClassA.from_config does not have input_shape args. + # Test whether input_shape will be forwarded to __init__ + + +class TestConfigurable(unittest.TestCase): + def testInitWithArgs(self): + _ = _TestClassA(arg1=1, arg2=2, arg3=3) + _ = _TestClassB("shape", arg1=1, arg2=2) + _ = _TestClassC("shape", arg1=1, arg2=2) + _ = _TestClassD("shape", arg1=1, arg2=2, arg3=3) + + def testPatchedAttr(self): + self.assertTrue("Doc" in _TestClassB.__init__.__doc__) + self.assertEqual(_TestClassD.__init__.__annotations__["arg1"], int) + + def testInitWithCfg(self): + cfg = get_cfg() + cfg.ARG1 = 1 + cfg.ARG2 = 2 + cfg.ARG3 = 3 + _ = _TestClassA(cfg) + _ = _TestClassB(cfg, input_shape="shape") + _ = _TestClassC(cfg, input_shape="shape") + _ = _TestClassD(cfg, input_shape="shape") + _ = _LegacySubClass(cfg, input_shape="shape") + _ = _NewSubClassNewInit(cfg, input_shape="shape") + _ = _LegacySubClassNotCfg(cfg, input_shape="shape") + with self.assertRaises(TypeError): + # disallow forwarding positional args to __init__ since it's prone to errors + _ = _TestClassD(cfg, "shape") + + # call with kwargs instead + _ = _TestClassA(cfg=cfg) + _ = _TestClassB(cfg=cfg, input_shape="shape") + _ = _TestClassC(cfg=cfg, input_shape="shape") + _ = _TestClassD(cfg=cfg, input_shape="shape") + _ = _LegacySubClass(cfg=cfg, input_shape="shape") + _ = _NewSubClassNewInit(cfg=cfg, input_shape="shape") + _ = _LegacySubClassNotCfg(config=cfg, input_shape="shape") + + def testInitWithCfgOverwrite(self): + cfg = get_cfg() + cfg.ARG1 = 1 + cfg.ARG2 = 999 # wrong config + with self.assertRaises(AssertionError): + _ = _TestClassA(cfg, arg3=3) + + # overwrite arg2 with correct config later: + _ = _TestClassA(cfg, arg2=2, arg3=3) + _ = _TestClassB(cfg, input_shape="shape", arg2=2, arg3=3) + _ = _TestClassC(cfg, input_shape="shape", arg2=2, arg3=3) + _ = _TestClassD(cfg, input_shape="shape", arg2=2, arg3=3) + + # call with kwargs cfg=cfg instead + _ = _TestClassA(cfg=cfg, arg2=2, arg3=3) + _ = _TestClassB(cfg=cfg, input_shape="shape", arg2=2, arg3=3) + _ = _TestClassC(cfg=cfg, input_shape="shape", arg2=2, arg3=3) + _ = _TestClassD(cfg=cfg, input_shape="shape", arg2=2, arg3=3) + + def 
testInitWithCfgWrongArgs(self): + cfg = get_cfg() + cfg.ARG1 = 1 + cfg.ARG2 = 2 + with self.assertRaises(TypeError): + _ = _TestClassB(cfg, "shape", not_exist=1) + with self.assertRaises(TypeError): + _ = _TestClassC(cfg, "shape", not_exist=1) + with self.assertRaises(TypeError): + _ = _TestClassD(cfg, "shape", not_exist=1) + + def testBadClass(self): + class _BadClass1: + @configurable + def __init__(self, a=1, b=2): + pass + + class _BadClass2: + @configurable + def __init__(self, a=1, b=2): + pass + + def from_config(self, cfg): # noqa + pass + + class _BadClass3: + @configurable + def __init__(self, a=1, b=2): + pass + + # bad name: must be cfg + @classmethod + def from_config(cls, config): # noqa + pass + + with self.assertRaises(AttributeError): + _ = _BadClass1(a=1) + + with self.assertRaises(TypeError): + _ = _BadClass2(a=1) + + with self.assertRaises(TypeError): + _ = _BadClass3(get_cfg()) diff --git a/tests/test_export_caffe2.py b/tests/test_export_caffe2.py new file mode 100644 index 0000000000000000000000000000000000000000..ad989c4a3d11e6675d26ae2690f06d2ffe30d44c --- /dev/null +++ b/tests/test_export_caffe2.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# -*- coding: utf-8 -*- + +import copy +import numpy as np +import os +import tempfile +import unittest +import cv2 +import torch +from fvcore.common.file_io import PathManager + +from detectron2 import model_zoo +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import DatasetCatalog +from detectron2.modeling import build_model +from detectron2.utils.logger import setup_logger + + +@unittest.skipIf(os.environ.get("CIRCLECI"), "Require COCO data and model zoo.") +class TestCaffe2Export(unittest.TestCase): + def setUp(self): + setup_logger() + + def _test_model(self, config_path, device="cpu"): + # requires extra dependencies + from detectron2.export import Caffe2Model, add_export_config, export_caffe2_model + + cfg = get_cfg() + cfg.merge_from_file(model_zoo.get_config_file(config_path)) + cfg = add_export_config(cfg) + cfg.MODEL.DEVICE = device + + model = build_model(cfg) + DetectionCheckpointer(model).load(model_zoo.get_checkpoint_url(config_path)) + + inputs = [{"image": self._get_test_image()}] + c2_model = export_caffe2_model(cfg, model, copy.deepcopy(inputs)) + + with tempfile.TemporaryDirectory(prefix="detectron2_unittest") as d: + c2_model.save_protobuf(d) + c2_model.save_graph(os.path.join(d, "test.svg"), inputs=copy.deepcopy(inputs)) + c2_model = Caffe2Model.load_protobuf(d) + c2_model(inputs)[0]["instances"] + + def _get_test_image(self): + try: + file_name = DatasetCatalog.get("coco_2017_train")[0]["file_name"] + assert PathManager.exists(file_name) + except Exception: + self.skipTest("COCO dataset not available.") + + with PathManager.open(file_name, "rb") as f: + buf = f.read() + img = cv2.imdecode(np.frombuffer(buf, dtype=np.uint8), cv2.IMREAD_COLOR) + assert img is not None, file_name + return torch.from_numpy(img.transpose(2, 0, 1)) + + def testMaskRCNN(self): + self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml") + + @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available") + def testMaskRCNNGPU(self): + self._test_model("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml", device="cuda") + + def testRetinaNet(self): + self._test_model("COCO-Detection/retinanet_R_50_FPN_3x.yaml") + + def testPanopticFPN(self): + 
self._test_model("COCO-PanopticSegmentation/panoptic_fpn_R_50_3x.yaml") diff --git a/tests/test_model_analysis.py b/tests/test_model_analysis.py new file mode 100644 index 0000000000000000000000000000000000000000..0e3f84c9354746fc634aca997abb232424ddebb2 --- /dev/null +++ b/tests/test_model_analysis.py @@ -0,0 +1,58 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. + + +import unittest +import torch + +import detectron2.model_zoo as model_zoo +from detectron2.config import get_cfg +from detectron2.modeling import build_model +from detectron2.utils.analysis import flop_count_operators, parameter_count + + +def get_model_zoo(config_path): + """ + Like model_zoo.get, but do not load any weights (even pretrained) + """ + cfg_file = model_zoo.get_config_file(config_path) + cfg = get_cfg() + cfg.merge_from_file(cfg_file) + if not torch.cuda.is_available(): + cfg.MODEL.DEVICE = "cpu" + return build_model(cfg) + + +class RetinaNetTest(unittest.TestCase): + def setUp(self): + self.model = get_model_zoo("COCO-Detection/retinanet_R_50_FPN_1x.yaml") + + def test_flop(self): + # RetinaNet supports flop-counting with random inputs + inputs = [{"image": torch.rand(3, 800, 800)}] + res = flop_count_operators(self.model, inputs) + self.assertTrue(int(res["conv"]), 146) # 146B flops + + def test_param_count(self): + res = parameter_count(self.model) + self.assertTrue(res[""], 37915572) + self.assertTrue(res["backbone"], 31452352) + + +class FasterRCNNTest(unittest.TestCase): + def setUp(self): + self.model = get_model_zoo("COCO-Detection/faster_rcnn_R_50_FPN_1x.yaml") + + def test_flop(self): + # Faster R-CNN supports flop-counting with random inputs + inputs = [{"image": torch.rand(3, 800, 800)}] + res = flop_count_operators(self.model, inputs) + + # This only checks flops for backbone & proposal generator + # Flops for box head is not conv, and depends on #proposals, which is + # almost 0 for random inputs. + self.assertTrue(int(res["conv"]), 117) + + def test_param_count(self): + res = parameter_count(self.model) + self.assertTrue(res[""], 41699936) + self.assertTrue(res["backbone"], 26799296) diff --git a/tests/test_model_zoo.py b/tests/test_model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..2d16c711af2ab797dab04d0573c2ed70e071ebfd --- /dev/null +++ b/tests/test_model_zoo.py @@ -0,0 +1,29 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +import logging +import unittest + +from detectron2 import model_zoo +from detectron2.modeling import FPN, GeneralizedRCNN + +logger = logging.getLogger(__name__) + + +class TestModelZoo(unittest.TestCase): + def test_get_returns_model(self): + model = model_zoo.get("Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml", trained=False) + self.assertIsInstance(model, GeneralizedRCNN) + self.assertIsInstance(model.backbone, FPN) + + def test_get_invalid_model(self): + self.assertRaises(RuntimeError, model_zoo.get, "Invalid/config.yaml") + + def test_get_url(self): + url = model_zoo.get_checkpoint_url("Misc/scratch_mask_rcnn_R_50_FPN_3x_gn.yaml") + self.assertEqual( + url, + "https://dl.fbaipublicfiles.com/detectron2/Misc/scratch_mask_rcnn_R_50_FPN_3x_gn/138602908/model_final_01ca85.pkl", # noqa + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_visualizer.py b/tests/test_visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..1cdeddc6733e25d882bede48a404a1d52c0845de --- /dev/null +++ b/tests/test_visualizer.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# File: + +import numpy as np +import unittest +import torch + +from detectron2.data import MetadataCatalog +from detectron2.structures import BoxMode, Instances, RotatedBoxes +from detectron2.utils.visualizer import Visualizer + + +class TestVisualizer(unittest.TestCase): + def _random_data(self): + H, W = 100, 100 + N = 10 + img = np.random.rand(H, W, 3) * 255 + boxxy = np.random.rand(N, 2) * (H // 2) + boxes = np.concatenate((boxxy, boxxy + H // 2), axis=1) + + def _rand_poly(): + return np.random.rand(3, 2).flatten() * H + + polygons = [[_rand_poly() for _ in range(np.random.randint(1, 5))] for _ in range(N)] + + mask = np.zeros_like(img[:, :, 0], dtype=np.bool) + mask[:10, 10:20] = 1 + + labels = [str(i) for i in range(N)] + return img, boxes, labels, polygons, [mask] * N + + @property + def metadata(self): + return MetadataCatalog.get("coco_2017_train") + + def test_draw_dataset_dict(self): + img = np.random.rand(512, 512, 3) * 255 + dic = { + "annotations": [ + { + "bbox": [ + 368.9946492271106, + 330.891438763377, + 13.148537455410235, + 13.644708680142685, + ], + "bbox_mode": BoxMode.XYWH_ABS, + "category_id": 0, + "iscrowd": 1, + "segmentation": { + "counts": "_jh52m?2N2N2N2O100O10O001N1O2MceP2", + "size": [512, 512], + }, + } + ], + "height": 512, + "image_id": 1, + "width": 512, + } + v = Visualizer(img, self.metadata) + v.draw_dataset_dict(dic) + + def test_overlay_instances(self): + img, boxes, labels, polygons, masks = self._random_data() + + v = Visualizer(img, self.metadata) + output = v.overlay_instances(masks=polygons, boxes=boxes, labels=labels).get_image() + self.assertEqual(output.shape, img.shape) + + # Test 2x scaling + v = Visualizer(img, self.metadata, scale=2.0) + output = v.overlay_instances(masks=polygons, boxes=boxes, labels=labels).get_image() + self.assertEqual(output.shape[0], img.shape[0] * 2) + + # Test overlay masks + v = Visualizer(img, self.metadata) + output = v.overlay_instances(masks=masks, boxes=boxes, labels=labels).get_image() + self.assertEqual(output.shape, img.shape) + + def test_overlay_instances_no_boxes(self): + img, boxes, labels, polygons, _ = self._random_data() + v = Visualizer(img, self.metadata) + v.overlay_instances(masks=polygons, boxes=None, labels=labels).get_image() + + def test_draw_instance_predictions(self): + img, boxes, _, _, masks = 
self._random_data() + num_inst = len(boxes) + inst = Instances((img.shape[0], img.shape[1])) + inst.pred_classes = torch.randint(0, 80, size=(num_inst,)) + inst.scores = torch.rand(num_inst) + inst.pred_boxes = torch.from_numpy(boxes) + inst.pred_masks = torch.from_numpy(np.asarray(masks)) + + v = Visualizer(img, self.metadata) + v.draw_instance_predictions(inst) + + def test_draw_empty_mask_predictions(self): + img, boxes, _, _, masks = self._random_data() + num_inst = len(boxes) + inst = Instances((img.shape[0], img.shape[1])) + inst.pred_classes = torch.randint(0, 80, size=(num_inst,)) + inst.scores = torch.rand(num_inst) + inst.pred_boxes = torch.from_numpy(boxes) + inst.pred_masks = torch.from_numpy(np.zeros_like(np.asarray(masks))) + + v = Visualizer(img, self.metadata) + v.draw_instance_predictions(inst) + + def test_correct_output_shape(self): + img = np.random.rand(928, 928, 3) * 255 + v = Visualizer(img, self.metadata) + out = v.output.get_image() + self.assertEqual(out.shape, img.shape) + + def test_overlay_rotated_instances(self): + H, W = 100, 150 + img = np.random.rand(H, W, 3) * 255 + num_boxes = 50 + boxes_5d = torch.zeros(num_boxes, 5) + boxes_5d[:, 0] = torch.FloatTensor(num_boxes).uniform_(-0.1 * W, 1.1 * W) + boxes_5d[:, 1] = torch.FloatTensor(num_boxes).uniform_(-0.1 * H, 1.1 * H) + boxes_5d[:, 2] = torch.FloatTensor(num_boxes).uniform_(0, max(W, H)) + boxes_5d[:, 3] = torch.FloatTensor(num_boxes).uniform_(0, max(W, H)) + boxes_5d[:, 4] = torch.FloatTensor(num_boxes).uniform_(-1800, 1800) + rotated_boxes = RotatedBoxes(boxes_5d) + labels = [str(i) for i in range(num_boxes)] + + v = Visualizer(img, self.metadata) + output = v.overlay_instances(boxes=rotated_boxes, labels=labels).get_image() + self.assertEqual(output.shape, img.shape) + + def test_draw_no_metadata(self): + img, boxes, _, _, masks = self._random_data() + num_inst = len(boxes) + inst = Instances((img.shape[0], img.shape[1])) + inst.pred_classes = torch.randint(0, 80, size=(num_inst,)) + inst.scores = torch.rand(num_inst) + inst.pred_boxes = torch.from_numpy(boxes) + inst.pred_masks = torch.from_numpy(np.asarray(masks)) + + v = Visualizer(img, MetadataCatalog.get("asdfasdf")) + v.draw_instance_predictions(inst) diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3733863970218bf8bdf9b32420163f4c858e209e --- /dev/null +++ b/tools/README.md @@ -0,0 +1,45 @@ + +This directory contains a few scripts that use detectron2. + + +* `train_net.py` + +An example training script that's made to train builtin models of detectron2. + +For usage, see [GETTING_STARTED.md](../GETTING_STARTED.md). + +* `plain_train_net.py` + +Similar to `train_net.py`, but implements a training loop instead of using `Trainer`. +This script includes fewer features but it may be more friendly to hackers. + +* `benchmark.py` + +Benchmark the training speed, inference speed or data loading speed of a given config. + +Usage: +``` +python benchmark.py --config-file config.yaml --task train/eval/data [optional DDP flags] +``` + +* `visualize_json_results.py` + +Visualize the json instance detection/segmentation results dumped by `COCOEvalutor` or `LVISEvaluator` + +Usage: +``` +python visualize_json_results.py --input x.json --output dir/ --dataset coco_2017_val +``` +If not using a builtin dataset, you'll need your own script or modify this script. + +* `visualize_data.py` + +Visualize ground truth raw annotations or training data (after preprocessing/augmentations). 
+ +Usage: +``` +python visualize_data.py --config-file config.yaml --source annotation/dataloader --output-dir dir/ [--show] +``` + +NOTE: the script does not stop by itself when using `--source dataloader` because a training +dataloader is usually infinite. diff --git a/tools/analyze_model.py b/tools/analyze_model.py new file mode 100755 index 0000000000000000000000000000000000000000..9c06ea4b5fbfd551d85702171976f9bc33f2e275 --- /dev/null +++ b/tools/analyze_model.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import logging +import numpy as np +from collections import Counter +import tqdm + +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import build_detection_test_loader +from detectron2.engine import default_argument_parser +from detectron2.modeling import build_model +from detectron2.utils.analysis import ( + activation_count_operators, + flop_count_operators, + parameter_count_table, +) +from detectron2.utils.logger import setup_logger + +logger = logging.getLogger("detectron2") + + +def setup(args): + cfg = get_cfg() + cfg.merge_from_file(args.config_file) + cfg.DATALOADER.NUM_WORKERS = 0 + cfg.merge_from_list(args.opts) + cfg.freeze() + setup_logger() + return cfg + + +def do_flop(cfg): + data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) + model = build_model(cfg) + DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) + model.eval() + + counts = Counter() + total_flops = [] + for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa + count = flop_count_operators(model, data) + counts += count + total_flops.append(sum(count.values())) + logger.info( + "(G)Flops for Each Type of Operators:\n" + str([(k, v / idx) for k, v in counts.items()]) + ) + logger.info("Total (G)Flops: {}±{}".format(np.mean(total_flops), np.std(total_flops))) + + +def do_activation(cfg): + data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) + model = build_model(cfg) + DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) + model.eval() + + counts = Counter() + total_activations = [] + for idx, data in zip(tqdm.trange(args.num_inputs), data_loader): # noqa + count = activation_count_operators(model, data) + counts += count + total_activations.append(sum(count.values())) + logger.info( + "(Million) Activations for Each Type of Operators:\n" + + str([(k, v / idx) for k, v in counts.items()]) + ) + logger.info( + "Total (Million) Activations: {}±{}".format( + np.mean(total_activations), np.std(total_activations) + ) + ) + + +def do_parameter(cfg): + model = build_model(cfg) + logger.info("Parameter Count:\n" + parameter_count_table(model, max_depth=5)) + + +def do_structure(cfg): + model = build_model(cfg) + logger.info("Model Structure:\n" + str(model)) + + +if __name__ == "__main__": + parser = default_argument_parser( + epilog=""" +Examples: + +To show parameters of a model: +$ ./analyze_model.py --tasks parameter \\ + --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml + +Flops and activations are data-dependent, therefore inputs and model weights +are needed to count them: + +$ ./analyze_model.py --num-inputs 100 --tasks flop \\ + --config-file ../configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml \\ + MODEL.WEIGHTS /path/to/model.pkl +""" + ) + parser.add_argument( + "--tasks", + choices=["flop", "activation", "parameter", "structure"], + required=True, + nargs="+", + ) + 
parser.add_argument( + "--num-inputs", + default=100, + type=int, + help="number of inputs used to compute statistics for flops/activations, " + "both are data dependent.", + ) + args = parser.parse_args() + assert not args.eval_only + assert args.num_gpus == 1 + + cfg = setup(args) + + for task in args.tasks: + { + "flop": do_flop, + "activation": do_activation, + "parameter": do_parameter, + "structure": do_structure, + }[task](cfg) diff --git a/tools/benchmark.py b/tools/benchmark.py new file mode 100755 index 0000000000000000000000000000000000000000..9eec59f476882e4045ec3c682ffe515413a3be15 --- /dev/null +++ b/tools/benchmark.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +A script to benchmark builtin models. + +Note: this script has an extra dependency of psutil. +""" + +import itertools +import logging +import psutil +import torch +import tqdm +from fvcore.common.timer import Timer +from torch.nn.parallel import DistributedDataParallel + +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import ( + DatasetFromList, + build_detection_test_loader, + build_detection_train_loader, +) +from detectron2.engine import SimpleTrainer, default_argument_parser, hooks, launch +from detectron2.modeling import build_model +from detectron2.solver import build_optimizer +from detectron2.utils import comm +from detectron2.utils.events import CommonMetricPrinter +from detectron2.utils.logger import setup_logger + +logger = logging.getLogger("detectron2") + + +def setup(args): + cfg = get_cfg() + cfg.merge_from_file(args.config_file) + cfg.SOLVER.BASE_LR = 0.001 # Avoid NaNs. Not useful in this script anyway. + cfg.merge_from_list(args.opts) + cfg.freeze() + setup_logger(distributed_rank=comm.get_rank()) + return cfg + + +def benchmark_data(args): + cfg = setup(args) + + timer = Timer() + dataloader = build_detection_train_loader(cfg) + logger.info("Initialize loader using {} seconds.".format(timer.seconds())) + + timer.reset() + itr = iter(dataloader) + for i in range(10): # warmup + next(itr) + if i == 0: + startup_time = timer.seconds() + timer = Timer() + max_iter = 1000 + for _ in tqdm.trange(max_iter): + next(itr) + logger.info( + "{} iters ({} images) in {} seconds.".format( + max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds() + ) + ) + logger.info("Startup time: {} seconds".format(startup_time)) + vram = psutil.virtual_memory() + logger.info( + "RAM Usage: {:.2f}/{:.2f} GB".format( + (vram.total - vram.available) / 1024 ** 3, vram.total / 1024 ** 3 + ) + ) + + # test for a few more rounds + for _ in range(10): + timer = Timer() + max_iter = 1000 + for _ in tqdm.trange(max_iter): + next(itr) + logger.info( + "{} iters ({} images) in {} seconds.".format( + max_iter, max_iter * cfg.SOLVER.IMS_PER_BATCH, timer.seconds() + ) + ) + + +def benchmark_train(args): + cfg = setup(args) + model = build_model(cfg) + logger.info("Model:\n{}".format(model)) + if comm.get_world_size() > 1: + model = DistributedDataParallel( + model, device_ids=[comm.get_local_rank()], broadcast_buffers=False + ) + optimizer = build_optimizer(cfg, model) + checkpointer = DetectionCheckpointer(model, optimizer=optimizer) + checkpointer.load(cfg.MODEL.WEIGHTS) + + cfg.defrost() + cfg.DATALOADER.NUM_WORKERS = 0 + data_loader = build_detection_train_loader(cfg) + dummy_data = list(itertools.islice(data_loader, 100)) + + def f(): + data = DatasetFromList(dummy_data, copy=False) + 
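+        # Cycle over the 100 cached batches forever (NUM_WORKERS is set to 0 above),
+        # so the measured training speed is not bottlenecked by the real dataloader.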
while True: + yield from data + + max_iter = 400 + trainer = SimpleTrainer(model, f(), optimizer) + trainer.register_hooks( + [hooks.IterationTimer(), hooks.PeriodicWriter([CommonMetricPrinter(max_iter)])] + ) + trainer.train(1, max_iter) + + +@torch.no_grad() +def benchmark_eval(args): + cfg = setup(args) + model = build_model(cfg) + model.eval() + logger.info("Model:\n{}".format(model)) + DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS) + + cfg.defrost() + cfg.DATALOADER.NUM_WORKERS = 0 + data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) + dummy_data = list(itertools.islice(data_loader, 100)) + + def f(): + while True: + yield from DatasetFromList(dummy_data, copy=False) + + for _ in range(5): # warmup + model(dummy_data[0]) + + max_iter = 400 + timer = Timer() + with tqdm.tqdm(total=max_iter) as pbar: + for idx, d in enumerate(f()): + if idx == max_iter: + break + model(d) + pbar.update() + logger.info("{} iters in {} seconds.".format(max_iter, timer.seconds())) + + +if __name__ == "__main__": + parser = default_argument_parser() + parser.add_argument("--task", choices=["train", "eval", "data"], required=True) + args = parser.parse_args() + assert not args.eval_only + + if args.task == "data": + f = benchmark_data + elif args.task == "train": + """ + Note: training speed may not be representative. + The training cost of a R-CNN model varies with the content of the data + and the quality of the model. + """ + f = benchmark_train + elif args.task == "eval": + f = benchmark_eval + # only benchmark single-GPU inference. + assert args.num_gpus == 1 and args.num_machines == 1 + launch(f, args.num_gpus, args.num_machines, args.machine_rank, args.dist_url, args=(args,)) diff --git a/tools/convert-torchvision-to-d2.py b/tools/convert-torchvision-to-d2.py new file mode 100755 index 0000000000000000000000000000000000000000..18a24e4ef96d34a4a0d1f43debc2276260da1a2b --- /dev/null +++ b/tools/convert-torchvision-to-d2.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import pickle as pkl +import sys +import torch + +""" +Usage: + # download one of the ResNet{18,34,50,101,152} models from torchvision: + wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth + # run the conversion + ./convert-torchvision-to-d2.py r50.pth r50.pkl + + # Then, use r50.pkl with the following changes in config: + +MODEL: + WEIGHTS: "/path/to/r50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False +INPUT: + FORMAT: "RGB" + + These models typically produce slightly worse results than the + pre-trained ResNets we use in official configs, which are the + original ResNet models released by MSRA. +""" + +if __name__ == "__main__": + input = sys.argv[1] + + obj = torch.load(input, map_location="cpu") + + newmodel = {} + for k in list(obj.keys()): + old_k = k + if "layer" not in k: + k = "stem." 
+ k + for t in [1, 2, 3, 4]: + k = k.replace("layer{}".format(t), "res{}".format(t + 1)) + for t in [1, 2, 3]: + k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) + k = k.replace("downsample.0", "shortcut") + k = k.replace("downsample.1", "shortcut.norm") + print(old_k, "->", k) + newmodel[k] = obj.pop(old_k).detach().numpy() + + res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} + + with open(sys.argv[2], "wb") as f: + pkl.dump(res, f) + if obj: + print("Unconverted keys:", obj.keys()) diff --git a/tools/deploy/CMakeLists.txt b/tools/deploy/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c3ca7a33cd1dcb4e7d1359d66a1c4aa4fb4d97e --- /dev/null +++ b/tools/deploy/CMakeLists.txt @@ -0,0 +1,21 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# See https://pytorch.org/tutorials/advanced/cpp_frontend.html +cmake_minimum_required(VERSION 3.0 FATAL_ERROR) +project(caffe2_mask_rcnn) + +find_package(Torch REQUIRED) +find_package(gflags REQUIRED) +find_package(OpenCV REQUIRED) + +add_executable(caffe2_mask_rcnn caffe2_mask_rcnn.cpp) +target_link_libraries( + caffe2_mask_rcnn + "${TORCH_LIBRARIES}" gflags glog ${OpenCV_LIBS}) +set_property(TARGET caffe2_mask_rcnn PROPERTY CXX_STANDARD 14) + + +add_executable(torchscript_traced_mask_rcnn torchscript_traced_mask_rcnn.cpp) +target_link_libraries( + torchscript_traced_mask_rcnn + "${TORCH_LIBRARIES}" ${OpenCV_LIBS}) +set_property(TARGET torchscript_traced_mask_rcnn PROPERTY CXX_STANDARD 14) diff --git a/tools/deploy/README.md b/tools/deploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b9d5b15512c0bd160accbb1823236b8954a37b86 --- /dev/null +++ b/tools/deploy/README.md @@ -0,0 +1,9 @@ + +This directory contains: + +1. A script that converts a detectron2 model to caffe2 format. + +2. An example that loads a Mask R-CNN model in caffe2 format and runs inference. + +See [tutorial](https://detectron2.readthedocs.io/tutorials/deployment.html) +for their usage. diff --git a/tools/deploy/caffe2_converter.py b/tools/deploy/caffe2_converter.py new file mode 100755 index 0000000000000000000000000000000000000000..08feb69fba090a302d1624d52d146ac7a0787223 --- /dev/null +++ b/tools/deploy/caffe2_converter.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +import argparse +import os +import onnx +import torch + +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import build_detection_test_loader +from detectron2.evaluation import COCOEvaluator, inference_on_dataset, print_csv_format +from detectron2.export import Caffe2Tracer, add_export_config +from detectron2.modeling import build_model +from detectron2.utils.logger import setup_logger + + +def setup_cfg(args): + cfg = get_cfg() + # cuda context is initialized before creating dataloader, so we don't fork anymore + cfg.DATALOADER.NUM_WORKERS = 0 + cfg = add_export_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + if cfg.MODEL.DEVICE != "cpu": + TORCH_VERSION = tuple(int(x) for x in torch.__version__.split(".")[:2]) + assert TORCH_VERSION >= (1, 5), "PyTorch>=1.5 required for GPU conversion!" 
+ return cfg + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Convert a model using caffe2 tracing.") + parser.add_argument( + "--format", + choices=["caffe2", "onnx", "torchscript"], + help="output format", + default="caffe2", + ) + parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") + parser.add_argument("--run-eval", action="store_true") + parser.add_argument("--output", help="output directory for the converted model") + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + args = parser.parse_args() + logger = setup_logger() + logger.info("Command line arguments: " + str(args)) + os.makedirs(args.output, exist_ok=True) + + cfg = setup_cfg(args) + + # create a torch model + torch_model = build_model(cfg) + DetectionCheckpointer(torch_model).resume_or_load(cfg.MODEL.WEIGHTS) + + # get a sample data + data_loader = build_detection_test_loader(cfg, cfg.DATASETS.TEST[0]) + first_batch = next(iter(data_loader)) + + # convert and save caffe2 model + tracer = Caffe2Tracer(cfg, torch_model, first_batch) + if args.format == "caffe2": + caffe2_model = tracer.export_caffe2() + caffe2_model.save_protobuf(args.output) + # draw the caffe2 graph + caffe2_model.save_graph(os.path.join(args.output, "model.svg"), inputs=first_batch) + elif args.format == "onnx": + onnx_model = tracer.export_onnx() + onnx.save(onnx_model, os.path.join(args.output, "model.onnx")) + elif args.format == "torchscript": + script_model = tracer.export_torchscript() + script_model.save(os.path.join(args.output, "model.ts")) + + # Recursively print IR of all modules + with open(os.path.join(args.output, "model_ts_IR.txt"), "w") as f: + try: + f.write(script_model._actual_script_module._c.dump_to_str(True, False, False)) + except AttributeError: + pass + # Print IR of the entire graph (all submodules inlined) + with open(os.path.join(args.output, "model_ts_IR_inlined.txt"), "w") as f: + f.write(str(script_model.inlined_graph)) + # Print the model structure in pytorch style + with open(os.path.join(args.output, "model.txt"), "w") as f: + f.write(str(script_model)) + + # run evaluation with the converted model + if args.run_eval: + assert args.format == "caffe2", "Python inference in other format is not yet supported." + dataset = cfg.DATASETS.TEST[0] + data_loader = build_detection_test_loader(cfg, dataset) + # NOTE: hard-coded evaluator. change to the evaluator for your dataset + evaluator = COCOEvaluator(dataset, cfg, True, args.output) + metrics = inference_on_dataset(caffe2_model, data_loader, evaluator) + print_csv_format(metrics) diff --git a/tools/deploy/caffe2_mask_rcnn.cpp b/tools/deploy/caffe2_mask_rcnn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..44370b4c518408f1f46345c7e3ac07c7db63a485 --- /dev/null +++ b/tools/deploy/caffe2_mask_rcnn.cpp @@ -0,0 +1,119 @@ +// Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved
+
+#include <c10/util/Flags.h>
+#include <caffe2/core/blob.h>
+#include <caffe2/core/common.h>
+#include <caffe2/core/init.h>
+#include <caffe2/core/net.h>
+#include <caffe2/core/workspace.h>
+#include <caffe2/utils/proto_utils.h>
+
+#include <opencv2/opencv.hpp>
+#include <cassert>
+#include <chrono>
+#include <iostream>
+#include <string>
+
+C10_DEFINE_string(predict_net, "", "path to model.pb");
+C10_DEFINE_string(init_net, "", "path to model_init.pb");
+C10_DEFINE_string(input, "", "path to input image");
+
+using namespace std;
+using namespace caffe2;
+
+int main(int argc, char** argv) {
+  caffe2::GlobalInit(&argc, &argv);
+  string predictNetPath = FLAGS_predict_net;
+  string initNetPath = FLAGS_init_net;
+  cv::Mat input = cv::imread(FLAGS_input, cv::IMREAD_COLOR);
+
+  const int height = input.rows;
+  const int width = input.cols;
+  // FPN models require divisibility of 32
+  assert(height % 32 == 0 && width % 32 == 0);
+  const int batch = 1;
+  const int channels = 3;
+
+  // initialize Net and Workspace
+  caffe2::NetDef initNet_, predictNet_;
+  CAFFE_ENFORCE(ReadProtoFromFile(initNetPath, &initNet_));
+  CAFFE_ENFORCE(ReadProtoFromFile(predictNetPath, &predictNet_));
+
+  Workspace workSpace;
+  for (auto& str : predictNet_.external_input()) {
+    workSpace.CreateBlob(str);
+  }
+  CAFFE_ENFORCE(workSpace.CreateNet(predictNet_));
+  CAFFE_ENFORCE(workSpace.RunNetOnce(initNet_));
+
+  // setup inputs
+  auto data = BlobGetMutableTensor(workSpace.GetBlob("data"), caffe2::CPU);
+  data->Resize(batch, channels, height, width);
+  float* ptr = data->mutable_data<float>();
+  // HWC to CHW
+  for (int c = 0; c < 3; ++c) {
+    for (int i = 0; i < height * width; ++i) {
+      ptr[c * height * width + i] = static_cast<float>(input.data[3 * i + c]);
+    }
+  }
+
+  auto im_info =
+      BlobGetMutableTensor(workSpace.GetBlob("im_info"), caffe2::CPU);
+  im_info->Resize(batch, 3);
+  float* im_info_ptr = im_info->mutable_data<float>();
+  im_info_ptr[0] = height;
+  im_info_ptr[1] = width;
+  im_info_ptr[2] = 1.0;
+
+  // run the network
+  CAFFE_ENFORCE(workSpace.RunNet(predictNet_.name()));
+
+  // run 3 more times to benchmark
+  int N_benchmark = 3;
+  auto start_time = chrono::high_resolution_clock::now();
+  for (int i = 0; i < N_benchmark; ++i) {
+    CAFFE_ENFORCE(workSpace.RunNet(predictNet_.name()));
+  }
+  auto end_time = chrono::high_resolution_clock::now();
+  auto ms = chrono::duration_cast<chrono::microseconds>(end_time - start_time)
+                .count();
+  cout << "Latency (should vary with different inputs): "
+       << ms * 1.0 / 1e6 / N_benchmark << " seconds" << endl;
+
+  // parse Mask R-CNN outputs
+  caffe2::Tensor bbox(
+      workSpace.GetBlob("bbox_nms")->Get<caffe2::Tensor>(), caffe2::CPU);
+  caffe2::Tensor scores(
+      workSpace.GetBlob("score_nms")->Get<caffe2::Tensor>(), caffe2::CPU);
+  caffe2::Tensor labels(
+      workSpace.GetBlob("class_nms")->Get<caffe2::Tensor>(), caffe2::CPU);
+  caffe2::Tensor mask_probs(
+      workSpace.GetBlob("mask_fcn_probs")->Get<caffe2::Tensor>(), caffe2::CPU);
+  cout << "bbox:" << bbox.DebugString() << endl;
+  cout << "scores:" << scores.DebugString() << endl;
+  cout << "labels:" << labels.DebugString() << endl;
+  cout << "mask_probs: " << mask_probs.DebugString() << endl;
+
+  int num_instances = bbox.sizes()[0];
+  for (int i = 0; i < num_instances; ++i) {
+    float score = scores.data<float>()[i];
+    if (score < 0.6)
+      continue; // skip them
+
+    const float* box = bbox.data<float>() + i * 4;
+    int label = labels.data<float>()[i];
+
+    cout << "Prediction " << i << ", xyxy=(";
+    cout << box[0] << ", " << box[1] << ", " << box[2] << ", " << box[3]
+         << "); score=" << score << "; label=" << label << endl;
+
+    const float* mask = mask_probs.data<float>() +
+        i * mask_probs.size_from_dim(1) + label * mask_probs.size_from_dim(2);
+
+    // save the 28x28 mask
+    cv::Mat cv_mask(28, 28, CV_32FC1);
+    memcpy(cv_mask.data, mask, 28 * 28 * sizeof(float));
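+    // cv_mask holds soft mask probabilities in [0, 1]; scaling by 255 below
+    // produces a viewable 8-bit grayscale image.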
+    cv::imwrite("mask" + std::to_string(i) + ".png", cv_mask * 255.);
+  }
+  return 0;
+}
diff --git a/tools/deploy/torchscript_traced_mask_rcnn.cpp b/tools/deploy/torchscript_traced_mask_rcnn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..82fbdb052fa53543920bf8169a05982005e30cc5 --- /dev/null +++ b/tools/deploy/torchscript_traced_mask_rcnn.cpp @@ -0,0 +1,71 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
+
+#include <opencv2/opencv.hpp>
+#include <iostream>
+#include <string>
+
+#include <torch/csrc/autograd/grad_mode.h>
+#include <torch/script.h>
+
+using namespace std;
+
+// experimental. don't use
+int main(int argc, const char* argv[]) {
+  if (argc != 3) {
+    return 1;
+  }
+  std::string image_file = argv[2];
+
+  torch::autograd::AutoGradMode guard(false);
+  auto module = torch::jit::load(argv[1]);
+
+  assert(module.buffers().size() > 0);
+  // Assume that the entire model is on the same device.
+  // We just put input to this device.
+  auto device = (*begin(module.buffers())).device();
+
+  cv::Mat input_img = cv::imread(image_file, cv::IMREAD_COLOR);
+  const int height = input_img.rows;
+  const int width = input_img.cols;
+  // FPN models require divisibility of 32
+  assert(height % 32 == 0 && width % 32 == 0);
+  const int channels = 3;
+
+  auto input = torch::from_blob(
+      input_img.data, {1, height, width, channels}, torch::kUInt8);
+  // NHWC to NCHW
+  input = input.to(device, torch::kFloat).permute({0, 3, 1, 2}).contiguous();
+
+  std::array<float, 3> im_info_data{height * 1.0f, width * 1.0f, 1.0f};
+  auto im_info = torch::from_blob(im_info_data.data(), {1, 3}).to(device);
+
+  // run the network
+  auto output = module.forward({std::make_tuple(input, im_info)});
+
+  // run 3 more times to benchmark
+  int N_benchmark = 3;
+  auto start_time = chrono::high_resolution_clock::now();
+  for (int i = 0; i < N_benchmark; ++i) {
+    output = module.forward({std::make_tuple(input, im_info)});
+  }
+  auto end_time = chrono::high_resolution_clock::now();
+  auto ms = chrono::duration_cast<chrono::microseconds>(end_time - start_time)
+                .count();
+  cout << "Latency (should vary with different inputs): "
+       << ms * 1.0 / 1e6 / N_benchmark << " seconds" << endl;
+
+  auto outputs = output.toTuple()->elements();
+  // parse Mask R-CNN outputs
+  auto bbox = outputs[0].toTensor(), scores = outputs[1].toTensor(),
+       labels = outputs[2].toTensor(), mask_probs = outputs[3].toTensor();
+
+  cout << "bbox: " << bbox.toString() << " " << bbox.sizes() << endl;
+  cout << "scores: " << scores.toString() << " " << scores.sizes() << endl;
+  cout << "labels: " << labels.toString() << " " << labels.sizes() << endl;
+  cout << "mask_probs: " << mask_probs.toString() << " " << mask_probs.sizes()
+       << endl;
+
+  int num_instances = bbox.sizes()[0];
+  cout << bbox << endl;
+  return 0;
+}
diff --git a/tools/plain_train_net.py b/tools/plain_train_net.py new file mode 100755 index 0000000000000000000000000000000000000000..dbfa325b0f5a86baf5eda69af598178ad209b927 --- /dev/null +++ b/tools/plain_train_net.py @@ -0,0 +1,237 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+"""
+Detectron2 training script with a plain training loop.
+
+This script reads a given config file and runs the training or evaluation.
+It is an entry point that is able to train standard models in detectron2.
+
+In order to let one script support training of many models,
+this script contains logic that is specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+ +Therefore, we recommend you to use detectron2 as a library and take +this file as an example of how to use the library. +You may want to write your own script with your datasets and other customizations. + +Compared to "train_net.py", this script supports fewer default features. +It also includes fewer abstraction, therefore is easier to add custom logic. +""" + +import logging +import os +from collections import OrderedDict +import torch +from torch.nn.parallel import DistributedDataParallel + +import detectron2.utils.comm as comm +from detectron2.checkpoint import DetectionCheckpointer, PeriodicCheckpointer +from detectron2.config import get_cfg +from detectron2.data import ( + MetadataCatalog, + build_detection_test_loader, + build_detection_train_loader, +) +from detectron2.engine import default_argument_parser, default_setup, launch +from detectron2.evaluation import ( + CityscapesInstanceEvaluator, + CityscapesSemSegEvaluator, + COCOEvaluator, + COCOPanopticEvaluator, + DatasetEvaluators, + LVISEvaluator, + PascalVOCDetectionEvaluator, + SemSegEvaluator, + inference_on_dataset, + print_csv_format, +) +from detectron2.modeling import build_model +from detectron2.solver import build_lr_scheduler, build_optimizer +from detectron2.utils.events import ( + CommonMetricPrinter, + EventStorage, + JSONWriter, + TensorboardXWriter, +) + +logger = logging.getLogger("detectron2") + + +def get_evaluator(cfg, dataset_name, output_folder=None): + """ + Create evaluator(s) for a given dataset. + This uses the special metadata "evaluator_type" associated with each builtin dataset. + For your own dataset, you can simply create an evaluator manually in your + script and do not have to worry about the hacky if-else logic here. + """ + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + evaluator_list = [] + evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type + if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: + evaluator_list.append( + SemSegEvaluator( + dataset_name, + distributed=True, + num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + output_dir=output_folder, + ) + ) + if evaluator_type in ["coco", "coco_panoptic_seg"]: + evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder)) + if evaluator_type == "coco_panoptic_seg": + evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) + if evaluator_type == "cityscapes_instance": + assert ( + torch.cuda.device_count() >= comm.get_rank() + ), "CityscapesEvaluator currently do not work with multiple machines." + return CityscapesInstanceEvaluator(dataset_name) + if evaluator_type == "cityscapes_sem_seg": + assert ( + torch.cuda.device_count() >= comm.get_rank() + ), "CityscapesEvaluator currently do not work with multiple machines." 
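+        # The Cityscapes evaluators write intermediate prediction files to
+        # machine-local storage, hence the single-machine restriction above.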
+ return CityscapesSemSegEvaluator(dataset_name) + if evaluator_type == "pascal_voc": + return PascalVOCDetectionEvaluator(dataset_name) + if evaluator_type == "lvis": + return LVISEvaluator(dataset_name, cfg, True, output_folder) + if len(evaluator_list) == 0: + raise NotImplementedError( + "no Evaluator for the dataset {} with the type {}".format(dataset_name, evaluator_type) + ) + if len(evaluator_list) == 1: + return evaluator_list[0] + return DatasetEvaluators(evaluator_list) + + +def do_test(cfg, model): + results = OrderedDict() + for dataset_name in cfg.DATASETS.TEST: + data_loader = build_detection_test_loader(cfg, dataset_name) + evaluator = get_evaluator( + cfg, dataset_name, os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) + ) + results_i = inference_on_dataset(model, data_loader, evaluator) + results[dataset_name] = results_i + if comm.is_main_process(): + logger.info("Evaluation results for {} in csv format:".format(dataset_name)) + print_csv_format(results_i) + if len(results) == 1: + results = list(results.values())[0] + return results + + +def do_train(cfg, model, resume=False): + model.train() + optimizer = build_optimizer(cfg, model) + scheduler = build_lr_scheduler(cfg, optimizer) + + checkpointer = DetectionCheckpointer( + model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler + ) + start_iter = ( + checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1 + ) + max_iter = cfg.SOLVER.MAX_ITER + + periodic_checkpointer = PeriodicCheckpointer( + checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter + ) + + writers = ( + [ + CommonMetricPrinter(max_iter), + JSONWriter(os.path.join(cfg.OUTPUT_DIR, "metrics.json")), + TensorboardXWriter(cfg.OUTPUT_DIR), + ] + if comm.is_main_process() + else [] + ) + + # compared to "train_net.py", we do not support accurate timing and + # precise BN here, because they are not trivial to implement + data_loader = build_detection_train_loader(cfg) + logger.info("Starting training from iteration {}".format(start_iter)) + with EventStorage(start_iter) as storage: + for data, iteration in zip(data_loader, range(start_iter, max_iter)): + iteration = iteration + 1 + storage.step() + + loss_dict = model(data) + losses = sum(loss_dict.values()) + assert torch.isfinite(losses).all(), loss_dict + + loss_dict_reduced = {k: v.item() for k, v in comm.reduce_dict(loss_dict).items()} + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + if comm.is_main_process(): + storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) + + optimizer.zero_grad() + losses.backward() + optimizer.step() + storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) + scheduler.step() + + if ( + cfg.TEST.EVAL_PERIOD > 0 + and iteration % cfg.TEST.EVAL_PERIOD == 0 + and iteration != max_iter + ): + do_test(cfg, model) + # Compared to "train_net.py", the test results are not dumped to EventStorage + comm.synchronize() + + if iteration - start_iter > 5 and (iteration % 20 == 0 or iteration == max_iter): + for writer in writers: + writer.write() + periodic_checkpointer.step(iteration) + + +def setup(args): + """ + Create configs and perform basic setups. 
+ """ + cfg = get_cfg() + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup( + cfg, args + ) # if you don't like any of the default setup, write your own setup code + return cfg + + +def main(args): + cfg = setup(args) + + model = build_model(cfg) + logger.info("Model:\n{}".format(model)) + if args.eval_only: + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + return do_test(cfg, model) + + distributed = comm.get_world_size() > 1 + if distributed: + model = DistributedDataParallel( + model, device_ids=[comm.get_local_rank()], broadcast_buffers=False + ) + + do_train(cfg, model, resume=args.resume) + return do_test(cfg, model) + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/tools/train_net.py b/tools/train_net.py new file mode 100755 index 0000000000000000000000000000000000000000..da8c7e91fc56c7f80296402f38605321ba105324 --- /dev/null +++ b/tools/train_net.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Detection Training Script. + +This scripts reads a given config file and runs the training or evaluation. +It is an entry point that is made to train standard models in detectron2. + +In order to let one script support training of many models, +this script contains logic that are specific to these built-in models and therefore +may not be suitable for your own project. +For example, your research project perhaps only needs a single "evaluator". + +Therefore, we recommend you to use detectron2 as an library and take +this file as an example of how to use the library. +You may want to write your own script with your datasets and other customizations. +""" + +import logging +import os +from collections import OrderedDict +import torch + +import detectron2.utils.comm as comm +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import MetadataCatalog +from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, hooks, launch +from detectron2.evaluation import ( + CityscapesInstanceEvaluator, + CityscapesSemSegEvaluator, + COCOEvaluator, + COCOPanopticEvaluator, + DatasetEvaluators, + LVISEvaluator, + PascalVOCDetectionEvaluator, + SemSegEvaluator, + verify_results, +) +from detectron2.modeling import GeneralizedRCNNWithTTA + + +class Trainer(DefaultTrainer): + """ + We use the "DefaultTrainer" which contains pre-defined default logic for + standard training workflow. They may not work for you, especially if you + are working on a new research project. In that case you can use the cleaner + "SimpleTrainer", or write your own training loop. You can use + "tools/plain_train_net.py" as an example. + """ + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + """ + Create evaluator(s) for a given dataset. + This uses the special metadata "evaluator_type" associated with each builtin dataset. + For your own dataset, you can simply create an evaluator manually in your + script and do not have to worry about the hacky if-else logic here. 
+ """ + if output_folder is None: + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference") + evaluator_list = [] + evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type + if evaluator_type in ["sem_seg", "coco_panoptic_seg"]: + evaluator_list.append( + SemSegEvaluator( + dataset_name, + distributed=True, + num_classes=cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + ignore_label=cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + output_dir=output_folder, + ) + ) + if evaluator_type in ["coco", "coco_panoptic_seg"]: + evaluator_list.append(COCOEvaluator(dataset_name, cfg, True, output_folder)) + if evaluator_type == "coco_panoptic_seg": + evaluator_list.append(COCOPanopticEvaluator(dataset_name, output_folder)) + if evaluator_type == "cityscapes_instance": + assert ( + torch.cuda.device_count() >= comm.get_rank() + ), "CityscapesEvaluator currently do not work with multiple machines." + return CityscapesInstanceEvaluator(dataset_name) + if evaluator_type == "cityscapes_sem_seg": + assert ( + torch.cuda.device_count() >= comm.get_rank() + ), "CityscapesEvaluator currently do not work with multiple machines." + return CityscapesSemSegEvaluator(dataset_name) + elif evaluator_type == "pascal_voc": + return PascalVOCDetectionEvaluator(dataset_name) + elif evaluator_type == "lvis": + return LVISEvaluator(dataset_name, cfg, True, output_folder) + if len(evaluator_list) == 0: + raise NotImplementedError( + "no Evaluator for the dataset {} with the type {}".format( + dataset_name, evaluator_type + ) + ) + elif len(evaluator_list) == 1: + return evaluator_list[0] + return DatasetEvaluators(evaluator_list) + + @classmethod + def test_with_TTA(cls, cfg, model): + logger = logging.getLogger("detectron2.trainer") + # In the end of training, run an evaluation with TTA + # Only support some R-CNN models. + logger.info("Running inference with test-time augmentation ...") + model = GeneralizedRCNNWithTTA(cfg, model) + evaluators = [ + cls.build_evaluator( + cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA") + ) + for name in cfg.DATASETS.TEST + ] + res = cls.test(cfg, model, evaluators) + res = OrderedDict({k + "_TTA": v for k, v in res.items()}) + return res + + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + res = Trainer.test(cfg, model) + if cfg.TEST.AUG.ENABLED: + res.update(Trainer.test_with_TTA(cfg, model)) + if comm.is_main_process(): + verify_results(cfg, res) + return res + + """ + If you'd like to do anything fancier than the standard training logic, + consider writing your own training loop (see plain_train_net.py) or + subclassing the trainer. 
+ """ + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + if cfg.TEST.AUG.ENABLED: + trainer.register_hooks( + [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))] + ) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) diff --git a/tools/visualize_data.py b/tools/visualize_data.py new file mode 100755 index 0000000000000000000000000000000000000000..b143b2d250787c2880657d42c9e9cc0c80c6a348 --- /dev/null +++ b/tools/visualize_data.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +import argparse +import os +from itertools import chain +import cv2 +import tqdm + +from detectron2.config import get_cfg +from detectron2.data import DatasetCatalog, MetadataCatalog, build_detection_train_loader +from detectron2.data import detection_utils as utils +from detectron2.data.build import filter_images_with_few_keypoints +from detectron2.utils.logger import setup_logger +from detectron2.utils.visualizer import Visualizer + + +def setup(args): + cfg = get_cfg() + if args.config_file: + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + return cfg + + +def parse_args(in_args=None): + parser = argparse.ArgumentParser(description="Visualize ground-truth data") + parser.add_argument( + "--source", + choices=["annotation", "dataloader"], + required=True, + help="visualize the annotations or the data loader (with pre-processing)", + ) + parser.add_argument("--config-file", metavar="FILE", help="path to config file") + parser.add_argument("--output-dir", default="./", help="path to output directory") + parser.add_argument("--show", action="store_true", help="show output in a window") + parser.add_argument( + "opts", + help="Modify config options using the command-line", + default=None, + nargs=argparse.REMAINDER, + ) + return parser.parse_args(in_args) + + +if __name__ == "__main__": + args = parse_args() + logger = setup_logger() + logger.info("Arguments: " + str(args)) + cfg = setup(args) + + dirname = args.output_dir + os.makedirs(dirname, exist_ok=True) + metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0]) + + def output(vis, fname): + if args.show: + print(fname) + cv2.imshow("window", vis.get_image()[:, :, ::-1]) + cv2.waitKey() + else: + filepath = os.path.join(dirname, fname) + print("Saving to {} ...".format(filepath)) + vis.save(filepath) + + scale = 2.0 if args.show else 1.0 + if args.source == "dataloader": + train_data_loader = build_detection_train_loader(cfg) + for batch in train_data_loader: + for per_image in batch: + # Pytorch tensor is in (C, H, W) format + img = per_image["image"].permute(1, 2, 0).cpu().detach().numpy() + img = utils.convert_image_to_rgb(img, cfg.INPUT.FORMAT) + + visualizer = Visualizer(img, metadata=metadata, scale=scale) + target_fields = per_image["instances"].get_fields() + labels = [metadata.thing_classes[i] for i in target_fields["gt_classes"]] + vis = visualizer.overlay_instances( + labels=labels, + boxes=target_fields.get("gt_boxes", None), + masks=target_fields.get("gt_masks", None), + keypoints=target_fields.get("gt_keypoints", None), + ) + output(vis, str(per_image["image_id"]) + ".jpg") + else: + dicts = list(chain.from_iterable([DatasetCatalog.get(k) for k in cfg.DATASETS.TRAIN])) + if 
cfg.MODEL.KEYPOINT_ON: + dicts = filter_images_with_few_keypoints(dicts, 1) + for dic in tqdm.tqdm(dicts): + img = utils.read_image(dic["file_name"], "RGB") + visualizer = Visualizer(img, metadata=metadata, scale=scale) + vis = visualizer.draw_dataset_dict(dic) + output(vis, os.path.basename(dic["file_name"])) diff --git a/tools/visualize_json_results.py b/tools/visualize_json_results.py new file mode 100755 index 0000000000000000000000000000000000000000..d11ecb90382a630d90661bc65cefc4f8bf3486cf --- /dev/null +++ b/tools/visualize_json_results.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +import argparse +import json +import numpy as np +import os +from collections import defaultdict +import cv2 +import tqdm +from fvcore.common.file_io import PathManager + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures import Boxes, BoxMode, Instances +from detectron2.utils.logger import setup_logger +from detectron2.utils.visualizer import Visualizer + + +def create_instances(predictions, image_size): + ret = Instances(image_size) + + score = np.asarray([x["score"] for x in predictions]) + chosen = (score > args.conf_threshold).nonzero()[0] + score = score[chosen] + bbox = np.asarray([predictions[i]["bbox"] for i in chosen]).reshape(-1, 4) + bbox = BoxMode.convert(bbox, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) + + labels = np.asarray([dataset_id_map(predictions[i]["category_id"]) for i in chosen]) + + ret.scores = score + ret.pred_boxes = Boxes(bbox) + ret.pred_classes = labels + + try: + ret.pred_masks = [predictions[i]["segmentation"] for i in chosen] + except KeyError: + pass + return ret + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="A script that visualizes the json predictions from COCO or LVIS dataset." 
+ ) + parser.add_argument("--input", required=True, help="JSON file produced by the model") + parser.add_argument("--output", required=True, help="output directory") + parser.add_argument("--dataset", help="name of the dataset", default="coco_2017_val") + parser.add_argument("--conf-threshold", default=0.5, type=float, help="confidence threshold") + args = parser.parse_args() + + logger = setup_logger() + + with PathManager.open(args.input, "r") as f: + predictions = json.load(f) + + pred_by_image = defaultdict(list) + for p in predictions: + pred_by_image[p["image_id"]].append(p) + + dicts = list(DatasetCatalog.get(args.dataset)) + metadata = MetadataCatalog.get(args.dataset) + if hasattr(metadata, "thing_dataset_id_to_contiguous_id"): + + def dataset_id_map(ds_id): + return metadata.thing_dataset_id_to_contiguous_id[ds_id] + + elif "lvis" in args.dataset: + # LVIS results are in the same format as COCO results, but have a different + # mapping from dataset category id to contiguous category id in [0, #categories - 1] + def dataset_id_map(ds_id): + return ds_id - 1 + + else: + raise ValueError("Unsupported dataset: {}".format(args.dataset)) + + os.makedirs(args.output, exist_ok=True) + + for dic in tqdm.tqdm(dicts): + img = cv2.imread(dic["file_name"], cv2.IMREAD_COLOR)[:, :, ::-1] + basename = os.path.basename(dic["file_name"]) + + predictions = create_instances(pred_by_image[dic["image_id"]], img.shape[:2]) + vis = Visualizer(img, metadata) + vis_pred = vis.draw_instance_predictions(predictions).get_image() + + vis = Visualizer(img, metadata) + vis_gt = vis.draw_dataset_dict(dic).get_image() + + concat = np.concatenate((vis_pred, vis_gt), axis=1) + cv2.imwrite(os.path.join(args.output, basename), concat[:, :, ::-1])
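
The JSON consumed by `visualize_json_results.py` is the instance-prediction dump that `COCOEvaluator` writes as `coco_instances_results.json` when it is given an output directory (for example, during an `./train_net.py --eval-only` run). Below is a minimal sketch of producing that file directly with the detectron2 API; the config path, checkpoint path, and the `coco_2017_val` dataset name are placeholders to adapt to your setup.
```
# A hypothetical, minimal way to produce the prediction JSON used by
# visualize_json_results.py. Paths and the dataset name are placeholders.
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import build_detection_test_loader
from detectron2.evaluation import COCOEvaluator, inference_on_dataset
from detectron2.modeling import build_model

cfg = get_cfg()
cfg.merge_from_file("configs/COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x.yaml")
cfg.MODEL.WEIGHTS = "/path/to/checkpoint_file"

model = build_model(cfg)
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)

# COCOEvaluator writes "coco_instances_results.json" into output_dir; pass that
# file to visualize_json_results.py via --input, and the dataset via --dataset.
evaluator = COCOEvaluator("coco_2017_val", cfg, False, output_dir="./eval_output")
data_loader = build_detection_test_loader(cfg, "coco_2017_val")
inference_on_dataset(model, data_loader, evaluator)
```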