diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2fdf8a2d23cff3f69ea753466370b6dc3c719686..eea0b2544fd606d8593f1b2f12008a76673829d1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,6 +16,7 @@ All kinds of contributions are welcome, including but not limited to the followi ```{note} If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. ``` + ### Code style #### Python @@ -24,10 +25,11 @@ We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code We use the following tools for linting and formatting: -- [flake8](http://flake8.pycqa.org/en/latest/): A wrapper around some linter tools. -- [yapf](https://github.com/google/yapf): A formatter for Python files. +- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. - [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. -- [markdownlint](https://github.com/markdownlint/markdownlint): A linter to check markdown files and flag style issues. +- [yapf](https://github.com/google/yapf): A formatter for Python files. +- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. +- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. - [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). @@ -48,23 +50,9 @@ From the repository folder pre-commit install ``` -Try the following steps to install ruby when you encounter an issue on installing markdownlint - -```shell -# install rvm -curl -L https://get.rvm.io | bash -s -- --autolibs=read-fail -[[ -s "$HOME/.rvm/scripts/rvm" ]] && source "$HOME/.rvm/scripts/rvm" -rvm autolibs disable - -# install ruby -rvm install 2.7.1 -``` - -Or refer to [this repo](https://github.com/innerlee/setup) and take [`zzruby.sh`](https://github.com/innerlee/setup/blob/master/zzruby.sh) according its instruction. - After this on every commit check code linters and formatter will be enforced. ->Before you create a PR, make sure that your code lints and is formatted by yapf. +> Before you create a PR, make sure that your code lints and is formatted by yapf. #### C++ and CUDA diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index e163b312ca5b45dac195232979fa31024ff55ef2..0000000000000000000000000000000000000000 --- a/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM python:3.7 - -WORKDIR /mmcv - -COPY . /mmcv - -RUN pip install -e . diff --git a/LICENSES.md b/LICENSES.md index 9bb0c8cafa72033f503fd3f46b98d30dcfd75c29..5de8358331f4d21529e016807b86b66dc6ca29da 100644 --- a/LICENSES.md +++ b/LICENSES.md @@ -2,7 +2,7 @@ In this file, we list the operations with other licenses instead of Apache 2.0. Users should be careful about adopting these operations in any commercial matters. 
-| Operation | Files | License | -| :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | +| Operation | Files | License | +| :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | | upfirdn2d | [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu) | NVIDIA License | | fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License | diff --git a/MANIFEST.in b/MANIFEST.in index 65f232e070d43ce40d0fd425201e3b140b5af551..5de8494b5df3656a4f6a09da26d9f4bb27ed69a5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,7 @@ include requirements/runtime.txt -include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json +include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json mmcv/model_zoo/torchvision_0.12.json include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp +include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm +recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm diff --git a/README.md b/README.md index 9b64100479f8f8030f1736173aa6ee3e25be8f8a..1a6541a689a48944394db84b48d5b484e63a8708 100644 --- a/README.md +++ b/README.md @@ -1,61 +1,274 @@ -#
MMCV
-## 简介 -MMCV是计算机视觉研究的基础库,主要提供以下功能:图像处理、图像和标注结果可视化、图像转换、多种CNN网络结构、高质量实现的常见CUDA算子。 +
+ +
 
+
+ OpenMMLab website + + + HOT + + +      + OpenMMLab platform + + + TRY IT OUT + + +
+
 
+
-## 安装 -组件支持 -+ Python 3.7、3.8、3.9 +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmcv.readthedocs.io/en/latest/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) +[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) +[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) +[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) + +English | [简体中文](README_zh-CN.md) + +## Introduction + +MMCV is a foundational library for computer vision research and supports many +research projects as below: + +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. + +It provides the following functionalities. + +- Universal IO APIs +- Image/Video processing +- Image and annotation visualization +- Useful utilities (progress bar, timer, ...) +- PyTorch runner with hooking mechanism +- Various CNN architectures +- High-quality implementation of common CUDA ops + +It supports the following systems. + +- Linux +- Windows +- macOS + +See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage. + +Note: MMCV requires Python 3.6+. + +## Installation + +There are two versions of MMCV: + +- **mmcv-full**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. 
+- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops. + +**Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`. + +a. Install the full version. + +Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). + +We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building for **Linux and Windows systems**. In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. + +i. Install the latest version. + +The rule for installing the latest `mmcv-full` is as follows: -### 1、使用pip方式安装 -mmcv whl包下载目录:[https://cancon.hpccube.com:65024/4/main/mmcv](https://cancon.hpccube.com:65024/4/main/mmcv),选择对应的pytorch版本和python版本下载对应mmcv的whl包 ```shell -pip install mmcv* (下载的mmcv的whl包) +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -### 2、使用源码编译方式安装 -#### 编译环境准备 -提供2种环境准备方式: +Please replace `{cu_version}` and `{torch_version}` in the url to your desired one. For example, +to install the latest `mmcv-full` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` -1. 基于光源pytorch基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch、python、dtk及系统下载对应的镜像版本。 +**Note**: mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. For example, if your PyTorch version is 1.8.1 and CUDA version is 11.1, you can use the following command to install mmcv-full. -2. 基于现有python环境:安装pytorch,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/dtk24.04.1](https://cancon.hpccube.com:65024/4/main/pytorch/dtk24.04.1),根据python、dtk版本,下载对应pytorch的whl包。安装命令如下: ```shell -pip install torch* (下载的torch的whl包) -pip install setuptools==59.5.0 wheel +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html ``` -#### 源码编译安装 -- 代码下载 +For more details, please refer the the following tables and delete `=={mmcv_version}`. + +ii. Install a specified version. + +The rule for installing a specified `mmcv-full` is as follows: + ```shell -git clone https://developer.hpccube.com/codes/aicomponent/mmcv # 根据编译需要切换分支 +pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -- 提供2种源码编译方式(进入mmcv目录): + +First of all, please refer to the Releases and replace `{mmcv_version}` a specified one. e.g. `1.3.9`. +Then replace `{cu_version}` and `{torch_version}` in the url to your desired versions. For example, +to install `mmcv-full==1.3.9` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: + +```shell +pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html ``` -1. 编译whl包并安装 -MMCV_WITH_OPS=1 python3 setup.py -v bdist_wheel -pip install dist/mmcv* -2. 
源码编译安装 -MMCV_WITH_OPS=1 python3 setup.py install +For more details, please refer the the following tables. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUDA | torch 1.11 | torch 1.10 | torch 1.9 | torch 1.8 | torch 1.7 | torch 1.6 | torch 1.5
11.5
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
11.3
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
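Whichever cell of the table above applies to you, a quick sanity check after installation helps catch a CUDA/PyTorch mismatch early. The snippet below is only a minimal sketch of such a check (the `check_installation.py` script mentioned earlier is the more thorough option); it assumes mmcv-full was installed with its compiled ops.

```bash
# Minimal post-install sanity check (sketch): confirm mmcv imports and that
# the compiled CUDA ops are present in this environment.
python -c "import mmcv; print(mmcv.__version__)"
python -c "from mmcv.ops import get_compiling_cuda_version, get_compiler_version; print(get_compiling_cuda_version()); print(get_compiler_version())"
```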
+ +**Note**: The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, you can click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html) and you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./docs/en/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. + +**Note**: mmcv-full does not provide pre-built packages for `cu102-torch1.11` and `cu92-torch*` on Windows. + +Another way is to compile locally by running + +```python +pip install mmcv-full ``` -3. 测试验证 + +Note that the local compiling may take up to 10 mins. + +b. Install the lite version. + +```python +pip install mmcv ``` -cd test -pytest -s ./test_arraymisc.py -pytest -s ./test_ops + +c. Install full version with custom operators for onnxruntime + +- Check [here](docs/en/deployment/onnxruntime_op.md) for detailed instruction. + +If you would like to build MMCV from source, please refer to the [guide](https://mmcv.readthedocs.io/en/latest/get_started/build.html). + +## FAQ + +If you face some installation issues, CUDA related issues or RuntimeErrors, +you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html). + +## Citation + +If you find this project useful in your research, please consider cite: + +```latex +@misc{mmcv, + title={{MMCV: OpenMMLab} Computer Vision Foundation}, + author={MMCV Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmcv}}, + year={2018} +} ``` -#### 注意事项 -+ 若使用pip install下载安装过慢,可添加pypi清华源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ -+ ROCM_PATH为dtk的路径,默认为/opt/dtk -## 验证 -- python -c "import mmcv; mmcv.\_\_version__",版本号与官方版本同步,查询该软件的版本号,例如2.0.0; +## Contributing -## Known Issue -- 无 +We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline. -## 参考资料 -- [README_ORIGIN](README_ORIGIN.md) -- [README_zh-CN](README_zh-CN.md) -- [https://github.com/open-mmlab/mmcv](https://github.com/open-mmlab/mmcv) +## License +MMCV is released under the Apache 2.0 license, while some specific operations in this library are with other licenses. Please refer to [LICENSES.md](LICENSES.md) for the careful check, if you are using our code for commercial matters. diff --git a/README_ORIGIN.md b/README_ORIGIN.md deleted file mode 100644 index e9e3f8efaf86059c8e7bef3fec73513b69e31442..0000000000000000000000000000000000000000 --- a/README_ORIGIN.md +++ /dev/null @@ -1,222 +0,0 @@ -
- -
- -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) - -English | [简体中文](README_zh-CN.md) - -## Introduction - -MMCV is a foundational library for computer vision research and supports many -research projects as below: - -- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. -- [MIM](https://github.com/open-mmlab/mim): MIM Installs OpenMMLab Packages. -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. -- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. -- [MMOCR](https://github.com/open-mmlab/mmocr): A Comprehensive Toolbox for Text Detection, Recognition and Understanding. -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. -- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab FewShot Learning Toolbox and Benchmark. - -It provides the following functionalities. - -- Universal IO APIs -- Image/Video processing -- Image and annotation visualization -- Useful utilities (progress bar, timer, ...) -- PyTorch runner with hooking mechanism -- Various CNN architectures -- High-quality implementation of common CUDA ops - -See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage. - -Note: MMCV requires Python 3.6+. - -## Installation - -There are two versions of MMCV: - -- **mmcv-full**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. -- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv<1.0.0. It is useful when you do not need those CUDA ops. - -**Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`. - -a. Install the full version. - -Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). - -We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building. 
In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. - -i. Install the latest version. - -The rule for installing the latest ``mmcv-full`` is as follows: - -```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html -``` - -Please replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired one. For example, -to install the latest ``mmcv-full`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: - -```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -``` - -**Note**: mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. For example, if your PyTorch version is 1.8.1 and CUDA version is 11.1, you can use the following command to install mmcv-full. - -```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html -``` - -For more details, please refer the the following tables and delete ``=={mmcv_version}``. - -ii. Install a specified version. - -The rule for installing a specified ``mmcv-full`` is as follows: - -```shell -pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html -``` - -First of all, please refer to the Releases and replace ``{mmcv_version}`` a specified one. e.g. ``1.3.9``. -Then replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired versions. For example, -to install ``mmcv-full==1.3.9`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: - -```shell -pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -``` - -For more details, please refer the the following tables. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
CUDA | torch1.10 | torch1.9 | torch1.8 | torch1.7 | torch1.6 | torch1.5
11.3
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
- -**Note**: The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, you can click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html) and you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./docs/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. - -Another way is to compile locally by running - -```python -pip install mmcv-full -``` - -Note that the local compiling may take up to 10 mins. - -b. Install the lite version. - -```python -pip install mmcv -``` - -c. Install full version with custom operators for onnxruntime - -- Check [here](docs/deployment/onnxruntime_op.md) for detailed instruction. - -If you would like to build MMCV from source, please refer to the [guide](https://mmcv.readthedocs.io/en/latest/get_started/build.html). - -## FAQ - -If you face some installation issues, CUDA related issues or RuntimeErrors, -you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html). - -## Citation - -If you find this project useful in your research, please consider cite: - -```latex -@misc{mmcv, - title={{MMCV: OpenMMLab} Computer Vision Foundation}, - author={MMCV Contributors}, - howpublished = {\url{https://github.com/open-mmlab/mmcv}}, - year={2018} -} -``` - -## Contributing - -We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline. - -## License - -MMCV is released under the Apache 2.0 license, while some specific operations in this library are with other licenses. Please refer to [LICENSES.md](LICENSES.md) for the careful check, if you are using our code for commercial matters. diff --git a/README_zh-CN.md b/README_zh-CN.md index e3288ee31403d02c6d4c2c9335aff556c2c3d23c..8c768c837ecddc7f6c4d7e036f590d9d2b96fa64 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -1,8 +1,30 @@
- + +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+
 
-[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmcv.readthedocs.io/zh_CN/latest/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) +[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) +[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) +[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) [English](README.md) | 简体中文 @@ -10,20 +32,24 @@ MMCV 是一个面向计算机视觉的基础库,它支持了很多开源项目,例如: -- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 -- [MIM](https://github.com/open-mmlab/mim): OpenMMLab 项目、算法、模型的统一入口 -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱与测试基准 -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 检测工具箱与测试基准 -- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用3D目标检测平台 -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱与测试基准 -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱与测试基准 +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱与测试基准 -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 新一代生成模型工具箱 - [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 +- 
[MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 MMCV 提供了如下众多功能: @@ -35,7 +61,13 @@ MMCV 提供了如下众多功能: - 多种 CNN 网络结构 - 高质量实现的常见 CUDA 算子 -如想了解更多特性和使用,请参考[文档](http://mmcv.readthedocs.io/en/latest)。 +MMCV 支持以下的系统: + +- Linux +- Windows +- macOS + +如想了解更多特性和使用,请参考[文档](http://mmcv.readthedocs.io/zh_CN/latest)。 提示: MMCV 需要 Python 3.6 以上版本。 @@ -50,19 +82,19 @@ MMCV 有两个版本: a. 安装完整版 -在安装 mmcv-full 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 PyTorch 官方[文档](https://pytorch.org/)。 +在安装 mmcv-full 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 PyTorch [官方文档](https://pytorch.org/)。 -我们提供了不同 PyTorch 和 CUDA 版本的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 +我们提供了 **Linux 和 Windows 平台** PyTorch 和 CUDA 版本组合的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 i. 安装最新版本 -如下是安装最新版 ``mmcv-full`` 的命令 +如下是安装最新版 `mmcv-full` 的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -请将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的最新版 ``mmcv-full``,使用如下替换过的命令 +请将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的最新版 `mmcv-full`,使用如下替换过的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -74,18 +106,18 @@ pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9 pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html ``` -如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 ``=={mmcv_version}`` 删去即可。 +如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 `=={mmcv_version}` 删去即可。 ii. 安装特定的版本 -如下是安装特定版本 ``mmcv-full`` 的命令 +如下是安装特定版本 `mmcv-full` 的命令 ```shell pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -首先请参考版本发布信息找到想要安装的版本号,将 ``{mmcv_version}`` 替换成该版本号,例如 ``1.3.9``。 -然后将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的 ``mmcv-full`` 1.3.9 版本,使用如下替换过的命令 +首先请参考版本发布信息找到想要安装的版本号,将 `{mmcv_version}` 替换成该版本号,例如 `1.3.9`。 +然后将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的 `mmcv-full` 1.3.9 版本,使用如下替换过的命令 ```shell pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -97,15 +129,27 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t CUDA - torch1.10 - torch1.9 - torch1.8 - torch1.7 - torch1.6 - torch1.5 + torch 1.11 + torch 1.10 + torch 1.9 + torch 1.8 + torch 1.7 + torch 1.6 + torch 1.5 + + + 11.5 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
+ + + + + + 11.3 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
@@ -115,6 +159,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 11.1 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
@@ -127,12 +172,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
@@ -144,6 +191,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 10.1 +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
@@ -154,12 +202,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
@@ -170,7 +220,9 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t -**注意**:以上提供的预编译包并不囊括所有的 mmcv-full 版本,你可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./docs_zh_CN/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 +**注意**:以上提供的预编译包并不囊括所有的 mmcv-full 版本,你可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./docs/zh_cn/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 + +**注意**:mmcv-full 没有提供 Windows 平台 `cu102-torch1.8.0` 和 `cu92-torch*` 的预编译包。 除了使用预编译包之外,另一种方式是在本地进行编译,直接运行下述命令 @@ -188,13 +240,13 @@ pip install mmcv c. 安装完整版并且编译 onnxruntime 的自定义算子 -- 详细的指南请查看 [这里](docs/deployment/onnxruntime_op.md)。 +- 详细的指南请查看[这里](docs/zh_cn/deployment/onnxruntime_op.md)。 -如果想从源码编译 MMCV,请参考[该文档](https://mmcv.readthedocs.io/en/latest/get_started/build.html)。 +如果想从源码编译 MMCV,请参考[该文档](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html)。 ## FAQ -如果你遇到了安装问题,CUDA 相关的问题或者 RuntimeErrors,可以首先参考[问题解决页面](https://mmcv.readthedocs.io/en/latest/faq.html) 看是否已经有解决方案。 +如果你遇到了安装问题,CUDA 相关的问题或者 RuntimeErrors,可以首先参考[问题解决页面](https://mmcv.readthedocs.io/zh_CN/latest/faq.html) 看是否已经有解决方案。 ## 贡献指南 @@ -203,12 +255,13 @@ c. 安装完整版并且编译 onnxruntime 的自定义算子 ## 许可证 `MMCV` 目前以 Apache 2.0 的许可证发布,但是其中有一部分功能并不是使用的 Apache2.0 许可证,我们在 [许可证](LICENSES.md) 中详细地列出了这些功能以及他们对应的许可证,如果您正在从事盈利性活动,请谨慎参考此文档。 + ## 欢迎加入 OpenMMLab 社区 -扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=GJP18SjI) +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=3ijNTqfg),或添加微信小助手”OpenMMLabwx“加入官方交流微信群。
- +
我们会在 OpenMMLab 社区为大家 diff --git a/TERMINOLOGY.md b/TERMINOLOGY.md index 61941e3306c7dc2c0f7b0e181248cac841571a7a..07411b7774c2ed713f472c1287b98b871c7f4d02 100644 --- a/TERMINOLOGY.md +++ b/TERMINOLOGY.md @@ -4,27 +4,27 @@ This document is used as a reference for English-Chinese terminology translation 该文档用作中英文翻译对照参考。 -| English | 中文 | -| :-----: | :---:| -| annotation | 标注 | -| backbone | 主干网络 | -| benchmark | 基准测试 | -| checkpoint | 模型权重文件 | -| classifier | 分类器 | -| cls_head | 分类头 | -| decoder | 解码器 | -| detector | 检测器 | -| encoder | 编码器 | -| finetune | 微调 | -| ground truth | 真实标签 | -| hook | 钩子 | -| localizer | 定位器 | -| neck | 模型颈部 | -| pipeline | 流水线 | -| recognizer | 识别器 | -| register | 注册器 | -| schedule | 调整 | -| scheduler | 调度器 | -| segmentor | 分割器 | -| tensor | 张量 | -| training schedule | 训练策略 | +| English | 中文 | +| :---------------: | :----------: | +| annotation | 标注 | +| backbone | 主干网络 | +| benchmark | 基准测试 | +| checkpoint | 模型权重文件 | +| classifier | 分类器 | +| cls_head | 分类头 | +| decoder | 解码器 | +| detector | 检测器 | +| encoder | 编码器 | +| finetune | 微调 | +| ground truth | 真实标签 | +| hook | 钩子 | +| localizer | 定位器 | +| neck | 模型颈部 | +| pipeline | 流水线 | +| recognizer | 识别器 | +| register | 注册器 | +| schedule | 调整 | +| scheduler | 调度器 | +| segmentor | 分割器 | +| tensor | 张量 | +| training schedule | 训练策略 | diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e9985b4ca645a14c9e3f18bf7afcc0cb4f52bf73 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,70 @@ +# Docker images + +There are two `Dockerfile` files to build docker images, one to build an image with the mmcv-full pre-built package and the other with the mmcv development environment. + +```text +. +|-- README.md +|-- dev # build with mmcv development environment +| `-- Dockerfile +`-- release # build with mmcv pre-built package + `-- Dockerfile +``` + +## Build docker images + +### Build with mmcv pre-built package + +Build with local repository + +```bash +git clone https://github.com/open-mmlab/mmcv.git && cd mmcv +docker build -t mmcv -f docker/release/Dockerfile . +``` + +Or build with remote repository + +```bash +docker build -t mmcv https://github.com/open-mmlab/mmcv.git#master:docker/release +``` + +The [Dockerfile](release/Dockerfile) installs latest released version of mmcv-full by default, but you can specify mmcv versions to install expected versions. + +```bash +docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=1.5.0 . +``` + +If you also want to use other versions of PyTorch and CUDA, you can also pass them when building docker images. + +An example to build an image with PyTorch 1.11 and CUDA 11.3. + +```bash +docker build -t mmcv -f docker/release/Dockerfile \ + --build-arg PYTORCH=1.9.0 \ + --build-arg CUDA=11.1 \ + --build-arg CUDNN=8 \ + --build-arg MMCV=1.5.0 . +``` + +More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags). + +### Build with mmcv development environment + +If you want to build an docker image with the mmcv development environment, you can use the following command + +```bash +git clone https://github.com/open-mmlab/mmcv.git && cd mmcv +docker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 . +``` + +Note that `CUDA_ARCH` is the cumpute capability of your GPU and you can find it at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute). + +The building process may take 10 minutes or more. 
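If you are not sure which value to pass for `CUDA_ARCH`, besides the NVIDIA page linked above you can ask PyTorch for the compute capability of the visible GPU. This is just one way to look it up and assumes a CUDA-enabled PyTorch is already installed on the host:

```bash
# Print the compute capability of GPU 0, e.g. (7, 5) -> pass CUDA_ARCH=7.5
python -c "import torch; print(torch.cuda.get_device_capability(0))"
```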
+ +## Run images + +```bash +docker run --gpus all --shm-size=8g -it mmcv +``` + +See [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usages. diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..0c673e958f2909cd80f589100c2b7cbfa726c499 --- /dev/null +++ b/docker/dev/Dockerfile @@ -0,0 +1,32 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# Install git and system dependencies for opencv-python +RUN apt-get update && apt-get install -y git \ + && apt-get update && apt-get install -y libgl1 libglib2.0-0 + +# Install system dependencies for unit tests +RUN apt-get install -y ffmpeg libturbojpeg \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# build mmcv-full from source with develop mode +ARG HTTPS_PROXY="" +ENV https_proxy=${HTTPS_PROXY} +ENV FORCE_CUDA="1" +ENV MMCV_WITH_OPS="1" +ARG CUDA_ARCH="" +ENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH} +RUN git clone https://github.com/open-mmlab/mmcv.git /mmcv +WORKDIR /mmcv +RUN git rev-parse --short HEAD +RUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install diff --git a/docker/release/Dockerfile b/docker/release/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..493aa6d1625c9bdee1b9f3bd8121c6ff2f723d4a --- /dev/null +++ b/docker/release/Dockerfile @@ -0,0 +1,20 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# Install system dependencies for opencv-python +RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install mmcv-full +ARG MMCV="1.5.1" +RUN pip install openmim && mim install mmcv-full==${MMCV} && python -c 'import mmcv;print(mmcv.__version__)' diff --git a/docs/_static/qq_group_qrcode.jpg b/docs/_static/qq_group_qrcode.jpg deleted file mode 100644 index 7c6b04f561da283ae622f4219ea9b8cabf8f301a..0000000000000000000000000000000000000000 Binary files a/docs/_static/qq_group_qrcode.jpg and /dev/null differ diff --git a/docs/_static/zhihu_qrcode.jpg b/docs/_static/zhihu_qrcode.jpg deleted file mode 100644 index c745fb027f06564d41794e9a40069b06c34e2bb5..0000000000000000000000000000000000000000 Binary files a/docs/_static/zhihu_qrcode.jpg and /dev/null differ diff --git a/docs/community/contributing.md b/docs/community/contributing.md deleted file mode 120000 index f939e75f21a8badb5c40f527abd0e098fe9bc472..0000000000000000000000000000000000000000 --- a/docs/community/contributing.md +++ /dev/null @@ -1 +0,0 @@ -../../CONTRIBUTING.md \ No newline at end of file diff --git 
a/docs/community/pr.md b/docs/community/pr.md deleted file mode 100644 index 77bdbf77080577d48ca734ffeb45d12269a166e4..0000000000000000000000000000000000000000 --- a/docs/community/pr.md +++ /dev/null @@ -1,94 +0,0 @@ -## Pull Request (PR) - -### What is PR - -`PR` is the abbreviation of `Pull Request`. Here's the definition of `PR` in the [official document](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) of Github. - -> Pull requests let you tell others about changes you've pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch. - -### Basic Workflow - -1. Get the most recent codebase -2. Checkout a new branch from the master branch -3. Commit your changes -4. Push your changes and create a PR -5. Discuss and review your code -6. Merge your branch to the master branch - -### Procedures in detail - -1. Get the most recent codebase - + When you work on your first PR - - Fork the OpenMMLab repository: click the **fork** button at the top right corner of Github page - ![avatar](../_static/community/1.png) - - - Clone forked repository to local - ```bash - git clone git@github.com:XXX/mmcv.git - ``` - - - Add source repository to upstream - ```bash - git remote add upstream git@github.com:open-mmlab/mmcv - ``` - - + After your first PR - - Checkout master branch of the local repository and pull the latest master branch of the source repository - ```bash - git checkout master - git pull upstream master - ``` - -2. Checkout a new branch from the master branch - ```bash - git checkout -b branchname - ``` - -```{tip} -To make commit history clear, we strongly recommend you checkout the master branch before create a new branch. -``` - -3. Commit your changes - ```bash - # coding - git add [files] - git commit -m 'messages' - ``` - -4. Push your changes to the forked repository and create a PR - + Push the branch to your forked remote repository - ```bash - git push origin branchname - ``` - - + Create a PR - ![avatar](../_static/community/2.png) - - + Revise PR message template to describe your motivation and modifications made in this PR. You can also link the related issue to the PR manually in the PR message (For more information, checkout the [official guidance](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)). - -5. Discuss and review your code - + After creating a pull request, you can ask a specific person to review the changes you've proposed - ![avatar](../_static/community/3.png) - - + Modify your codes according to reviewers' suggestions and then push your changes - -6. Merge your branch to the master branch and delete the branch - ```bash - git branch -d branchname # delete local branch - git push origin --delete branchname # delete remote branch - ``` - -### PR Specs - -1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style -2. One short-time branch should be matched with only one PR -3. Accomplish a detailed change in one PR. Avoid large PR - >- Bad: Support Faster R-CNN - >- Acceptable: Add a box head to Faster R-CNN - >- Good: Add a parameter to box head to support custom conv-layer number -4. Provide clear and significant commit message -5. Provide clear and meaningful PR description - >- Task name should be clarified in title. 
The general format is: [Prefix] Short description of the PR (Suffix) - >- Prefix: add new feature [Feature], fix bug [Fix], related to documents [Docs], in developing [WIP] (which will not be reviewed temporarily) - >- Introduce main changes, results and influences on other modules in short description - >- Associate related issues and pull requests with a milestone diff --git a/docs/deployment/onnx.md b/docs/deployment/onnx.md deleted file mode 100644 index be6c59c5c5dbe3d17d62f4c01c79df35afb19d6d..0000000000000000000000000000000000000000 --- a/docs/deployment/onnx.md +++ /dev/null @@ -1,19 +0,0 @@ -## Introduction of onnx module in MMCV (Experimental) - -### register_extra_symbolics - -Some extra symbolic functions need to be registered before exporting PyTorch model to ONNX. - -#### Example - -```python -import mmcv -from mmcv.onnx import register_extra_symbolics - -opset_version = 11 -register_extra_symbolics(opset_version) -``` - -#### FAQs - -- None diff --git a/docs/Makefile b/docs/en/Makefile similarity index 100% rename from docs/Makefile rename to docs/en/Makefile diff --git a/docs/_static/community/1.png b/docs/en/_static/community/1.png similarity index 100% rename from docs/_static/community/1.png rename to docs/en/_static/community/1.png diff --git a/docs/_static/community/2.png b/docs/en/_static/community/2.png similarity index 100% rename from docs/_static/community/2.png rename to docs/en/_static/community/2.png diff --git a/docs/_static/community/3.png b/docs/en/_static/community/3.png similarity index 100% rename from docs/_static/community/3.png rename to docs/en/_static/community/3.png diff --git a/docs/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css similarity index 100% rename from docs/_static/css/readthedocs.css rename to docs/en/_static/css/readthedocs.css diff --git a/docs/_static/flow_img2toimg1.png b/docs/en/_static/flow_img2toimg1.png similarity index 100% rename from docs/_static/flow_img2toimg1.png rename to docs/en/_static/flow_img2toimg1.png diff --git a/docs/_static/flow_raw_images.png b/docs/en/_static/flow_raw_images.png similarity index 100% rename from docs/_static/flow_raw_images.png rename to docs/en/_static/flow_raw_images.png diff --git a/docs/_static/flow_visualization.png b/docs/en/_static/flow_visualization.png similarity index 100% rename from docs/_static/flow_visualization.png rename to docs/en/_static/flow_visualization.png diff --git a/docs/_static/flow_warp.png b/docs/en/_static/flow_warp.png similarity index 100% rename from docs/_static/flow_warp.png rename to docs/en/_static/flow_warp.png diff --git a/docs/_static/flow_warp_diff.png b/docs/en/_static/flow_warp_diff.png similarity index 100% rename from docs/_static/flow_warp_diff.png rename to docs/en/_static/flow_warp_diff.png diff --git a/docs/_static/image/mmcv-logo.png b/docs/en/_static/image/mmcv-logo.png similarity index 100% rename from docs/_static/image/mmcv-logo.png rename to docs/en/_static/image/mmcv-logo.png diff --git a/docs/_static/parallel_progress.gif b/docs/en/_static/parallel_progress.gif similarity index 100% rename from docs/_static/parallel_progress.gif rename to docs/en/_static/parallel_progress.gif diff --git a/docs/_static/parallel_progress.png b/docs/en/_static/parallel_progress.png similarity index 100% rename from docs/_static/parallel_progress.png rename to docs/en/_static/parallel_progress.png diff --git a/docs/_static/progress.gif b/docs/en/_static/progress.gif similarity index 100% rename from docs/_static/progress.gif rename to 
docs/en/_static/progress.gif diff --git a/docs/_static/progress.png b/docs/en/_static/progress.png similarity index 100% rename from docs/_static/progress.png rename to docs/en/_static/progress.png diff --git a/docs/en/_static/qq_group_qrcode.jpg b/docs/en/_static/qq_group_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8216326ad442c37c706bdf6dc8f7203c532849d2 Binary files /dev/null and b/docs/en/_static/qq_group_qrcode.jpg differ diff --git a/docs/en/_static/wechat_qrcode.jpg b/docs/en/_static/wechat_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1f453ab91436264e5795569e8e3fdc86204024d5 Binary files /dev/null and b/docs/en/_static/wechat_qrcode.jpg differ diff --git a/docs/en/_static/zhihu_qrcode.jpg b/docs/en/_static/zhihu_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f71e00615174516f9befa43ef20eff4216bded4c Binary files /dev/null and b/docs/en/_static/zhihu_qrcode.jpg differ diff --git a/docs/api.rst b/docs/en/api.rst similarity index 90% rename from docs/api.rst rename to docs/en/api.rst index 8ca9118c3b033f1b7311ec3c1533ce9c93fa1aa2..5d3e623037e3fb102f8c927ff5909d478a46cab9 100644 --- a/docs/api.rst +++ b/docs/en/api.rst @@ -38,6 +38,11 @@ runner .. automodule:: mmcv.runner :members: +engine +------ +.. automodule:: mmcv.engine + :members: + ops ------ .. automodule:: mmcv.ops diff --git a/docs/en/community/contributing.md b/docs/en/community/contributing.md new file mode 120000 index 0000000000000000000000000000000000000000..72723396444c0a6cc0516f6f2379b2d868ba59f7 --- /dev/null +++ b/docs/en/community/contributing.md @@ -0,0 +1 @@ +../../../CONTRIBUTING.md diff --git a/docs/en/community/pr.md b/docs/en/community/pr.md new file mode 100644 index 0000000000000000000000000000000000000000..12b7535e749109820b60d59776c91f6be25c2fa3 --- /dev/null +++ b/docs/en/community/pr.md @@ -0,0 +1,114 @@ +## Pull Request (PR) + +### What is PR + +`PR` is the abbreviation of `Pull Request`. Here's the definition of `PR` in the [official document](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) of Github. + +``` +Pull requests let you tell others about changes you have pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch. +``` + +### Basic Workflow + +1. Get the most recent codebase +2. Checkout a new branch from the master branch +3. Commit your changes +4. Push your changes and create a PR +5. Discuss and review your code +6. Merge your branch to the master branch + +### Procedures in detail + +#### 1. Get the most recent codebase + +- When you work on your first PR + + Fork the OpenMMLab repository: click the **fork** button at the top right corner of Github page + ![avatar](../_static/community/1.png) + + Clone forked repository to local + + ```bash + git clone git@github.com:XXX/mmcv.git + ``` + + Add source repository to upstream + + ```bash + git remote add upstream git@github.com:open-mmlab/mmcv + ``` + +- After your first PR + + Checkout master branch of the local repository and pull the latest master branch of the source repository + + ```bash + git checkout master + git pull upstream master + ``` + +#### 2. 
Checkout a new branch from the master branch + +```bash +git checkout -b branchname +``` + +```{tip} +To make commit history clear, we strongly recommend you check out the master branch before creating a new branch. +``` + +#### 3. Commit your changes + +```bash +# coding +git add [files] +git commit -m 'messages' +``` + +#### 4. Push your changes to the forked repository and create a PR + +- Push the branch to your forked remote repository + + ```bash + git push origin branchname + ``` + +- Create a PR + ![avatar](../_static/community/2.png) + +- Revise the PR message template to describe your motivation and the modifications made in this PR. You can also link the related issue to the PR manually in the PR message (For more information, check out the [official guidance](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)). + +#### 5. Discuss and review your code + +- After creating a pull request, you can ask a specific person to review the changes you've proposed + ![avatar](../_static/community/3.png) + +- Modify your code according to the reviewers' suggestions and then push your changes + +#### 6. Merge your branch to the master branch and delete the branch + +```bash +git branch -d branchname # delete local branch +git push origin --delete branchname # delete remote branch +``` + +### PR Specs + +1. Use the [pre-commit](https://pre-commit.com) hook to avoid issues of code style + +2. One short-lived branch should be matched with only one PR + +3. Accomplish a detailed change in one PR. Avoid large PRs + + - Bad: Support Faster R-CNN + - Acceptable: Add a box head to Faster R-CNN + - Good: Add a parameter to box head to support custom conv-layer number + +4. Provide clear and significant commit messages + +5. Provide a clear and meaningful PR description + + - Task name should be clarified in title. 
The general format is: \[Prefix\] Short description of the PR (Suffix) + - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) + - Introduce main changes, results and influences on other modules in short description + - Associate related issues and pull requests with a milestone diff --git a/docs/compatibility.md b/docs/en/compatibility.md similarity index 100% rename from docs/compatibility.md rename to docs/en/compatibility.md diff --git a/docs_zh_CN/conf.py b/docs/en/conf.py similarity index 61% rename from docs_zh_CN/conf.py rename to docs/en/conf.py index e0c65d0eeca3bc99ef827b3fa36fc903422e8832..e38dfab1d2673a9bf07dcb9635cab4096e1960c8 100644 --- a/docs_zh_CN/conf.py +++ b/docs/en/conf.py @@ -15,21 +15,19 @@ import os import sys import pytorch_sphinx_theme -from m2r import MdInclude -from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) -version_file = '../mmcv/version.py' -with open(version_file, 'r') as f: +version_file = '../../mmcv/version.py' +with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' -copyright = '2018-2021, OpenMMLab' +copyright = '2018-2022, OpenMMLab' author = 'MMCV Authors' # The short X.Y version @@ -51,14 +49,14 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', - 'sphinx.ext.autosectionlabel', 'sphinx_markdown_tables', 'myst_parser', 'sphinx_copybutton', ] # yapf: disable +myst_heading_anchors = 4 + autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] -autosectionlabel_prefix_document = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -79,7 +77,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'zh_CN' +language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
@@ -108,94 +106,9 @@ html_theme_options = { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, - { - 'name': - '文档', - 'children': [ - { - 'name': 'MMCV', - 'url': 'https://mmcv.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MIM', - 'url': 'https://openmim.readthedocs.io/en/latest/' - }, - { - 'name': 'MMAction2', - 'url': 'https://mmaction2.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMClassification', - 'url': - 'https://mmclassification.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMDetection', - 'url': 'https://mmdetection.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMDetection3D', - 'url': - 'https://mmdetection3d.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMEditing', - 'url': 'https://mmediting.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMGeneration', - 'url': 'https://mmgeneration.readthedocs.io/en/latest/', - }, - { - 'name': 'MMOCR', - 'url': 'https://mmocr.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMPose', - 'url': 'https://mmpose.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMSegmentation', - 'url': - 'https://mmsegmentation.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMTracking', - 'url': 'https://mmtracking.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMFlow', - 'url': 'https://mmflow.readthedocs.io/en/latest/', - }, - { - 'name': 'MMFewShot', - 'url': 'https://mmfewshot.readthedocs.io/zh_CN/latest/', - }, - ] - }, - { - 'name': - 'OpenMMLab', - 'children': [ - { - 'name': '主页', - 'url': 'https://openmmlab.com/' - }, - { - 'name': 'GitHub', - 'url': 'https://github.com/open-mmlab/' - }, - { - 'name': '推特', - 'url': 'https://twitter.com/OpenMMLab' - }, - { - 'name': '知乎', - 'url': 'https://zhihu.com/people/openmmlab' - }, - ] - }, - ] + ], + # Specify the language of shared menu + 'menu_lang': 'en', } # Add any paths that contain custom static files (such as style sheets) here, @@ -288,16 +201,3 @@ StandaloneHTMLBuilder.supported_image_types = [ # Ignore >>> when copying code copybutton_prompt_text = r'>>> |\.\.\. ' copybutton_prompt_is_regexp = True - - -def setup(app): - app.add_config_value('no_underscore_emphasis', False, 'env') - app.add_config_value('m2r_parse_relative_links', False, 'env') - app.add_config_value('m2r_anonymous_references', False, 'env') - app.add_config_value('m2r_disable_inline_math', False, 'env') - app.add_directive('mdinclude', MdInclude) - app.add_config_value('recommonmark_config', { - 'auto_toc_tree_section': 'Contents', - 'enable_eval_rst': True, - }, True) - app.add_transform(AutoStructify) diff --git a/docs/deployment/mmcv_ops_definition.md b/docs/en/deployment/mmcv_ops_definition.md similarity index 80% rename from docs/deployment/mmcv_ops_definition.md rename to docs/en/deployment/mmcv_ops_definition.md index 5696316be5b1fb9234faab74cd83ad579655724e..d7eabb33fd41855116ed975d4e48daea81e4d74d 100644 --- a/docs/deployment/mmcv_ops_definition.md +++ b/docs/en/deployment/mmcv_ops_definition.md @@ -1,7 +1,10 @@ -# Definition of custom operators in MMCV +# MMCV Operators + +To make custom operators in MMCV more standard, precise definitions of each operator are listed in this document. 
-- [Definition of custom operators in MMCV](#definition-of-custom-operators-in-mmcv) + +- [MMCV Operators](#mmcv-operators) - [MMCVBorderAlign](#mmcvborderalign) - [Description](#description) - [Parameters](#parameters) @@ -80,25 +83,26 @@ - [Inputs](#inputs-12) - [Outputs](#outputs-12) - [Type Constraints](#type-constraints-12) -- [torch](#torch) - - [grid_sampler](#grid_sampler) + - [grid_sampler\*](#grid_sampler) - [Description](#description-13) - [Parameters](#parameters-13) - [Inputs](#inputs-13) - [Outputs](#outputs-13) - [Type Constraints](#type-constraints-13) - - [cummax](#cummax) + - [cummax\*](#cummax) - [Description](#description-14) - [Parameters](#parameters-14) - [Inputs](#inputs-14) - [Outputs](#outputs-14) - [Type Constraints](#type-constraints-14) - - [cummin](#cummin) + - [cummin\*](#cummin) - [Description](#description-15) - [Parameters](#parameters-15) - [Inputs](#inputs-15) - [Outputs](#outputs-15) - [Type Constraints](#type-constraints-15) + - [Reminders](#reminders) + ## MMCVBorderAlign @@ -118,9 +122,9 @@ Read [BorderDet: Border Feature for Dense Object Detection](ttps://arxiv.org/abs ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). | +| Type | Parameter | Description | +| ----- | ----------- | ----------------------------------------------------------------------------------- | +| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). | ### Inputs @@ -152,11 +156,11 @@ Read [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.0 ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `int` | `kernel_size` | reassemble kernel size, should be odd integer| -| `int` | `group_size` | reassemble group size | -| `float` | `scale_factor` | upsample ratio(>=1) | +| Type | Parameter | Description | +| ------- | -------------- | --------------------------------------------- | +| `int` | `kernel_size` | reassemble kernel size, should be odd integer | +| `int` | `group_size` | reassemble group size | +| `float` | `scale_factor` | upsample ratio(>=1) | ### Inputs @@ -187,8 +191,7 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +None ### Inputs @@ -219,8 +222,7 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +None ### Inputs @@ -242,7 +244,6 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p - T:tensor(float32) - ## MMCVCornerPool ### Description @@ -251,9 +252,9 @@ Perform CornerPool on `input` features. 
Read [CornerNet -- Detecting Objects as ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------- | -| `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | +| Type | Parameter | Description | +| ----- | --------- | ---------------------------------------------------------------- | +| `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | ### Inputs @@ -283,15 +284,15 @@ Read [Deformable Convolutional Networks](https://arxiv.org/pdf/1703.06211.pdf) f ### Parameters -| Type | Parameter | Description | -| -------------- | ------------------ | ------------------------------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`. | -| `list of ints` | `padding` | Paddings on both sides of the input, (padH, padW). Defaults to `(0, 0)`. | -| `list of ints` | `dilation` | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`. | -| `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`.| -| `int` | `deformable_groups` | Groups of deformable offset. Defaults to `1`. | -| `int` | `bias` | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. | -| `int` | `im2col_step` | Groups of deformable offset. Defaults to `32`. | +| Type | Parameter | Description | +| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`. | +| `list of ints` | `padding` | Paddings on both sides of the input, (padH, padW). Defaults to `(0, 0)`. | +| `list of ints` | `dilation` | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`. | +| `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`. | +| `int` | `deformable_groups` | Groups of deformable offset. Defaults to `1`. | +| `int` | `bias` | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. | +| `int` | `im2col_step` | Groups of deformable offset. Defaults to `32`. | ### Inputs @@ -323,11 +324,11 @@ Perform Modulated Deformable Convolution on input feature, read [Deformable Conv ### Parameters -| Type | Parameter | Description | -| -------------- | ------------------ | ------------------------------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | -| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | -| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | +| Type | Parameter | Description | +| -------------- | ------------------- | ------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | +| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | | `int` | `deformable_groups` | Groups of deformable offset. | | `int` | `groups` | Split input into groups. 
`input_channel` should be divisible by the number of groups. | @@ -365,13 +366,13 @@ Deformable roi pooling layer ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +| Type | Parameter | Description | +| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | -| `float` | `gamma` | gamma | +| `float` | `gamma` | gamma | ### Inputs @@ -404,10 +405,10 @@ Read [Pixel Recurrent Neural Networks](https://arxiv.org/abs/1601.06759) for mor ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** | -| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`. | +| Type | Parameter | Description | +| -------------- | --------- | -------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`. | ### Inputs @@ -443,10 +444,10 @@ Read [PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://hs ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `int` | `psa_type` | `0` means collect and `1` means `distribute` | -| `list of ints` | `mask_size` | The size of mask | +| Type | Parameter | Description | +| -------------- | ----------- | -------------------------------------------- | +| `int` | `psa_type` | `0` means collect and `1` means `distribute` | +| `list of ints` | `mask_size` | The size of mask | ### Inputs @@ -478,9 +479,9 @@ Note this definition is slightly different with [onnx: NonMaxSuppression](https: | Type | Parameter | Description | | ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `int` | `center_point_box` | 0 - the box data is supplied as [y1, x1, y2, x2], 1-the box data is supplied as [x_center, y_center, width, height]. | +| `int` | `center_point_box` | 0 - the box data is supplied as \[y1, x1, y2, x2\], 1-the box data is supplied as \[x_center, y_center, width, height\]. | | `int` | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. | -| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | | `float` | `score_threshold` | The threshold for deciding when to remove boxes based on score. 
| | `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | @@ -543,7 +544,6 @@ Perform RoIAlign on output feature, used in bbox_head of most two-stage detector - T:tensor(float32) - ## MMCVRoIAlignRotated ### Description @@ -552,15 +552,15 @@ Perform RoI align pooling for rotated proposals ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +| Type | Parameter | Description | +| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | | `str` | `mode` | pooling mode in each bin. `avg` or `max` | | `int` | `aligned` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | -| `int` | `clockwise` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | +| `int` | `clockwise` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | ### Inputs @@ -581,9 +581,7 @@ Perform RoI align pooling for rotated proposals - T:tensor(float32) -# torch - -## grid_sampler +## grid_sampler\* ### Description @@ -619,7 +617,7 @@ Check [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generate - T:tensor(float32, Linear) -## cummax +## cummax\* ### Description @@ -627,9 +625,9 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum e ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------- | -| `int` | `dim` | the dimension to do the operation over | +| Type | Parameter | Description | +| ----- | --------- | -------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | ### Inputs @@ -651,7 +649,7 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum e - T:tensor(float32) -## cummin +## cummin\* ### Description @@ -659,9 +657,9 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum e ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------- | -| `int` | `dim` | the dimension to do the operation over | +| Type | Parameter | Description | +| ----- | --------- | -------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | ### Inputs @@ -682,3 +680,7 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum e ### Type Constraints - T:tensor(float32) + +## Reminders + +- Operators endwith `*` are defined in Torch and are included here for the conversion to ONNX. diff --git a/docs/en/deployment/onnx.md b/docs/en/deployment/onnx.md new file mode 100644 index 0000000000000000000000000000000000000000..528a9fdb91a4306bb41edf242efa9705a8a52c37 --- /dev/null +++ b/docs/en/deployment/onnx.md @@ -0,0 +1,28 @@ +## Introduction of mmcv.onnx module + +### DeprecationWarning + +ONNX support will be deprecated in the future. 
+Welcome to use the unified model deployment toolbox MMDeploy: https://github.com/open-mmlab/mmdeploy + +### register_extra_symbolics + +Some extra symbolic functions need to be registered before exporting PyTorch model to ONNX. + +#### Example + +```python +import mmcv +from mmcv.onnx import register_extra_symbolics + +opset_version = 11 +register_extra_symbolics(opset_version) +``` + +#### Reminder + +- *Please note that this feature is experimental and may change in the future.* + +#### FAQs + +- None diff --git a/docs/deployment/onnxruntime_custom_ops.md b/docs/en/deployment/onnxruntime_custom_ops.md similarity index 98% rename from docs/deployment/onnxruntime_custom_ops.md rename to docs/en/deployment/onnxruntime_custom_ops.md index baaa576f6d789f0eb53b4005dec537de5e06e700..85df4e2a2ee31e1b1097ff270af5b710f3244a87 100644 --- a/docs/deployment/onnxruntime_custom_ops.md +++ b/docs/en/deployment/onnxruntime_custom_ops.md @@ -1,8 +1,8 @@ -## Onnxruntime Custom Ops +## ONNX Runtime Custom Ops -- [Onnxruntime Custom Ops](#onnxruntime-custom-ops) +- [ONNX Runtime Custom Ops](#onnx-runtime-custom-ops) - [SoftNMS](#softnms) - [Description](#description) - [Parameters](#parameters) @@ -143,10 +143,10 @@ Filter out boxes has high IoU overlap with previously selected boxes. #### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------------------------------------------------------- | -| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | -| `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | +| Type | Parameter | Description | +| ------- | --------------- | ------------------------------------------------------------------------------------------------------------------ | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | +| `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | #### Inputs @@ -338,13 +338,13 @@ Perform Modulated Deformable Convolution on input feature, read [Deformable Conv - T:tensor(float32, Linear) -## MMCVDeformConv2d +### MMCVDeformConv2d -### Description +#### Description Perform Deformable Convolution on input feature, read [Deformable Convolutional Network](https://arxiv.org/abs/1703.06211) for detail. -### Parameters +#### Parameters | Type | Parameter | Description | | -------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------- | @@ -355,7 +355,7 @@ Perform Deformable Convolution on input feature, read [Deformable Convolutional | `int` | `group` | Split input into groups. `input_channel` should be divisible by the number of groups. | | `int` | `im2col_step` | DeformableConv2d use im2col to compute convolution. im2col_step is used to split input and offset, reduce memory usage of column. | -### Inputs +#### Inputs
inputs[0]: T
@@ -366,13 +366,13 @@ Perform Deformable Convolution on input feature, read [Deformable Convolutional
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
-### Outputs +#### Outputs
outputs[0]: T
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
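A minimal Python sketch of the `mmcv.ops` layer that is assumed to correspond to the `MMCVDeformConv2d` node described above (`DeformConv2dPack` predicts its own offsets; the shapes and argument values are illustrative only, and depending on the mmcv-full build the op may require a CUDA device):

```python
import torch
from mmcv.ops import DeformConv2dPack

# DeformConv2dPack predicts its own offsets from the input feature map.
# Depending on how mmcv-full was built, this op may need to run on a GPU.
conv = DeformConv2dPack(in_channels=3, out_channels=8, kernel_size=3, padding=1)
x = torch.rand(1, 3, 32, 32)
if torch.cuda.is_available():
    conv, x = conv.cuda(), x.cuda()
out = conv(x)  # 4-D output feature of shape (1, 8, 32, 32)
```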
-### Type Constraints +#### Type Constraints - T:tensor(float32, Linear) diff --git a/docs/deployment/onnxruntime_op.md b/docs/en/deployment/onnxruntime_op.md similarity index 65% rename from docs/deployment/onnxruntime_op.md rename to docs/en/deployment/onnxruntime_op.md index f17b32a0647e2f25b1736580f385e7ae1fcb8163..2778ba3448813ca1e63fa250c4fc99e170dea736 100644 --- a/docs/deployment/onnxruntime_op.md +++ b/docs/en/deployment/onnxruntime_op.md @@ -1,4 +1,9 @@ -## Custom operators for ONNX Runtime in MMCV +## ONNX Runtime Deployment + +### DeprecationWarning + +ONNX support will be deprecated in the future. +Welcome to use the unified model deployment toolbox MMDeploy: https://github.com/open-mmlab/mmdeploy ### Introduction of ONNX Runtime @@ -15,15 +20,15 @@ ### List of operators for ONNX Runtime supported in MMCV -| Operator | CPU | GPU | MMCV Releases | -| :----------------------------------------------------: | :---: | :---: | :-----------: | -| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | -| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | -| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | -| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | -| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | -| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | master | -| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | master | +| Operator | CPU | GPU | MMCV Releases | +| :----------------------------------------------------- | :-: | :-: | :-----------: | +| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | +| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | +| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | +| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | +| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | +| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | 1.3.4 | +| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | 1.3.4 | ### How to build custom operators for ONNX Runtime @@ -88,7 +93,10 @@ onnx_results = sess.run(None, {'input' : input_data}) #### Reminder +- *Please note that this feature is experimental and may change in the future. Strongly suggest users always try with the latest master branch.* + - The custom operator is not included in [supported operator list](https://github.com/microsoft/onnxruntime/blob/master/docs/OperatorKernels.md) in ONNX Runtime. + - The custom operator should be able to be exported to ONNX. #### Main procedures @@ -96,18 +104,20 @@ onnx_results = sess.run(None, {'input' : input_data}) Take custom operator `soft_nms` for example. 1. Add header `soft_nms.h` to ONNX Runtime include directory `mmcv/ops/csrc/onnxruntime/` + 2. Add source `soft_nms.cpp` to ONNX Runtime source directory `mmcv/ops/csrc/onnxruntime/cpu/` -3. Register `soft_nms` operator in [onnxruntime_register.cpp](../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp) - ```c++ - #include "soft_nms.h" +3. Register `soft_nms` operator in [onnxruntime_register.cpp](../../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp) + + ```c++ + #include "soft_nms.h" - SoftNmsOp c_SoftNmsOp; + SoftNmsOp c_SoftNmsOp; - if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { - return status; - } - ``` + if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { + return status; + } + ``` 4. Add unit test into `tests/test_ops/test_onnx.py` Check [here](../../tests/test_ops/test_onnx.py) for examples. 
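Since the hunks above only carry a fragment of the Python inference snippet (`onnx_results = sess.run(None, {'input' : input_data})`), here is a hedged sketch of the usual pattern for loading the compiled custom-op library at inference time; the model file `tmp.onnx`, the input name `input`, and the input shape are placeholders rather than artifacts shipped with MMCV:

```python
import os

import numpy as np
import onnxruntime as ort
from mmcv.ops import get_onnxruntime_op_path

# Locate the custom-op library built together with mmcv-full; the returned
# path is empty if mmcv-full was compiled without ONNX Runtime ops.
ort_custom_op_path = get_onnxruntime_op_path()
assert os.path.exists(ort_custom_op_path), 'custom ops for ONNX Runtime not built'

# Register the library before creating the inference session.
session_options = ort.SessionOptions()
session_options.register_custom_ops_library(ort_custom_op_path)

# 'tmp.onnx' and the input name 'input' are placeholders for a model that was
# exported with the MMCV custom operators.
sess = ort.InferenceSession('tmp.onnx', session_options)
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
onnx_results = sess.run(None, {'input': input_data})
```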
@@ -117,10 +127,10 @@ Take custom operator `soft_nms` for example. ### Known Issues - "RuntimeError: tuple appears in op that does not forward tuples, unsupported kind: `prim::PythonOp`." - 1. Note generally `cummax` or `cummin` is exportable to ONNX as long as the torch version >= 1.5.0, since `torch.cummax` is only supported with torch >= 1.5.0. But when `cummax` or `cummin` serves as an intermediate component whose outputs is used as inputs for another modules, it's expected that torch version must be >= 1.7.0. Otherwise the above error might arise, when running exported ONNX model with onnxruntime. - 2. Solution: update the torch version to 1.7.0 or higher. + 1. Note generally `cummax` or `cummin` is exportable to ONNX as long as the torch version >= 1.5.0, since `torch.cummax` is only supported with torch >= 1.5.0. But when `cummax` or `cummin` serves as an intermediate component whose outputs is used as inputs for another modules, it's expected that torch version must be >= 1.7.0. Otherwise the above error might arise, when running exported ONNX model with onnxruntime. + 2. Solution: update the torch version to 1.7.0 or higher. ### References - [How to export Pytorch model with custom op to ONNX and run it in ONNX Runtime](https://github.com/onnx/tutorials/blob/master/PyTorchCustomOperator/README.md) -- [How to add a custom operator/kernel in ONNX Runtime](https://github.com/microsoft/onnxruntime/blob/master/docs/AddingCustomOp.md) +- [How to add a custom operator/kernel in ONNX Runtime](https://onnxruntime.ai/docs/reference/operators/add-custom-op.html) diff --git a/docs/deployment/tensorrt_custom_ops.md b/docs/en/deployment/tensorrt_custom_ops.md similarity index 96% rename from docs/deployment/tensorrt_custom_ops.md rename to docs/en/deployment/tensorrt_custom_ops.md index be47e355be6316295ca18f12450630e9fe6d3854..37ebb27bf20870b944fe9cca1e029f2499957245 100644 --- a/docs/deployment/tensorrt_custom_ops.md +++ b/docs/en/deployment/tensorrt_custom_ops.md @@ -102,7 +102,7 @@ detectors. #### Description -ScatterND takes three inputs `data` tensor of rank r >= 1, `indices` tensor of rank q >= 1, and `updates` tensor of rank q + r - indices.shape[-1] - 1. The output of the operation is produced by creating a copy of the input `data`, and then updating its value to values specified by updates at specific index positions specified by `indices`. Its output shape is the same as the shape of `data`. Note that `indices` should not have duplicate entries. That is, two or more updates for the same index-location is not supported. +ScatterND takes three inputs `data` tensor of rank r >= 1, `indices` tensor of rank q >= 1, and `updates` tensor of rank q + r - indices.shape\[-1\] - 1. The output of the operation is produced by creating a copy of the input `data`, and then updating its value to values specified by updates at specific index positions specified by `indices`. Its output shape is the same as the shape of `data`. Note that `indices` should not have duplicate entries. That is, two or more updates for the same index-location is not supported. 
The `output` is calculated via the following equation: @@ -151,9 +151,9 @@ Filter out boxes has high IoU overlap with previously selected boxes or low scor | Type | Parameter | Description | | ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `int` | `center_point_box` | 0 - the box data is supplied as [y1, x1, y2, x2], 1-the box data is supplied as [x_center, y_center, width, height]. | +| `int` | `center_point_box` | 0 - the box data is supplied as \[y1, x1, y2, x2\], 1-the box data is supplied as \[x_center, y_center, width, height\]. | | `int` | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. | -| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | | `float` | `score_threshold` | The threshold for deciding when to remove boxes based on score. | | `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | diff --git a/docs/deployment/tensorrt_plugin.md b/docs/en/deployment/tensorrt_plugin.md similarity index 72% rename from docs/deployment/tensorrt_plugin.md rename to docs/en/deployment/tensorrt_plugin.md index cd8924e33e5183516dcc86d5dc5b2fd786a54f87..de7809b6aac64c126ec8b8cfd3291d65053f60e5 100644 --- a/docs/deployment/tensorrt_plugin.md +++ b/docs/en/deployment/tensorrt_plugin.md @@ -1,8 +1,14 @@ -## TensorRT Plugins for custom operators in MMCV (Experimental) +## TensorRT Deployment + +### DeprecationWarning + +TensorRT support will be deprecated in the future. 
+Welcome to use the unified model deployment toolbox MMDeploy: https://github.com/open-mmlab/mmdeploy -- [TensorRT Plugins for custom operators in MMCV (Experimental)](#tensorrt-plugins-for-custom-operators-in-mmcv-experimental) +- [TensorRT Deployment](#tensorrt-deployment) + - [DeprecationWarning](#deprecationwarning) - [Introduction](#introduction) - [List of TensorRT plugins supported in MMCV](#list-of-tensorrt-plugins-supported-in-mmcv) - [How to build TensorRT plugins in MMCV](#how-to-build-tensorrt-plugins-in-mmcv) @@ -24,17 +30,17 @@ To ease the deployment of trained models with custom operators from `mmcv.ops` u ### List of TensorRT plugins supported in MMCV -| ONNX Operator | TensorRT Plugin | MMCV Releases | -| :-----------------------: | :-----------------------------------------------------------------------------: | :-----------: | -| MMCVRoiAlign | [MMCVRoiAlign](./tensorrt_custom_ops.md#mmcvroialign) | 1.2.6 | -| ScatterND | [ScatterND](./tensorrt_custom_ops.md#scatternd) | 1.2.6 | -| NonMaxSuppression | [NonMaxSuppression](./tensorrt_custom_ops.md#nonmaxsuppression) | 1.3.0 | -| MMCVDeformConv2d | [MMCVDeformConv2d](./tensorrt_custom_ops.md#mmcvdeformconv2d) | 1.3.0 | -| grid_sampler | [grid_sampler](./tensorrt_custom_ops.md#grid-sampler) | 1.3.1 | -| cummax | [cummax](./tensorrt_custom_ops.md#cummax) | 1.3.5 | -| cummin | [cummin](./tensorrt_custom_ops.md#cummin) | 1.3.5 | +| ONNX Operator | TensorRT Plugin | MMCV Releases | +| :------------------------ | :------------------------------------------------------------------------------ | :-----------: | +| MMCVRoiAlign | [MMCVRoiAlign](./tensorrt_custom_ops.md#mmcvroialign) | 1.2.6 | +| ScatterND | [ScatterND](./tensorrt_custom_ops.md#scatternd) | 1.2.6 | +| NonMaxSuppression | [NonMaxSuppression](./tensorrt_custom_ops.md#nonmaxsuppression) | 1.3.0 | +| MMCVDeformConv2d | [MMCVDeformConv2d](./tensorrt_custom_ops.md#mmcvdeformconv2d) | 1.3.0 | +| grid_sampler | [grid_sampler](./tensorrt_custom_ops.md#grid-sampler) | 1.3.1 | +| cummax | [cummax](./tensorrt_custom_ops.md#cummax) | 1.3.5 | +| cummin | [cummin](./tensorrt_custom_ops.md#cummin) | 1.3.5 | | MMCVInstanceNormalization | [MMCVInstanceNormalization](./tensorrt_custom_ops.md#mmcvinstancenormalization) | 1.3.5 | -| MMCVModulatedDeformConv2d | [MMCVModulatedDeformConv2d](./tensorrt_custom_ops.md#mmcvmodulateddeformconv2d) | master | +| MMCVModulatedDeformConv2d | [MMCVModulatedDeformConv2d](./tensorrt_custom_ops.md#mmcvmodulateddeformconv2d) | 1.3.8 | Notes @@ -75,6 +81,10 @@ pip install $TENSORRT_DIR/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl For more detailed information of installing TensorRT using tar, please refer to [Nvidia' website](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-721/install-guide/index.html#installing-tar). +- Install cuDNN + +Install cuDNN 8 following [Nvidia' website](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar). + #### Build on Linux ```bash @@ -142,27 +152,32 @@ Below are the main steps: **Take RoIAlign plugin `roi_align` for example.** 1. Add header `trt_roi_align.hpp` to TensorRT include directory `mmcv/ops/csrc/tensorrt/` + 2. Add source `trt_roi_align.cpp` to TensorRT source directory `mmcv/ops/csrc/tensorrt/plugins/` + 3. Add cuda kernel `trt_roi_align_kernel.cu` to TensorRT source directory `mmcv/ops/csrc/tensorrt/plugins/` + 4. 
Register `roi_align` plugin in [trt_plugin.cpp](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp) - ```c++ - #include "trt_plugin.hpp" + ```c++ + #include "trt_plugin.hpp" - #include "trt_roi_align.hpp" + #include "trt_roi_align.hpp" - REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); - extern "C" { - bool initLibMMCVInferPlugins() { return true; } - } // extern "C" - ``` + extern "C" { + bool initLibMMCVInferPlugins() { return true; } + } // extern "C" + ``` 5. Add unit test into `tests/test_ops/test_tensorrt.py` Check [here](https://github.com/open-mmlab/mmcv/blob/master/tests/test_ops/test_tensorrt.py) for examples. #### Reminders +- *Please note that this feature is experimental and may change in the future. Strongly suggest users always try with the latest master branch.* + - Some of the [custom ops](https://mmcv.readthedocs.io/en/latest/ops.html) in `mmcv` have their own CUDA implementations, which can be referred to. ### Known Issues diff --git a/docs/en/faq.md b/docs/en/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..02d31c233a9ff66d5e8f3f288b5d5f64e5c5298c --- /dev/null +++ b/docs/en/faq.md @@ -0,0 +1,93 @@ +## Frequently Asked Questions + +We list some common troubles faced by many users and their corresponding solutions here. +Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. + +### Installation + +- KeyError: "xxx: 'yyy is not in the zzz registry'" + + The registry mechanism will be triggered only when the file of the module is imported. + So you need to import that file somewhere. More details can be found at [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974). + +- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" + + 1. Uninstall existing mmcv in the environment using `pip uninstall mmcv` + 2. Install mmcv-full following the [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) or [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) + +- "invalid device function" or "no kernel image is available for execution" + + 1. Check the CUDA compute capability of your GPU + 2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture. You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. The compatibility issue could happen when using old GPUs, e.g., Tesla K80 (3.7) on colab. + 3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. For example, you may compile mmcv using CUDA 10.0 but run it in CUDA 9.0 environments + +- "undefined symbol" or "cannot open xxx.so" + + 1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check + whether the CUDA/GCC runtimes are the same as those used for compiling mmcv + 2. If those symbols are PyTorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the PyTorch version is the same as that used for compiling mmcv + 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment + +- "RuntimeError: CUDA error: invalid configuration argument" + + This error may be caused by the poor performance of GPU. 
Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) + and recompile mmcv. + +- "RuntimeError: nms is not compiled with GPU support" + + This error is because your CUDA environment is not installed correctly. + You may try to re-install your CUDA environment and then delete the build/ folder before re-compiling mmcv. + +- "Segmentation fault" + + 1. Check your GCC version and use GCC >= 5.4. This is usually caused by the incompatibility between PyTorch and the environment (e.g., GCC \< 4.9 for PyTorch). We also recommend users avoid using GCC 5.5 because many users report that GCC 5.5 will cause "segmentation fault" and simply changing it to GCC 5.4 could solve the problem + 2. Check whether PyTorch is correctly installed and can use CUDA ops, e.g. type the following command in your terminal and see whether it correctly outputs results + ```shell + python -c 'import torch; print(torch.cuda.is_available())' + ``` + 3. If PyTorch is correctly installed, check whether MMCV is correctly installed. If MMCV is correctly installed, then there will be no issue with the command + ```shell + python -c 'import mmcv; import mmcv.ops' + ``` + 4. If MMCV and PyTorch are correctly installed, you can use `ipdb` to set breakpoints or directly add `print` to debug and see which part leads to the `segmentation fault` + +- "libtorch_cuda_cu.so: cannot open shared object file" + + `mmcv-full` depends on the shared object but it cannot be found. We can check whether the object exists in `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` or try to re-install PyTorch. + +- "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" + + If you are building mmcv-full on Windows and the version of CUDA is 9.2, you will probably encounter the error `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`, in which case you can use a lower version of Microsoft Visual Studio like vs2017. + +- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" + + If your version of PyTorch is 1.5.0 and you are building mmcv-full on Windows, you will probably encounter the error `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`. The way to solve the error is to replace all the `static constexpr bool all_slots = false;` with `static bool all_slots = false;` at this file `https://github.com/pytorch/pytorch/blob/v1.5.0/torch/csrc/jit/api/module.h`. More details can be found at [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394). + +- "error: a member with an in-class initializer must be const" + + If your version of PyTorch is 1.6.0 and you are building mmcv-full on Windows, you will probably encounter the error `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. The way to solve the error is to replace all the `CONSTEXPR_EXCEPT_WIN_CUDA ` with `const` at `torch/include\torch/csrc/jit/api/module.h`. More details can be found at [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575). 
+ +- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" + + If your version of PyTorch is 1.7.0 and you are building mmcv-full on Windows, you will probably encounter the error `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. The way to solve the error is to modify several local files of PyTorch: + + - delete `static constexpr Symbol Kind = ::c10::prim::profile;` and `static constexpr Symbol Kind = ::c10::prim::profile_optional;` at `torch/include\torch/csrc/jit/ir/ir.h` + - replace `explicit operator type&() { return *(this->value); }` with `explicit operator type&() { return *((type*)this->value); }` at `torch\include\pybind11\cast.h` + - replace all the `CONSTEXPR_EXCEPT_WIN_CUDA` with `const` at `torch/include\torch/csrc/jit/api/module.h` + + More details can be found at [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956). + +- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer" + + Please install the correct version of MMCV for the version of your MMDetection following the [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation). + +### Usage + +- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" + + 1. This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode. More details at [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582). + 2. You can set ` find_unused_parameters = True` in the config to solve the above problems or find those unused parameters manually + +- "RuntimeError: Trying to backward through the graph a second time" + + `GradientCumulativeOptimizerHook` and `OptimizerHook` are both set, which causes `loss.backward()` to be called twice, so a `RuntimeError` is raised. We can only use one of them. More details at [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379). diff --git a/docs/get_started/build.md b/docs/en/get_started/build.md similarity index 61% rename from docs/get_started/build.md rename to docs/en/get_started/build.md index 758a83a4fb84398c9e192df37f7778a736109813..d987c1e17e2e91f232cb733ac7bc1f425dba27a8 100644 --- a/docs/get_started/build.md +++ b/docs/en/get_started/build.md @@ -9,6 +9,12 @@ git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` +It is recommended to install `ninja` to speed up the compilation + +```bash +pip install -r requirements/optional.txt +``` + You can either - install the lite version @@ -40,6 +46,7 @@ If you would like to use `opencv-python-headless` instead of `opencv-python`, e.g., in a minimum container environment or servers without GUI, you can first install it before installing MMCV to skip the installation of `opencv-python`. ``` + ### Build on Windows Building MMCV on Windows is a bit more complicated than that on Linux. @@ -68,35 +75,41 @@ You should know how to set up environment variables, especially `Path`, on Windo 1. Launch Anaconda prompt from Windows Start menu - Do not use raw `cmd.exe` s instruction is based on PowerShell syntax. + Do not use raw `cmd.exe`, as the instructions are based on PowerShell syntax. -1. Create a new conda environment +2. 
Create a new conda environment - ```shell - conda create --name mmcv python=3.7 # 3.6, 3.7, 3.8 should work too as tested - conda activate mmcv # make sure to activate environment before any operation - ``` + ```shell + conda create --name mmcv python=3.7 # 3.6, 3.7, 3.8 should work too as tested + conda activate mmcv # make sure to activate environment before any operation + ``` -1. Install PyTorch. Choose a version based on your need. +3. Install PyTorch. Choose a version based on your need. - ```shell - conda install pytorch torchvision cudatoolkit=10.2 -c pytorch - ``` + ```shell + conda install pytorch torchvision cudatoolkit=10.2 -c pytorch + ``` - We only tested PyTorch version >= 1.6.0. + We only tested PyTorch version >= 1.6.0. -1. Prepare MMCV source code +4. Prepare MMCV source code - ```shell - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` + ```shell + git clone https://github.com/open-mmlab/mmcv.git + cd mmcv + ``` -1. Install required Python packages +5. Install required Python packages - ```shell - pip3 install -r requirements.txt - ``` + ```shell + pip3 install -r requirements/runtime.txt + ``` + +6. It is recommended to install `ninja` to speed up the compilation + + ```bash + pip install -r requirements/optional.txt + ``` #### Build and install MMCV @@ -106,11 +119,11 @@ MMCV can be built in three ways: In this way, no custom ops are compiled and mmcv is a pure python package. -1. Full version (CPU ops) +2. Full version (CPU ops) Module `ops` will be compiled as a pytorch extension, but only x86 code will be compiled. The compiled ops can be executed on CPU only. -1. Full version (CUDA ops) +3. Full version (CUDA ops) Both x86 and CUDA codes of `ops` module will be compiled. The compiled version can be run on both CPU and CUDA-enabled GPU (if implemented). @@ -118,19 +131,19 @@ MMCV can be built in three ways: 1. Set up MSVC compiler - Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below. + Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below. - ```none - (base) PS C:\Users\xxx> cl - Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 - Copyright (C) Microsoft Corporation. All rights reserved. + ```none + (base) PS C:\Users\xxx> cl + Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 + Copyright (C) Microsoft Corporation. All rights reserved. - usage: cl [ option... ] filename... [ / link linkoption... ] - ``` + usage: cl [ option... ] filename... [ / link linkoption... ] + ``` - For compatibility, we use the x86-hosted and x64-targeted compiler. note `Hostx86\x64` in the path. + For compatibility, we use the x86-hosted and x64-targeted compiler. note `Hostx86\x64` in the path. - You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English. + You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. 
Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English. ##### Option 1: Build MMCV (lite version) @@ -150,32 +163,34 @@ pip list ##### Option 2: Build MMCV (full version with CPU) 1. Finish above common steps -1. Set up environment variables - - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # based on your available number of CPU cores and amount of memory - ``` - -1. Following build steps of the lite version - - ```shell - # activate environment - conda activate mmcv - # change directory - cd mmcv - # build - python setup.py build_ext # if success, cl will be launched to compile ops - # install - python setup.py develop - # check - pip list - ``` + +2. Set up environment variables + + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # based on your available number of CPU cores and amount of memory + ``` + +3. Following build steps of the lite version + + ```shell + # activate environment + conda activate mmcv + # change directory + cd mmcv + # build + python setup.py build_ext # if success, cl will be launched to compile ops + # install + python setup.py develop + # check + pip list + ``` ##### Option 3: Build MMCV (full version with CUDA) 1. Finish above common steps -1. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below: + +2. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below: ```none (base) PS C:\Users\WRH> ls env: @@ -197,7 +212,7 @@ pip list $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # if CUDA_PATH_V10_2 is in envs: ``` -1. Set CUDA target arch +3. Set CUDA target arch ```shell # Suppose you are using GTX 1080, which is of capability 6.1 @@ -210,7 +225,7 @@ pip list Check your the compute capability of your GPU from [here](https://developer.nvidia.com/cuda-gpus). ``` -1. Launch compiling the same way as CPU +4. Launch compiling the same way as CPU ```shell $env:MMCV_WITH_OPS = 1 @@ -232,3 +247,23 @@ If you are compiling against PyTorch 1.6.0, you might meet some errors from PyTo ``` If you meet issues when running or compiling mmcv, we list some common issues in [Frequently Asked Question](../faq.html). + +## \[Optional\] Build MMCV on IPU machine + +Firstly, you need to apply for an IPU cloud machine, see [here](https://www.graphcore.ai/ipus-in-the-cloud). + +### Option 1: Docker + +1. Pull docker + +```shell + docker pull graphcore/pytorch +``` + +2. Build MMCV under same python environment + +### Option 2: Install from SDK + +1. Build MMCV + +2. Use pip to install sdk according to [IPU PyTorch document](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html). Also, you need to apply for machine and sdk to Graphcore. diff --git a/docs/get_started/installation.md b/docs/en/get_started/installation.md similarity index 75% rename from docs/get_started/installation.md rename to docs/en/get_started/installation.md index 0c64ea825cad548f21c2b41a9538f9447b7431b8..d9fd1b33607684b5c2c39fdc4d86635e0e41e263 100644 --- a/docs/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -3,7 +3,7 @@ There are two versions of MMCV: - **mmcv-full**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. -- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv<1.0.0. It is useful when you do not need those CUDA ops. +- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. 
It is useful when you do not need those CUDA ops. ```{warning} Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is avaliable`. @@ -13,36 +13,36 @@ a. Install the full version. Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). -We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building. In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. +We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building for **Linux and Windows systems**. In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. i. Install the latest version. -The rule for installing the latest ``mmcv-full`` is as follows: +The rule for installing the latest `mmcv-full` is as follows: ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -Please replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired one. For example, -to install the latest ``mmcv-full`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: +Please replace `{cu_version}` and `{torch_version}` in the url to your desired one. For example, +to install the latest `mmcv-full` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html ``` -For more details, please refer the the following tables and delete ``=={mmcv_version}``. +For more details, please refer the the following tables and delete `=={mmcv_version}`. ii. Install a specified version. -The rule for installing a specified ``mmcv-full`` is as follows: +The rule for installing a specified `mmcv-full` is as follows: ```shell pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -First of all, please refer to the Releases and replace ``{mmcv_version}`` a specified one. e.g. ``1.3.9``. -Then replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired versions. For example, -to install ``mmcv-full==1.3.9`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: +First of all, please refer to the Releases and replace `{mmcv_version}` a specified one. e.g. `1.3.9`. +Then replace `{cu_version}` and `{torch_version}` in the url to your desired versions. For example, +to install `mmcv-full==1.3.9` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: ```shell pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -64,16 +64,28 @@ For more details, please refer the the following tables. CUDA - torch 1.10 - torch 1.9 - torch 1.8 - torch 1.7 - torch 1.6 - torch 1.5 + torch 1.11 + torch 1.10 + torch 1.9 + torch 1.8 + torch 1.7 + torch 1.6 + torch 1.5 + + + 11.5 +
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
+ + + + + + 11.3 -
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
@@ -82,6 +94,7 @@ For more details, please refer the the following tables. 11.1 +
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
@@ -94,13 +107,15 @@ For more details, please refer the the following tables. +
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2 -
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
@@ -111,6 +126,7 @@ For more details, please refer the the following tables. 10.1 +
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
@@ -121,12 +137,14 @@ For more details, please refer the the following tables. +
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu +
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
@@ -138,7 +156,11 @@ For more details, please refer the the following tables. ```{note} -The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, if you click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html), you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./docs/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. +The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, if you click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html), you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. +``` + +```{note} +mmcv-full does not provide pre-built packages for `cu102-torch1.11` and `cu92-torch*` on Windows. ``` Another way is to compile locally by running diff --git a/docs/get_started/introduction.md b/docs/en/get_started/introduction.md similarity index 62% rename from docs/get_started/introduction.md rename to docs/en/get_started/introduction.md index 4ffb59d2d57cd24c23dd5d9fb0558ab1d66a06a8..9ef6ee99dc400267b1fb465be689e7831a9ca858 100644 --- a/docs/get_started/introduction.md +++ b/docs/en/get_started/introduction.md @@ -3,16 +3,24 @@ MMCV is a foundational library for computer vision research and supports many research projects as below: +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. 
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition and understanding toolbox. - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. It provides the following functionalities. @@ -24,6 +32,12 @@ It provides the following functionalities. - Various CNN architectures - High-quality implementation of common CUDA ops +It supports the following systems. + +- Linux +- Windows +- macOS + ```{note} MMCV requires Python 3.6+. ``` diff --git a/docs/get_started/previous_versions.md b/docs/en/get_started/previous_versions.md similarity index 93% rename from docs/get_started/previous_versions.md rename to docs/en/get_started/previous_versions.md index c91180d2203dc5cf21c4dccbc4b4e20891879795..a9c3717667fec3e8f338c319413aa6ad639dc6d3 100644 --- a/docs/get_started/previous_versions.md +++ b/docs/en/get_started/previous_versions.md @@ -4,7 +4,7 @@ We no longer provide `mmcv-full` packages compiled under lower versions of `PyTo ### PyTorch 1.4 -| 1.0.0 <= mmcv_version <= 1.2.1 +| 1.0.0 \<= mmcv_version \<= 1.2.1 #### CUDA 10.1 @@ -26,7 +26,7 @@ pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dis ### PyTorch v1.3 -| 1.0.0 <= mmcv_version <= 1.3.16 +| 1.0.0 \<= mmcv_version \<= 1.3.16 #### CUDA 10.1 diff --git a/docs/index.rst b/docs/en/index.rst similarity index 100% rename from docs/index.rst rename to docs/en/index.rst index 6019f107a842107f5e38989df313ca7cc7fe9f9c..bccbc372976a491dabbe90d8c519ec8d5f00850a 100644 --- a/docs/index.rst +++ b/docs/en/index.rst @@ -29,12 +29,12 @@ You can switch between Chinese and English documents in the lower-left corner of :maxdepth: 2 :caption: Deployment + deployment/mmcv_ops_definition.md deployment/onnx.md - deployment/onnxruntime_op.md deployment/onnxruntime_custom_ops.md - deployment/tensorrt_plugin.md + deployment/onnxruntime_op.md deployment/tensorrt_custom_ops.md - deployment/mmcv_ops_definition.md + deployment/tensorrt_plugin.md .. toctree:: :maxdepth: 2 diff --git a/docs/make.bat b/docs/en/make.bat similarity index 100% rename from docs/make.bat rename to docs/en/make.bat diff --git a/docs/mmcv-logo.png b/docs/en/mmcv-logo.png similarity index 100% rename from docs/mmcv-logo.png rename to docs/en/mmcv-logo.png diff --git a/docs/en/understand_mmcv/cnn.md b/docs/en/understand_mmcv/cnn.md new file mode 100644 index 0000000000000000000000000000000000000000..0c401c6b609f093e6bf854c9abdbe78a13b04ac1 --- /dev/null +++ b/docs/en/understand_mmcv/cnn.md @@ -0,0 +1,583 @@ +## CNN + +We provide some building bricks for CNNs, including layer building, module bundles and weight initialization. + +### Layer building + +We may need to try different layers of the same type when running experiments, +but do not want to modify the code from time to time. 
+Here we provide some layer building methods to construct layers from a dict, +which can be written in configs or specified via command line arguments. + +#### Usage + +A simplest example is + +```python +cfg = dict(type='Conv3d') +layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) +``` + +- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d). +- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d). +- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU. +- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle. +- `build_padding_layer`: Supported types are zero, reflect, replicate. + +#### Extension + +We also allow extending the building methods with custom layers and operators. + +1. Write and register your own module. + + ```python + from mmcv.cnn import UPSAMPLE_LAYERS + + @UPSAMPLE_LAYERS.register_module() + class MyUpsample: + + def __init__(self, scale_factor): + pass + + def forward(self, x): + pass + ``` + +2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it. + + ```python + cfg = dict(type='MyUpsample', scale_factor=2) + layer = build_upsample_layer(cfg) + ``` + +### Module bundles + +We also provide common module bundles to facilitate the network construction. +`ConvModule` is a bundle of convolution, normalization and activation layers, +please refer to the [api](api.html#mmcv.cnn.ConvModule) for details. + +```python +# conv + bn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) +# conv + gn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) +# conv + relu +conv = ConvModule(3, 8, 2) +# conv +conv = ConvModule(3, 8, 2, act_cfg=None) +# conv + leaky relu +conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) +# bn + conv + relu +conv = ConvModule( + 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) +``` + +### Weight initialization + +> Implementation details are available at [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py) + +During training, a proper initialization strategy is beneficial to speed up the +training or obtain a higher performance. In MMCV, we provide some commonly used +methods for initializing modules like `nn.Conv2d`. Of course, we also provide +high-level APIs for initializing models containing one or more +modules. + +#### Initialization functions + +Initialize a `nn.Module` such as `nn.Conv2d`, `nn.Linear` in a functional way. + +We provide the following initialization methods. + +- constant_init + + Initialize module parameters with constant values. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import constant_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # constant_init(module, val, bias=0) + >>> constant_init(conv1, 1, 0) + >>> conv1.weight + ``` + +- xavier_init + + Initialize module parameters with values according to the method + described in [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. 
(2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # xavier_init(module, gain=1, bias=0, distribution='normal') + >>> xavier_init(conv1, distribution='normal') + ``` + +- normal_init + + Initialize module parameters with the values drawn from a normal distribution. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import normal_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # normal_init(module, mean=0, std=1, bias=0) + >>> normal_init(conv1, std=0.01, bias=0) + ``` + +- uniform_init + + Initialize module parameters with values drawn from a uniform distribution. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import uniform_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # uniform_init(module, a=0, b=1, bias=0) + >>> uniform_init(conv1, a=0, b=1) + ``` + +- kaiming_init + + Initialize module parameters with the values according to the method + described in [Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. (2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import kaiming_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') + >>> kaiming_init(conv1) + ``` + +- caffe2_xavier_init + + The xavier initialization is implemented in caffe2, which corresponds to `kaiming_uniform_` in PyTorch. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import caffe2_xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # caffe2_xavier_init(module, bias=0) + >>> caffe2_xavier_init(conv1) + ``` + +- bias_init_with_prob + + Initialize conv/fc bias value according to a given probability, as proposed in [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf). + + ```python + >>> from mmcv.cnn import bias_init_with_prob + >>> # bias_init_with_prob is proposed in Focal Loss + >>> bias = bias_init_with_prob(0.01) + >>> bias + -4.59511985013459 + ``` + +#### Initializers and configs + +On the basis of the initialization methods, we define the corresponding initialization classes and register them to `INITIALIZERS`, so we can +use the configuration to initialize the model. + +We provide the following initialization classes. + +- ConstantInit +- XavierInit +- NormalInit +- UniformInit +- KaimingInit +- Caffe2XavierInit +- PretrainedInit + +Let us introduce the usage of `initialize` in detail. + +1. Initialize model by `layer` key + + If we only define `layer`, it just initialize the layer in `layer` key. + + NOTE: Value of `layer` key is the class name with attributes weights and bias of Pytorch, so `MultiheadAttention layer` is not supported. + +- Define `layer` key for initializing module with same configuration. 
+ + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1, 2) + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # initialize whole module with same configuration + initialize(model, init_cfg) + # model.feat.weight + # Parameter containing: + # tensor([[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]], requires_grad=True) + ``` + +- Define `layer` key for initializing layer with different configurations. + + ```python + import torch.nn as nn + from mmcv.cnn.utils import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1,2) + + model = FooNet() + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d will be initialized with dict(type='Constant', val=1) + # nn.Conv2d will be initialized with dict(type='Constant', val=2) + # nn.Linear will be initialized with dict(type='Constant', val=3) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +2. Initialize model by `override` key + +- When initializing some specific part with its attribute name, we can use `override` key, and the value in `override` will ignore the value in init_cfg. + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + + # if we would like to initialize model's weights as 1 and bias as 2 + # but weight in `reg` as 3 and bias 4, we can use override key + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # self.feat and self.cls will be initialized with dict(type='Constant', val=1, bias=2) + # The module called 'reg' will be initialized with dict(type='Constant', val=3, bias=4) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]], + # ..., + # [[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]]]], requires_grad=True) + ``` + +- If `layer` is None in init_cfg, only sub-module with the name in override will be initialized, and type and other args in override can be omitted. + + ```python + model = FooNet() + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + # self.feat and self.cls will be initialized by Pytorch + # The module called 'reg' will be initialized with dict(type='Constant', val=1, bias=2) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # ..., + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]]], requires_grad=True) + ``` + +- If we don't define `layer` key or `override` key, it will not initialize anything. 
+ +- Invalid usage + + ```python + # It is invalid that override don't have name key + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # It is also invalid that override has name and other args except type + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. Initialize model with the pretrained model + + ```python + import torch.nn as nn + import torchvision.models as models + from mmcv.cnn import initialize + + # initialize model with pretrained model + model = models.resnet50() + # model.conv1.weight + # Parameter containing: + # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, + # -1.8077e-03, 3.0338e-03], + # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, + # 1.1655e-02, 4.5889e-03], + # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, + # 1.7314e-02, -9.9998e-03], + # ..., + + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + initialize(model, init_cfg) + # model.conv1.weight + # Parameter containing: + # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, + # -4.3034e-02, -7.0755e-02], + # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, + # -2.0912e-02, -3.8517e-02], + # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, + # 6.2641e-02, 5.1977e-02], + # ..., + + # initialize weights of a sub-module with the specific part of a pretrained model by using 'prefix' + model = models.resnet50() + url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + 'retinanet_r50_fpn_1x_coco/'\ + 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + initialize(model, init_cfg) + ``` + +4. Initialize model inherited from BaseModule, Sequential, ModuleList, ModuleDict + + `BaseModule` is inherited from `torch.nn.Module`, and the only different between them is that `BaseModule` implements `init_weights()`. + + `Sequential` is inherited from `BaseModule` and `torch.nn.Sequential`. + + `ModuleList` is inherited from `BaseModule` and `torch.nn.ModuleList`. + + `ModuleDict` is inherited from `BaseModule` and `torch.nn.ModuleDict`. + + ```python + import torch.nn as nn + from mmcv.runner import BaseModule, Sequential, ModuleList, ModuleDict + + class FooConv1d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv1d = nn.Conv1d(4, 1, 4) + + def forward(self, x): + return self.conv1d(x) + + class FooConv2d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv2d = nn.Conv2d(3, 1, 3) + + def forward(self, x): + return self.conv2d(x) + + # BaseModule + init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + model = FooConv1d(init_cfg) + model.init_weights() + # model.conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + + # Sequential + init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) 
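+   # Each submodule keeps the init_cfg passed to it; calling init_weights() on the
+   # wrapping Sequential below applies them (Conv1d: weight 0, bias 1; Conv2d: weight 2, bias 3).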
+ model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + seq_model = Sequential(model1, model2) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + seq_model = Sequential(model1, model2, init_cfg=init_cfg) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleList + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modellist = ModuleList([model1, model2]) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + modellist = ModuleList([model1, model2], init_cfg=init_cfg) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleDict + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modeldict = ModuleDict(dict(model1=model1, model2=model2)) + modeldict.init_weights() + # modeldict['model1'].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modeldict['model2'].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
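+   # The outer init_cfg (val=4., bias=5.) is ignored here because the inner init_cfg
+   # passed to each submodule has higher priority, as the printed weights below show.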
+   modeldict = ModuleDict(dict(model1=model1, model2=model2), init_cfg=init_cfg)
+   modeldict.init_weights()
+   # modeldict['model1'].conv1d.weight
+   # Parameter containing:
+   # tensor([[[0., 0., 0., 0.],
+   #          [0., 0., 0., 0.],
+   #          [0., 0., 0., 0.],
+   #          [0., 0., 0., 0.]]], requires_grad=True)
+   # modeldict['model2'].conv2d.weight
+   # Parameter containing:
+   # tensor([[[[2., 2., 2.],
+   #           [2., 2., 2.],
+   #           [2., 2., 2.]],
+   #          ...,
+   #          [[2., 2., 2.],
+   #           [2., 2., 2.],
+   #           [2., 2., 2.]]]], requires_grad=True)
+   ```
+
+### Model Zoo
+
+Besides torchvision pre-trained models, we also provide pre-trained models of the following CNNs:
+
+- VGG Caffe
+- ResNet Caffe
+- ResNeXt
+- ResNet with Group Normalization
+- ResNet with Group Normalization and Weight Standardization
+- HRNetV2
+- Res2Net
+- RegNet
+
+#### Model URLs in JSON
+
+The model zoo links in MMCV are managed by JSON files.
+The JSON file consists of key-value pairs of model names and their URLs or paths.
+An example JSON file could look like:
+
+```json
+{
+    "model_a": "https://example.com/models/model_a_9e5bac.pth",
+    "model_b": "pretrain/model_b_ab3ef2c.pth"
+}
+```
+
+The default links of the pre-trained models hosted on OpenMMLab AWS can be found [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json).
+
+You may override the default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not found in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path.
+
+The external JSON files will be merged into the default one. If the same key is present in both the external and the default JSON files, the external one will be used.
+
+#### Load Checkpoint
+
+The following types are supported for the `filename` argument of `mmcv.load_checkpoint()`.
+
+- filepath: The filepath of the checkpoint.
+- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename.
+- `torchvision://xxx`: The model links in `torchvision.models`. Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details.
+- `open-mmlab://xxx`: The model links or filepaths provided in the default and additional JSON files.
diff --git a/docs/understand_mmcv/config.md b/docs/en/understand_mmcv/config.md
similarity index 95%
rename from docs/understand_mmcv/config.md
rename to docs/en/understand_mmcv/config.md
index d0b669b8516c0281000a88c1bd41aac731dc8326..9626dbe2c331273995e6e2fbf095461b171101bd 100644
--- a/docs/understand_mmcv/config.md
+++ b/docs/en/understand_mmcv/config.md
@@ -196,5 +196,5 @@ _deprecation_ = dict(
 ```python
 >>> cfg = Config.fromfile('./deprecated_cfg.py')
-UserWarning: The config file deprecated.py will be deprecated in the future. Please use expected_cfg.py instead. More information can be found at https://github.com/open-mmlab/mmcv/pull/1275
+UserWarning: The config file deprecated_cfg.py will be deprecated in the future. Please use expected_cfg.py instead.
More information can be found at https://github.com/open-mmlab/mmcv/pull/1275 ``` diff --git a/docs/understand_mmcv/data_process.md b/docs/en/understand_mmcv/data_process.md similarity index 99% rename from docs/understand_mmcv/data_process.md rename to docs/en/understand_mmcv/data_process.md index 79e9281b6c88c907e6edfc6d03f73930b2cd51ef..94a4c5431fe6237220cf2d99af1894dd06961d1e 100644 --- a/docs/understand_mmcv/data_process.md +++ b/docs/en/understand_mmcv/data_process.md @@ -232,7 +232,7 @@ mmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2) - IO - Visualization -- Flow warpping +- Flow warping We provide two options to dump optical flow files: uncompressed and compressed. The uncompressed way just dumps the floating numbers to a binary file. It is diff --git a/docs/understand_mmcv/io.md b/docs/en/understand_mmcv/io.md similarity index 98% rename from docs/understand_mmcv/io.md rename to docs/en/understand_mmcv/io.md index f6c28dd425cb0bcc54ca5d92a3a3849103f47e2a..64fbc8b8e60841f8de74235e17a6b42566cf912d 100644 --- a/docs/understand_mmcv/io.md +++ b/docs/en/understand_mmcv/io.md @@ -195,8 +195,8 @@ disk_backend = HardDiskBackend() with io.BytesIO(disk_backend.get(filepath1)) as buffer: checkpoint = torch.load(buffer) with io.BytesIO() as buffer: - torch.save(checkpoint, f) - disk_backend.put(f.getvalue(), filepath2) + torch.save(checkpoint, buffer) + disk_backend.put(buffer.getvalue(), filepath2) ``` If we want to implement an interface which automatically select the corresponding diff --git a/docs/en/understand_mmcv/ops.md b/docs/en/understand_mmcv/ops.md new file mode 100644 index 0000000000000000000000000000000000000000..127f92bf959ab89725332a43ebe8630625c4c5ab --- /dev/null +++ b/docs/en/understand_mmcv/ops.md @@ -0,0 +1,60 @@ +## ops + +We implement common ops used in detection, segmentation, etc. 
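+
+As a quick usage sketch (illustrative only: the boxes and scores below are made-up values, and `mmcv-full` must be installed for `mmcv.ops` to be importable), an operator such as NMS can be called directly:
+
+```python
+import torch
+from mmcv.ops import nms
+
+boxes = torch.tensor([[10., 10., 50., 50.],
+                      [12., 12., 52., 52.],
+                      [100., 100., 140., 140.]])
+scores = torch.tensor([0.9, 0.8, 0.7])
+# suppress boxes whose overlap with a higher-scoring box exceeds the IoU threshold
+dets, keep_inds = nms(boxes, scores, iou_threshold=0.5)
+# dets holds the kept boxes with their scores appended; keep_inds are their indices
+```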
+ +| Device | CPU | CUDA | MLU | MPS | +| ---------------------------- | --- | ---- | --- | --- | +| ActiveRotatedFilter | √ | √ | | | +| AssignScoreWithK | | √ | | | +| BallQuery | | √ | | | +| BBoxOverlaps | | √ | √ | √ | +| BorderAlign | | √ | | | +| BoxIouRotated | √ | √ | | | +| CARAFE | | √ | | | +| ChamferDistance | | √ | | | +| CrissCrossAttention | | √ | | | +| ContourExpand | √ | | | | +| ConvexIoU | | √ | | | +| CornerPool | | √ | | | +| Correlation | | √ | | | +| Deformable Convolution v1/v2 | √ | √ | | | +| Deformable RoIPool | | √ | | | +| DiffIoURotated | | √ | | | +| DynamicScatter | | √ | | | +| FurthestPointSample | | √ | | | +| FurthestPointSampleWithDist | | √ | | | +| FusedBiasLeakyrelu | | √ | | | +| GatherPoints | | √ | | | +| GroupPoints | | √ | | | +| Iou3d | | √ | | | +| KNN | | √ | | | +| MaskedConv | | √ | | | +| MergeCells | | √ | | | +| MinAreaPolygon | | √ | | | +| ModulatedDeformConv2d | √ | √ | | | +| MultiScaleDeformableAttn | | √ | | | +| NMS | √ | √ | √ | | +| NMSRotated | √ | √ | | | +| PixelGroup | √ | | | | +| PointsInBoxes | √ | √ | | | +| PointsInPolygons | | √ | | | +| PSAMask | √ | √ | √ | | +| RotatedFeatureAlign | √ | √ | | | +| RoIPointPool3d | | √ | | | +| RoIPool | | √ | √ | | +| RoIAlignRotated | √ | √ | √ | | +| RiRoIAlignRotated | | √ | | | +| RoIAlign | √ | √ | √ | | +| RoIAwarePool3d | | √ | | | +| SAConv2d | | √ | | | +| SigmoidFocalLoss | | √ | √ | | +| SoftmaxFocalLoss | | √ | | | +| SoftNMS | | √ | | | +| Sparse Convolution | | √ | | | +| Synchronized BatchNorm | | √ | | | +| ThreeInterpolate | | √ | | | +| ThreeNN | | √ | | | +| TINShift | | √ | √ | | +| UpFirDn2d | | √ | | | +| Voxelization | √ | √ | | | +| PrRoIPool | | √ | | | diff --git a/docs/understand_mmcv/registry.md b/docs/en/understand_mmcv/registry.md similarity index 74% rename from docs/understand_mmcv/registry.md rename to docs/en/understand_mmcv/registry.md index 2cf10819fea6ac81645cc127c6b7aea54af19d5f..824e0295a4cd16870002ce9098ad46ddc76adbb9 100644 --- a/docs/understand_mmcv/registry.md +++ b/docs/en/understand_mmcv/registry.md @@ -3,11 +3,15 @@ MMCV implements [registry](https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/registry.py) to manage different modules that share similar functionalities, e.g., backbones, head, and necks, in detectors. Most projects in OpenMMLab use registry to manage modules of datasets and models, such as [MMDetection](https://github.com/open-mmlab/mmdetection), [MMDetection3D](https://github.com/open-mmlab/mmdetection3d), [MMClassification](https://github.com/open-mmlab/mmclassification), [MMEditing](https://github.com/open-mmlab/mmediting), etc. +```{note} +In v1.5.1 and later, the Registry supports registering functions and calling them. +``` + ### What is registry -In MMCV, registry can be regarded as a mapping that maps a class to a string. -These classes contained by a single registry usually have similar APIs but implement different algorithms or support different datasets. -With the registry, users can find and instantiate the class through its corresponding string, and use the instantiated module as they want. +In MMCV, registry can be regarded as a mapping that maps a class or function to a string. +These classes or functions contained by a single registry usually have similar APIs but implement different algorithms or support different datasets. 
+With the registry, users can find the class or function through its corresponding string, and instantiate the corresponding module or call the function to obtain the result as needed.
 One typical example is the config systems in most OpenMMLab projects, which use the registry to create hooks, runners, models, and datasets, through configs.
 The API reference could be found [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.Registry).
@@ -17,7 +21,7 @@ To manage your modules in the codebase by `Registry`, there are three steps as b
 2. Create a registry.
 3. Use this registry to manage the modules.
-`build_func` argument of `Registry` is to customize how to instantiate the class instance, the default one is `build_from_cfg` implemented [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg).
+The `build_func` argument of `Registry` customizes how the class instance is instantiated or how the function is called to obtain the result; the default one is `build_from_cfg`, implemented [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg).
 ### A Simple Example
@@ -31,10 +35,10 @@ In the package, we first create a file to implement builders, named `converters/
 ```python
 from mmcv.utils import Registry
 # create a registry for converters
-CONVERTERS = Registry('converter')
+CONVERTERS = Registry('converters')
 ```
-Then we can implement different converters in the package. For example, implement `Converter1` in `converters/converter1.py`
+Then we can implement different converters in the package, which can be either classes or functions. For example, implement `Converter1` in `converters/converter1.py` and `converter2` in `converters/converter2.py`.
 ```python
@@ -48,18 +52,38 @@ class Converter1(object):
         self.b = b
 ```
+```python
+# converter2.py
+from .builder import CONVERTERS
+from .converter1 import Converter1
+
+# use the registry to manage the module
+@CONVERTERS.register_module()
+def converter2(a, b):
+    return Converter1(a, b)
+```
+
 The key step to use registry for managing the modules is to register the implemented module into the registry `CONVERTERS` through
-`@CONVERTERS.register_module()` when you are creating the module. By this way, a mapping between a string and the class is built and maintained by `CONVERTERS` as below
+`@CONVERTERS.register_module()` when you are creating the module. In this way, a mapping between a string and the class (function) is built and maintained by `CONVERTERS` as below
 ```python
 'Converter1' -> <class 'Converter1'>
+'converter2' -> <function converter2>
+```
+
+```{note}
+The registry mechanism will be triggered only when the file where the module is located is imported.
+So you need to import that file somewhere. More details can be found at https://github.com/open-mmlab/mmdetection/issues/5974.
 ```
 If the module is successfully registered, you can use this converter through configs as
 ```python
-converter_cfg = dict(type='Converter1', a=a_value, b=b_value)
-converter = CONVERTERS.build(converter_cfg)
+converter1_cfg = dict(type='Converter1', a=a_value, b=b_value)
+converter2_cfg = dict(type='converter2', a=a_value, b=b_value)
+converter1 = CONVERTERS.build(converter1_cfg)
+# returns the calling result
+result = CONVERTERS.build(converter2_cfg)
 ```
 ### Customize Build Function
@@ -88,7 +112,7 @@ CONVERTERS = Registry('converter', build_func=build_converter)
 ```{note}
 In this example, we demonstrate how to use the `build_func` argument to customize the way to build a class instance.
The functionality is similar to the default `build_from_cfg`. In most cases, default one would be sufficient. -`build_model_from_cfg` is also implemented to build PyTorch module in `nn.Sequentail`, you may directly use them instead of implementing by yourself. +`build_model_from_cfg` is also implemented to build PyTorch module in `nn.Sequential`, you may directly use them instead of implementing by yourself. ``` ### Hierarchy Registry diff --git a/docs/understand_mmcv/runner.md b/docs/en/understand_mmcv/runner.md similarity index 88% rename from docs/understand_mmcv/runner.md rename to docs/en/understand_mmcv/runner.md index 2e6e3868335d92f94e98441a5c7ec6d0b92a960b..eeeb859ee82534632365c98b2e6e4370da2b955b 100644 --- a/docs/understand_mmcv/runner.md +++ b/docs/en/understand_mmcv/runner.md @@ -8,7 +8,7 @@ The runner class is designed to manage the training. It eases the training proce ### EpochBasedRunner -As its name indicates, workflow in `EpochBasedRunner` should be set based on epochs. For example, [('train', 2), ('val', 1)] means running 2 epochs for training and 1 epoch for validation, iteratively. And each epoch may contain multiple iterations. Currently, MMDetection uses `EpochBasedRunner` by default. +As its name indicates, workflow in `EpochBasedRunner` should be set based on epochs. For example, \[('train', 2), ('val', 1)\] means running 2 epochs for training and 1 epoch for validation, iteratively. And each epoch may contain multiple iterations. Currently, MMDetection uses `EpochBasedRunner` by default. Let's take a look at its core logic: @@ -44,7 +44,7 @@ def train(self, data_loader, **kwargs): ### IterBasedRunner -Different from `EpochBasedRunner`, workflow in `IterBasedRunner` should be set based on iterations. For example, [('train', 2), ('val', 1)] means running 2 iters for training and 1 iter for validation, iteratively. Currently, MMSegmentation uses `IterBasedRunner` by default. +Different from `EpochBasedRunner`, workflow in `IterBasedRunner` should be set based on iterations. For example, \[('train', 2), ('val', 1)\] means running 2 iters for training and 1 iter for validation, iteratively. Currently, MMSegmentation uses `IterBasedRunner` by default. Let's take a look at its core logic: @@ -156,8 +156,8 @@ runner.run(data_loaders, cfg.workflow) Let's take `EpochBasedRunner` for example and go a little bit into details about setting workflow: -- Say we only want to put train in the workflow, then we can set: workflow = [('train', 1)]. The runner will only execute train iteratively in this case. -- Say we want to put both train and val in the workflow, then we can set: workflow = [('train', 3), ('val',1)]. The runner will first execute train for 3 epochs and then switch to val mode and execute val for 1 epoch. The workflow will be repeated until the current epoch hit the max_epochs. -- Workflow is highly flexible. Therefore, you can set workflow = [('val', 1), ('train',1)] if you would like the runner to validate first and train after. +- Say we only want to put train in the workflow, then we can set: workflow = \[('train', 1)\]. The runner will only execute train iteratively in this case. +- Say we want to put both train and val in the workflow, then we can set: workflow = \[('train', 3), ('val',1)\]. The runner will first execute train for 3 epochs and then switch to val mode and execute val for 1 epoch. The workflow will be repeated until the current epoch hit the max_epochs. +- Workflow is highly flexible. 
Therefore, you can set workflow = \[('val', 1), ('train',1)\] if you would like the runner to validate first and train after. The code we demonstrated above is already in `train.py` in MM repositories. Simply modify the corresponding keys in the configuration files and the script will execute the expected workflow automatically. diff --git a/docs/understand_mmcv/utils.md b/docs/en/understand_mmcv/utils.md similarity index 100% rename from docs/understand_mmcv/utils.md rename to docs/en/understand_mmcv/utils.md diff --git a/docs/understand_mmcv/visualization.md b/docs/en/understand_mmcv/visualization.md similarity index 100% rename from docs/understand_mmcv/visualization.md rename to docs/en/understand_mmcv/visualization.md diff --git a/docs/faq.md b/docs/faq.md deleted file mode 100644 index ab0dd135f946c63f6dc3d08e2b6ca2f6837c7437..0000000000000000000000000000000000000000 --- a/docs/faq.md +++ /dev/null @@ -1,42 +0,0 @@ -## Frequently Asked Questions - -We list some common troubles faced by many users and their corresponding solutions here. -Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. - -- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer" - - Please install the correct version of MMCV for the version of your MMDetection following the instruction above. - -- "No module named 'mmcv.ops'"; "No module named 'mmcv._ext'". - - 1. Uninstall existing mmcv in the environment using `pip uninstall mmcv`. - 2. Install mmcv-full following the instruction above. - -- "invalid device function" or "no kernel image is available for execution". - - 1. Check the CUDA compute capability of you GPU. - 2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, - and MMCV are built for the correct GPU architecture. - You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. - The compatibility issue could happen when using old GPUS, e.g., Tesla K80 (3.7) on colab. - 3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. - For example, you may compile mmcv using CUDA 10.0 bug run it on CUDA9.0 environments. - -- "undefined symbol" or "cannot open xxx.so". - - 1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check - whether the CUDA/GCC runtimes are the same as those used for compiling mmcv. - 2. If those symbols are Pytorch symbols (e.g., symbols containing caffe, aten, and TH), check whether - the Pytorch version is the same as that used for compiling mmcv. - 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, - and MMCV are built by and running on the same environment. - -- "RuntimeError: CUDA error: invalid configuration argument". - - This error may be due to your poor GPU. Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) - and recompile mmcv. - -- "RuntimeError: nms is not compiled with GPU support". - - This error is because your CUDA environment is not installed correctly. - You may try to re-install your CUDA environment and then delete the build/ folder before re-compile mmcv. 
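
To make the workflow semantics described in the runner documentation above concrete, here is a small self-contained sketch. It is an illustration only, not the actual `EpochBasedRunner` implementation; the real runner dispatches each phase to its `train()`/`val()` methods with the corresponding data loader.

```python
# Illustration only: a simplified model of how an epoch-based workflow is walked.
workflow = [('train', 3), ('val', 1)]  # 3 training epochs, then 1 validation epoch, repeated
max_epochs = 12

epoch = 0
while epoch < max_epochs:
    for mode, n_epochs in workflow:
        for _ in range(n_epochs):
            if mode == 'train' and epoch >= max_epochs:
                break
            print(f'{mode} phase (train epochs completed: {epoch})')
            if mode == 'train':
                epoch += 1
```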
diff --git a/docs/understand_mmcv/cnn.md b/docs/understand_mmcv/cnn.md deleted file mode 100644 index 749cb951131efe5c9ec4c59ef05b90243913df68..0000000000000000000000000000000000000000 --- a/docs/understand_mmcv/cnn.md +++ /dev/null @@ -1,538 +0,0 @@ -## CNN - -We provide some building bricks for CNNs, including layer building, module bundles and weight initialization. - -### Layer building - -We may need to try different layers of the same type when running experiments, -but do not want to modify the code from time to time. -Here we provide some layer building methods to construct layers from a dict, -which can be written in configs or specified via command line arguments. - -#### Usage - -A simplest example is - -```python -cfg = dict(type='Conv3d') -layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) -``` - -- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d). -- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d). -- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU. -- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle. -- `build_padding_layer`: Supported types are zero, reflect, replicate. - -#### Extension - -We also allow extending the building methods with custom layers and operators. - -1. Write and register your own module. - - ```python - from mmcv.cnn import UPSAMPLE_LAYERS - - @UPSAMPLE_LAYERS.register_module() - class MyUpsample: - - def __init__(self, scale_factor): - pass - - def forward(self, x): - pass - ``` - -2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it. - - ```python - cfg = dict(type='MyUpsample', scale_factor=2) - layer = build_upsample_layer(cfg) - ``` - -### Module bundles - -We also provide common module bundles to facilitate the network construction. -`ConvModule` is a bundle of convolution, normalization and activation layers, -please refer to the [api](api.html#mmcv.cnn.ConvModule) for details. - -```python -# conv + bn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) -# conv + gn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) -# conv + relu -conv = ConvModule(3, 8, 2) -# conv -conv = ConvModule(3, 8, 2, act_cfg=None) -# conv + leaky relu -conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) -# bn + conv + relu -conv = ConvModule( - 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) -``` - -### Weight initialization - -> Implementation details are available at [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py) - -During training, a proper initialization strategy is beneficial to speed up the -training or obtain a higher performance. In MMCV, we provide some commonly used -methods for initializing modules like `nn.Conv2d`. Of course, we also provide -high-level APIs for initializing models containing one or more -modules. - -#### Initialization functions - -Initialize a `nn.Module` such as `nn.Conv2d`, `nn.Linear` in a functional way. - -We provide the following initialization methods. - -- constant_init - - Initialize module parameters with constant values. 
- - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import constant_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # constant_init(module, val, bias=0) - >>> constant_init(conv1, 1, 0) - >>> conv1.weight - ``` - -- xavier_init - - Initialize module parameters with values according to the method - described in [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # xavier_init(module, gain=1, bias=0, distribution='normal') - >>> xavier_init(conv1, distribution='normal') - ``` - -- normal_init - - Initialize module parameters with the values drawn from a normal distribution. - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import normal_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # normal_init(module, mean=0, std=1, bias=0) - >>> normal_init(conv1, std=0.01, bias=0) - ``` - -- uniform_init - - Initialize module parameters with values drawn from a uniform distribution. - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import uniform_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # uniform_init(module, a=0, b=1, bias=0) - >>> uniform_init(conv1, a=0, b=1) - ``` - -- kaiming_init - - Initialize module parameters with the values according to the method - described in [Delving deep into rectifiers: Surpassing human-level - performance on ImageNet classification - He, K. et al. (2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import kaiming_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') - >>> kaiming_init(conv1) - ``` - -- caffe2_xavier_init - - The xavier initialization is implemented in caffe2, which corresponds to `kaiming_uniform_` in PyTorch. - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import caffe2_xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # caffe2_xavier_init(module, bias=0) - >>> caffe2_xavier_init(conv1) - ``` - -- bias_init_with_prob - - Initialize conv/fc bias value according to a given probability, as proposed in [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf). - - ```python - >>> from mmcv.cnn import bias_init_with_prob - >>> # bias_init_with_prob is proposed in Focal Loss - >>> bias = bias_init_with_prob(0.01) - >>> bias - -4.59511985013459 - ``` - -#### Initializers and configs - -On the basis of the initialization methods, we define the corresponding initialization classes and register them to `INITIALIZERS`, so we can -use the configuration to initialize the model. - -We provide the following initialization classes. - -- ConstantInit -- XavierInit -- NormalInit -- UniformInit -- KaimingInit -- Caffe2XavierInit -- PretrainedInit - -Let us introduce the usage of `initialize` in detail. - -1. Initialize model by `layer` key - - If we only define `layer`, it just initialize the layer in `layer` key. - - NOTE: Value of `layer` key is the class name with attributes weights and bias of Pytorch, so `MultiheadAttention layer` is not supported. - -- Define `layer` key for initializing module with same configuration. 
- - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1, 2) - - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) - # initialize whole module with same configuration - initialize(model, init_cfg) - # model.feat.weight - # Parameter containing: - # tensor([[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]], requires_grad=True) - ``` - -- Define `layer` key for initializing layer with different configurations. - - ```python - import torch.nn as nn - from mmcv.cnn.utils import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1,2) - - model = FooNet() - init_cfg = [dict(type='Constant', layer='Conv1d', val=1), - dict(type='Constant', layer='Conv2d', val=2), - dict(type='Constant', layer='Linear', val=3)] - # nn.Conv1d will be initialized with dict(type='Constant', val=1) - # nn.Conv2d will be initialized with dict(type='Constant', val=2) - # nn.Linear will be initialized with dict(type='Constant', val=3) - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ``` - -2. Initialize model by `override` key - -- When initializing some specific part with its attribute name, we can use `override` key, and the value in `override` will ignore the value in init_cfg. - - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) - - # if we would like to initialize model's weights as 1 and bias as 2 - # but weight in `cls` as 3 and bias 4, we can use override key - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, - override=dict(type='Constant', name='reg', val=3, bias=4)) - # self.feat and self.cls will be initialized with dict(type='Constant', val=1, bias=2) - # The module called 'reg' will be initialized with dict(type='Constant', val=3, bias=4) - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]], - # ..., - # [[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]]]], requires_grad=True) - ``` - -- If `layer` is None in init_cfg, only sub-module with the name in override will be initialized, and type and other args in override can be omitted. - - ```python - model = FooNet() - init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) - # self.feat and self.cls will be initialized by Pytorch - # The module called 'reg' will be initialized with dict(type='Constant', val=1, bias=2) - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]], - # ..., - # [[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]]], requires_grad=True) - ``` - -- If we don't define `layer` key or `override` key, it will not initialize anything. 
- -- Invalid usage - - ```python - # It is invalid that override don't have name key - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(type='Constant', val=3, bias=4)) - - # It is also invalid that override has name and other args except type - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(name='reg', val=3, bias=4)) - ``` - -3. Initialize model with the pretrained model - - ```python - import torch.nn as nn - import torchvision.models as models - from mmcv.cnn import initialize - - # initialize model with pretrained model - model = models.resnet50() - # model.conv1.weight - # Parameter containing: - # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, - # -1.8077e-03, 3.0338e-03], - # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, - # 1.1655e-02, 4.5889e-03], - # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, - # 1.7314e-02, -9.9998e-03], - # ..., - - init_cfg = dict(type='Pretrained', - checkpoint='torchvision://resnet50') - initialize(model, init_cfg) - # model.conv1.weight - # Parameter containing: - # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, - # -4.3034e-02, -7.0755e-02], - # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, - # -2.0912e-02, -3.8517e-02], - # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, - # 6.2641e-02, 5.1977e-02], - # ..., - - # initialize weights of a sub-module with the specific part of a pretrained model by using 'prefix' - model = models.resnet50() - url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ - 'retinanet_r50_fpn_1x_coco/'\ - 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' - init_cfg = dict(type='Pretrained', - checkpoint=url, prefix='backbone.') - initialize(model, init_cfg) - ``` - -4. Initialize model inherited from BaseModule, Sequential, ModuleList - - `BaseModule` is inherited from `torch.nn.Module`, and the only different between them is that `BaseModule` implements `init_weight`. - - `Sequential` is inherited from `BaseModule` and `torch.nn.Sequential`. - - `ModuleList` is inherited from `BaseModule` and `torch.nn.ModuleList`. - - `````python - import torch.nn as nn - from mmcv.runner import BaseModule, Sequential, ModuleList - - class FooConv1d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv1d = nn.Conv1d(4, 1, 4) - - def forward(self, x): - return self.conv1d(x) - - class FooConv2d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv2d = nn.Conv2d(3, 1, 3) - - def forward(self, x): - return self.conv2d(x) - - # BaseModule - init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - model = FooConv1d(init_cfg) - model.init_weights() - # model.conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - - # Sequential - init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) 
- model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - seq_model = Sequential(model1, model2) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) - seq_model = Sequential(model1, model2, init_cfg=init_cfg) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # ModuleList - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - modellist = ModuleList([model1, model2]) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) - modellist = ModuleList([model1, model2], init_cfg=init_cfg) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ````` - -### Model Zoo - -Besides torchvision pre-trained models, we also provide pre-trained models of following CNN: - -- VGG Caffe -- ResNet Caffe -- ResNeXt -- ResNet with Group Normalization -- ResNet with Group Normalization and Weight Standardization -- HRNetV2 -- Res2Net -- RegNet - -#### Model URLs in JSON - -The model zoo links in MMCV are managed by JSON files. -The json file consists of key-value pair of model name and its url or path. -An example json file could be like: - -```json -{ - "model_a": "https://example.com/models/model_a_9e5bac.pth", - "model_b": "pretrain/model_b_ab3ef2c.pth" -} -``` - -The default links of the pre-trained models hosted on OpenMMLab AWS could be found [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json). - -You may override default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not find in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path. - -The external json files will be merged into default one. If the same key presents in both external json and default json, the external one will be used. 
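For example, a rough sketch of overriding a default link locally could look like the following (the `MMCV_HOME` path and the model key/checkpoint path are made-up placeholders; the merge-and-override behaviour is the one described above).

```python
import json
import os

# Use a custom MMCV_HOME (placeholder path) before mmcv resolves model links
os.environ['MMCV_HOME'] = os.path.expanduser('~/custom_mmcv_home')
os.makedirs(os.environ['MMCV_HOME'], exist_ok=True)

# Keys in this external file are merged into the default json;
# on a key collision, the value written here wins.
custom_links = {'model_a': '/data/checkpoints/model_a_local.pth'}
with open(os.path.join(os.environ['MMCV_HOME'], 'open-mmlab.json'), 'w') as f:
    json.dump(custom_links, f)
```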
- -#### Load Checkpoint - -The following types are supported for `filename` argument of `mmcv.load_checkpoint()`. - -- filepath: The filepath of the checkpoint. -- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename. -- `torchvision://xxx`: The model links in `torchvision.models`.Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details. -- `open-mmlab://xxx`: The model links or filepath provided in default and additional json files. diff --git a/docs/understand_mmcv/ops.md b/docs/understand_mmcv/ops.md deleted file mode 100644 index 2729e441c1318ca2850c21bf72df428910657f31..0000000000000000000000000000000000000000 --- a/docs/understand_mmcv/ops.md +++ /dev/null @@ -1,37 +0,0 @@ -## CUDA ops - -We implement common CUDA ops used in detection, segmentation, etc. - -- AssignScoreWithK -- BallQuery -- BBoxOverlaps -- CARAFE -- CrissCrossAttention -- ContextBlock -- CornerPool -- Deformable Convolution v1/v2 -- Deformable RoIPool -- DynamicScatter -- GatherPoints -- FurthestPointSample -- FurthestPointSampleWithDist -- GeneralizedAttention -- GroupPoints -- KNN -- MaskedConv -- NMS -- PSAMask -- RoIPointPool3d -- RoIPool -- RoIAlign -- RoIAwarePool3d -- SimpleRoIAlign -- SigmoidFocalLoss -- SoftmaxFocalLoss -- SoftNMS -- Synchronized BatchNorm -- Voxelization -- ThreeInterpolate -- ThreeNN -- Weight standardization -- Correlation diff --git a/docs_zh_CN/Makefile b/docs/zh_cn/Makefile similarity index 100% rename from docs_zh_CN/Makefile rename to docs/zh_cn/Makefile diff --git a/docs_zh_CN/_static/css/readthedocs.css b/docs/zh_cn/_static/css/readthedocs.css similarity index 100% rename from docs_zh_CN/_static/css/readthedocs.css rename to docs/zh_cn/_static/css/readthedocs.css diff --git a/docs_zh_CN/_static/image/mmcv-logo.png b/docs/zh_cn/_static/image/mmcv-logo.png similarity index 100% rename from docs_zh_CN/_static/image/mmcv-logo.png rename to docs/zh_cn/_static/image/mmcv-logo.png diff --git a/docs_zh_CN/api.rst b/docs/zh_cn/api.rst similarity index 90% rename from docs_zh_CN/api.rst rename to docs/zh_cn/api.rst index 8ca9118c3b033f1b7311ec3c1533ce9c93fa1aa2..5d3e623037e3fb102f8c927ff5909d478a46cab9 100644 --- a/docs_zh_CN/api.rst +++ b/docs/zh_cn/api.rst @@ -38,6 +38,11 @@ runner .. automodule:: mmcv.runner :members: +engine +------ +.. automodule:: mmcv.engine + :members: + ops ------ .. automodule:: mmcv.ops diff --git a/docs_zh_CN/community/contributing.md b/docs/zh_cn/community/contributing.md similarity index 68% rename from docs_zh_CN/community/contributing.md rename to docs/zh_cn/community/contributing.md index 30bac8738bee8db306287c6b245b3115464e64da..b7bc1d22d9bb52875b37a15ea1bb3eea1e61c027 100644 --- a/docs_zh_CN/community/contributing.md +++ b/docs/zh_cn/community/contributing.md @@ -7,7 +7,9 @@ - 添加新功能和新组件 ### 工作流 + | 详细工作流见 [拉取请求](pr.md) + 1. 复刻并拉取最新的 OpenMMLab 算法库 2. 创建新的分支(不建议使用主分支提拉取请求) 3. 
提交你的修改 @@ -16,16 +18,18 @@ ```{note} 如果你计划添加新功能并且该功能包含比较大的改动,建议先开 issue 讨论 ``` + ### 代码风格 #### Python [PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范,我们使用以下工具检查和格式化代码 -- [flake8](http://flake8.pycqa.org/en/latest/): Python 官方发布的代码规范检查工具,是多个检查工具的封装 -- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具 +- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具,是多个检查工具的封装 - [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具 -- [markdownlint](https://github.com/markdownlint/markdownlint): 检查 markdown 文件的工具 +- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具 +- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误 +- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具 - [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具 yapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到 @@ -46,23 +50,7 @@ pip install -U pre-commit pre-commit install ``` -如果安装 markdownlint 遇到了问题,可以尝试使用以下的步骤安装 ruby - -```shell -# install rvm -curl -L https://get.rvm.io | bash -s -- --autolibs=read-fail -[[ -s "$HOME/.rvm/scripts/rvm" ]] && source "$HOME/.rvm/scripts/rvm" -rvm autolibs disable - -# install ruby -rvm install 2.7.1 -``` - -或者参考 [这个代码库](https://github.com/innerlee/setup) 和 [`zzruby.sh`](https://github.com/innerlee/setup/blob/master/zzruby.sh)。 - -至此,每一次 commit 修改都会触发 pre-commit 检查代码格式。 - ->提交拉取请求前,请确保你的代码符合 yapf 的格式 +> 提交拉取请求前,请确保你的代码符合 yapf 的格式 #### C++ and CUDA diff --git a/docs/zh_cn/community/pr.md b/docs/zh_cn/community/pr.md new file mode 100644 index 0000000000000000000000000000000000000000..720f38986320bb94be67165ddb2dea2f04f659c9 --- /dev/null +++ b/docs/zh_cn/community/pr.md @@ -0,0 +1,114 @@ +## 拉取请求 + +### 什么是拉取请求? + +`拉取请求` (Pull Request), [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests)定义如下。 + +``` +拉取请求是一种通知机制。你修改了他人的代码,将你的修改通知原来作者,希望他合并你的修改。 +``` + +### 基本的工作流: + +1. 获取最新的代码库 +2. 从主分支创建最新的分支进行开发 +3. 提交修改 +4. 推送你的修改并创建一个 `拉取请求` +5. 讨论、审核代码 +6. 将开发分支合并到主分支 + +### 具体步骤 + +#### 1. 获取最新的代码库 + +- 当你第一次提 PR 时 + + 复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮即可 + ![avatar](../../en/_static/community/1.png) + + 克隆复刻的代码库到本地 + + ```bash + git clone git@github.com:XXX/mmcv.git + ``` + + 添加原代码库为上游代码库 + + ```bash + git remote add upstream git@github.com:open-mmlab/mmcv + ``` + +- 从第二个 PR 起 + + 检出本地代码库的主分支,然后从最新的原代码库的主分支拉取更新 + + ```bash + git checkout master + git pull upstream master + ``` + +#### 2. 从主分支创建一个新的开发分支 + +```bash +git checkout -b branchname +``` + +```{tip} +为了保证提交历史清晰可读,我们强烈推荐您先检出主分支 (master),再创建新的分支。 +``` + +#### 3. 提交你的修改 + +```bash +# coding +git add [files] +git commit -m 'messages' +``` + +#### 4. 推送你的修改到复刻的代码库,并创建一个`拉取请求` + +- 推送当前分支到远端复刻的代码库 + + ```bash + git push origin branchname + ``` + +- 创建一个`拉取请求` + ![avatar](../../en/_static/community/2.png) + +- 修改`拉取请求`信息模板,描述修改原因和修改内容。还可以在 PR 描述中,手动关联到相关的`议题` (issue),(更多细节,请参考[官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。 + +#### 5. 讨论并评审你的代码 + +- 创建`拉取请求`时,可以关联给相关人员进行评审 + ![avatar](../../en/_static/community/3.png) + +- 根据评审人员的意见修改代码,并推送修改 + +#### 6. `拉取请求`合并之后删除该分支 + +```bash +git branch -d branchname # delete local branch +git push origin --delete branchname # delete remote branch +``` + +### PR 规范 + +1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 + +2. 一个 PR 对应一个短期分支 + +3. 
粒度要细,一个PR只做一件事情,避免超大的PR + + - Bad:实现 Faster R-CNN + - Acceptable:给 Faster R-CNN 添加一个 box head + - Good:给 box head 增加一个参数来支持自定义的 conv 层数 + +4. 每次 Commit 时需要提供清晰且有意义 commit 信息 + +5. 提供清晰且有意义的`拉取请求`描述 + + - 标题写明白任务名称,一般格式:\[Prefix\] Short description of the pull request (Suffix) + - prefix: 新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被review) + - 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 + - 关联相关的`议题` (issue) 和其他`拉取请求` diff --git a/docs_zh_CN/compatibility.md b/docs/zh_cn/compatibility.md similarity index 100% rename from docs_zh_CN/compatibility.md rename to docs/zh_cn/compatibility.md diff --git a/docs/conf.py b/docs/zh_cn/conf.py similarity index 62% rename from docs/conf.py rename to docs/zh_cn/conf.py index bea4706cf0430220087b77847f5a07cd24c9b31f..2c144917848c787ea1db602e482c09bcf8fae6af 100644 --- a/docs/conf.py +++ b/docs/zh_cn/conf.py @@ -15,21 +15,19 @@ import os import sys import pytorch_sphinx_theme -from m2r import MdInclude -from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) -version_file = '../mmcv/version.py' -with open(version_file, 'r') as f: +version_file = '../../mmcv/version.py' +with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' -copyright = '2018-2021, OpenMMLab' +copyright = '2018-2022, OpenMMLab' author = 'MMCV Authors' # The short X.Y version @@ -57,6 +55,8 @@ extensions = [ 'sphinx_copybutton', ] # yapf: disable +myst_heading_anchors = 4 + autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] autosectionlabel_prefix_document = True @@ -79,7 +79,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = 'zh_CN' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
@@ -108,92 +108,9 @@ html_theme_options = { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, - { - 'name': - 'Docs', - 'children': [ - { - 'name': 'MMCV', - 'url': 'https://mmcv.readthedocs.io/en/latest/', - }, - { - 'name': 'MIM', - 'url': 'https://openmim.readthedocs.io/en/latest/' - }, - { - 'name': 'MMAction2', - 'url': 'https://mmaction2.readthedocs.io/en/latest/', - }, - { - 'name': 'MMClassification', - 'url': - 'https://mmclassification.readthedocs.io/en/latest/', - }, - { - 'name': 'MMDetection', - 'url': 'https://mmdetection.readthedocs.io/en/latest/', - }, - { - 'name': 'MMDetection3D', - 'url': 'https://mmdetection3d.readthedocs.io/en/latest/', - }, - { - 'name': 'MMEditing', - 'url': 'https://mmediting.readthedocs.io/en/latest/', - }, - { - 'name': 'MMGeneration', - 'url': 'https://mmgeneration.readthedocs.io/en/latest/', - }, - { - 'name': 'MMOCR', - 'url': 'https://mmocr.readthedocs.io/en/latest/', - }, - { - 'name': 'MMPose', - 'url': 'https://mmpose.readthedocs.io/en/latest/', - }, - { - 'name': 'MMSegmentation', - 'url': 'https://mmsegmentation.readthedocs.io/en/latest/', - }, - { - 'name': 'MMTracking', - 'url': 'https://mmtracking.readthedocs.io/en/latest/', - }, - { - 'name': 'MMFlow', - 'url': 'https://mmflow.readthedocs.io/en/latest/', - }, - { - 'name': 'MMFewShot', - 'url': 'https://mmfewshot.readthedocs.io/en/latest/', - }, - ] - }, - { - 'name': - 'OpenMMLab', - 'children': [ - { - 'name': 'Homepage', - 'url': 'https://openmmlab.com/' - }, - { - 'name': 'GitHub', - 'url': 'https://github.com/open-mmlab/' - }, - { - 'name': 'Twitter', - 'url': 'https://twitter.com/OpenMMLab' - }, - { - 'name': 'Zhihu', - 'url': 'https://zhihu.com/people/openmmlab' - }, - ] - }, - ] + ], + # Specify the language of shared menu + 'menu_lang': 'cn', } # Add any paths that contain custom static files (such as style sheets) here, @@ -286,16 +203,3 @@ StandaloneHTMLBuilder.supported_image_types = [ # Ignore >>> when copying code copybutton_prompt_text = r'>>> |\.\.\. 
' copybutton_prompt_is_regexp = True - - -def setup(app): - app.add_config_value('no_underscore_emphasis', False, 'env') - app.add_config_value('m2r_parse_relative_links', False, 'env') - app.add_config_value('m2r_anonymous_references', False, 'env') - app.add_config_value('m2r_disable_inline_math', False, 'env') - app.add_directive('mdinclude', MdInclude) - app.add_config_value('recommonmark_config', { - 'auto_toc_tree_section': 'Contents', - 'enable_eval_rst': True, - }, True) - app.add_transform(AutoStructify) diff --git a/docs_zh_CN/deployment/onnx.md b/docs/zh_cn/deployment/onnx.md similarity index 100% rename from docs_zh_CN/deployment/onnx.md rename to docs/zh_cn/deployment/onnx.md diff --git a/docs_zh_CN/deployment/onnxruntime_custom_ops.md b/docs/zh_cn/deployment/onnxruntime_custom_ops.md similarity index 98% rename from docs_zh_CN/deployment/onnxruntime_custom_ops.md rename to docs/zh_cn/deployment/onnxruntime_custom_ops.md index 594aefb4ba4566aeda990ee5f42512f5e2be1917..1150f919efb1df20e2d99d02747fe2c331554010 100644 --- a/docs_zh_CN/deployment/onnxruntime_custom_ops.md +++ b/docs/zh_cn/deployment/onnxruntime_custom_ops.md @@ -64,7 +64,7 @@ | 类型 | 参数名 | 描述 | | ------- | --------------- | ------------------------------------------------------- | -| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围[0, 1]。默认值为0 | +| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围\[0, 1\]。默认值为0 | | `float` | `sigma` | 高斯方法的超参数 | | `float` | `min_score` | NMS的score阈值 | | `int` | `method` | NMS的计算方式, (0: `naive`, 1: `linear`, 2: `gaussian`) | @@ -137,10 +137,10 @@ #### 模型参数 -| 类型 | 参数名 | 描述 | -| ------- | --------------- | ----------------------------------------------------- | -| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围[0, 1]。默认值为0 | -| `int` | `offset` | 用来计算候选框的宽高(x2 - x1 + offset)。可选值0或1 | +| 类型 | 参数名 | 描述 | +| ------- | --------------- | ------------------------------------------------------- | +| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围\[0, 1\]。默认值为0 | +| `int` | `offset` | 用来计算候选框的宽高(x2 - x1 + offset)。可选值0或1 | #### 输入 diff --git a/docs_zh_CN/deployment/onnxruntime_op.md b/docs/zh_cn/deployment/onnxruntime_op.md similarity index 78% rename from docs_zh_CN/deployment/onnxruntime_op.md rename to docs/zh_cn/deployment/onnxruntime_op.md index 3898aa164fd019b635890243d03de316d2f36127..e5599307294a87093110bdd5fa33966f275572cd 100644 --- a/docs_zh_CN/deployment/onnxruntime_op.md +++ b/docs/zh_cn/deployment/onnxruntime_op.md @@ -15,16 +15,16 @@ ### MMCV已支持的算子 -| 算子 | CPU | GPU | MMCV版本 | -| :------------------------------------------------------------------------------: | :---: | :---: | :------: | -| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | -| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | -| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | -| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | -| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | -| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | 1.3.4 | -| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | 1.3.4 | -| [MMCVModulatedDeformConv2d](onnxruntime_custom_ops.md#mmcvmodulateddeformconv2d) | Y | N | 1.3.12 | +| 算子 | CPU | GPU | MMCV版本 | +| :------------------------------------------------------------------------------: | :-: | :-: | :------: | +| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | +| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | +| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | +| 
[grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | +| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | +| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | 1.3.4 | +| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | 1.3.4 | +| [MMCVModulatedDeformConv2d](onnxruntime_custom_ops.md#mmcvmodulateddeformconv2d) | Y | N | 1.3.12 | ### 如何编译ONNX Runtime自定义算子? @@ -97,18 +97,20 @@ onnx_results = sess.run(None, {'input' : input_data}) 以`soft_nms`为例: 1. 在ONNX Runtime头文件目录`mmcv/ops/csrc/onnxruntime/`下添加头文件`soft_nms.h` + 2. 在ONNX Runtime源码目录`mmcv/ops/csrc/onnxruntime/cpu/`下添加算子实现`soft_nms.cpp` -3. 在[onnxruntime_register.cpp](../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp)中注册实现的算子`soft_nms` - ```c++ - #include "soft_nms.h" +3. 在[onnxruntime_register.cpp](../../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp)中注册实现的算子`soft_nms` + + ```c++ + #include "soft_nms.h" - SoftNmsOp c_SoftNmsOp; + SoftNmsOp c_SoftNmsOp; - if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { - return status; - } - ``` + if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { + return status; + } + ``` 4. 在`tests/test_ops/test_onnx.py`添加单元测试, 可以参考[here](../../tests/test_ops/test_onnx.py)。 @@ -118,10 +120,10 @@ onnx_results = sess.run(None, {'input' : input_data}) ### 已知问题 - "RuntimeError: tuple appears in op that does not forward tuples, unsupported kind: `prim::PythonOp`." - 1. 请注意`cummax`和`cummin`算子是在torch >= 1.5.0被添加的。但他们需要在torch version >= 1.7.0才能正确导出。否则会在导出时发生上面的错误。 - 2. 解决方法:升级PyTorch到1.7.0以上版本 + 1. 请注意`cummax`和`cummin`算子是在torch >= 1.5.0被添加的。但他们需要在torch version >= 1.7.0才能正确导出。否则会在导出时发生上面的错误。 + 2. 解决方法:升级PyTorch到1.7.0以上版本 ### 引用 - [How to export Pytorch model with custom op to ONNX and run it in ONNX Runtime](https://github.com/onnx/tutorials/blob/master/PyTorchCustomOperator/README.md) -- [How to add a custom operator/kernel in ONNX Runtime](https://github.com/microsoft/onnxruntime/blob/master/docs/AddingCustomOp.md) +- [How to add a custom operator/kernel in ONNX Runtime](https://onnxruntime.ai/docs/reference/operators/add-custom-op.html) diff --git a/docs_zh_CN/deployment/tensorrt_custom_ops.md b/docs/zh_cn/deployment/tensorrt_custom_ops.md similarity index 95% rename from docs_zh_CN/deployment/tensorrt_custom_ops.md rename to docs/zh_cn/deployment/tensorrt_custom_ops.md index 123f2889bf18aa549c327ea70f3ba974b45e48f5..d7731548303a03bd089950d5a2c87bed1c8e2fd7 100644 --- a/docs_zh_CN/deployment/tensorrt_custom_ops.md +++ b/docs/zh_cn/deployment/tensorrt_custom_ops.md @@ -100,7 +100,7 @@ #### 描述 -ScatterND接收三个输入,分别为秩为r >= 1的`data`,秩为q >= 1的`indices`以及秩为 q + r - indices.shape[-1] -1 的`update`。输出的计算方式为:首先创建一个`data`的拷贝,然后根据`indces`的值使用`update`对拷贝的`data`进行更新。注意`indices`中不应该存在相同的条目,也就是说对同一个位置进行一次以上的更新是不允许的。 +ScatterND接收三个输入,分别为秩为r >= 1的`data`,秩为q >= 1的`indices`以及秩为 q + r - indices.shape\[-1\] -1 的`update`。输出的计算方式为:首先创建一个`data`的拷贝,然后根据`indces`的值使用`update`对拷贝的`data`进行更新。注意`indices`中不应该存在相同的条目,也就是说对同一个位置进行一次以上的更新是不允许的。 输出的计算方式可以参考如下代码: @@ -147,13 +147,13 @@ ScatterND接收三个输入,分别为秩为r >= 1的`data`,秩为q >= 1的`i #### 模型参数 -| 类型 | 参数名 | 描述 | -| ------- | ---------------------------- | ---------------------------------------------------------------------------------------- | -| `int` | `center_point_box` | 0 - 候选框的格式为[y1, x1, y2, x2], 1-候选框的格式为[x_center, y_center, width, height] | -| `int` | `max_output_boxes_per_class` | 每一类最大的输出检测框个数。默认为0,输出检测框个数等于输入候选框数 | -| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围[0, 1]。默认值为0 | -| `float` | 
`score_threshold` | 用来判断候选框是否合法的阈值 | -| `int` | `offset` | 检测框长宽计算方式为(x2 - x1 + offset),可选值0或1 | +| 类型 | 参数名 | 描述 | +| ------- | ---------------------------- | -------------------------------------------------------------------------------------------- | +| `int` | `center_point_box` | 0 - 候选框的格式为\[y1, x1, y2, x2\], 1-候选框的格式为\[x_center, y_center, width, height\] | +| `int` | `max_output_boxes_per_class` | 每一类最大的输出检测框个数。默认为0,输出检测框个数等于输入候选框数 | +| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围\[0, 1\]。默认值为0 | +| `float` | `score_threshold` | 用来判断候选框是否合法的阈值 | +| `int` | `offset` | 检测框长宽计算方式为(x2 - x1 + offset),可选值0或1 | #### 输入 diff --git a/docs_zh_CN/deployment/tensorrt_plugin.md b/docs/zh_cn/deployment/tensorrt_plugin.md similarity index 79% rename from docs_zh_CN/deployment/tensorrt_plugin.md rename to docs/zh_cn/deployment/tensorrt_plugin.md index 0f385b8e032fac3267a838367b53d26880a693c9..0c29f14b1eb93450b606c41e831e9c6b511efe96 100644 --- a/docs_zh_CN/deployment/tensorrt_plugin.md +++ b/docs/zh_cn/deployment/tensorrt_plugin.md @@ -2,18 +2,18 @@ -- [MMCV中的TensorRT自定义算子 (实验性)](#mmcv中的tensorrt自定义算子-实验性) - - [介绍](#介绍) - - [MMCV中的TensorRT插件列表](#mmcv中的tensorrt插件列表) - - [如何编译MMCV中的TensorRT插件](#如何编译mmcv中的tensorrt插件) - - [准备](#准备) - - [在Linux上编译](#在linux上编译) - - [创建TensorRT推理引擎并在python下进行推理](#创建tensorrt推理引擎并在python下进行推理) - - [如何在MMCV中添加新的TensorRT自定义算子](#如何在mmcv中添加新的tensorrt自定义算子) - - [主要流程](#主要流程) - - [注意](#注意) - - [已知问题](#已知问题) - - [引用](#引用) +- [MMCV中的TensorRT自定义算子 (实验性)](#mmcv%E4%B8%AD%E7%9A%84tensorrt%E8%87%AA%E5%AE%9A%E4%B9%89%E7%AE%97%E5%AD%90-%E5%AE%9E%E9%AA%8C%E6%80%A7) + - [介绍](#%E4%BB%8B%E7%BB%8D) + - [MMCV中的TensorRT插件列表](#mmcv%E4%B8%AD%E7%9A%84tensorrt%E6%8F%92%E4%BB%B6%E5%88%97%E8%A1%A8) + - [如何编译MMCV中的TensorRT插件](#%E5%A6%82%E4%BD%95%E7%BC%96%E8%AF%91mmcv%E4%B8%AD%E7%9A%84tensorrt%E6%8F%92%E4%BB%B6) + - [准备](#%E5%87%86%E5%A4%87) + - [在Linux上编译](#%E5%9C%A8linux%E4%B8%8A%E7%BC%96%E8%AF%91) + - [创建TensorRT推理引擎并在python下进行推理](#%E5%88%9B%E5%BB%BAtensorrt%E6%8E%A8%E7%90%86%E5%BC%95%E6%93%8E%E5%B9%B6%E5%9C%A8python%E4%B8%8B%E8%BF%9B%E8%A1%8C%E6%8E%A8%E7%90%86) + - [如何在MMCV中添加新的TensorRT自定义算子](#%E5%A6%82%E4%BD%95%E5%9C%A8mmcv%E4%B8%AD%E6%B7%BB%E5%8A%A0%E6%96%B0%E7%9A%84tensorrt%E8%87%AA%E5%AE%9A%E4%B9%89%E7%AE%97%E5%AD%90) + - [主要流程](#%E4%B8%BB%E8%A6%81%E6%B5%81%E7%A8%8B) + - [注意](#%E6%B3%A8%E6%84%8F) + - [已知问题](#%E5%B7%B2%E7%9F%A5%E9%97%AE%E9%A2%98) + - [引用](#%E5%BC%95%E7%94%A8) @@ -75,6 +75,10 @@ pip install $TENSORRT_DIR/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl 想了解更多通过tar包安装TensorRT,请访问[Nvidia' website](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-721/install-guide/index.html#installing-tar). +- 安装 cuDNN + +参考[Nvidia' website](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar)安装 cuDNN 8。 + #### 在Linux上编译 ```bash @@ -142,21 +146,24 @@ with torch.no_grad(): **以RoIAlign算子插件`roi_align`举例。** 1. 在TensorRT包含目录`mmcv/ops/csrc/tensorrt/`中添加头文件`trt_roi_align.hpp` + 2. 在TensorRT源码目录`mmcv/ops/csrc/tensorrt/plugins/`中添加头文件`trt_roi_align.cpp` + 3. 在TensorRT源码目录`mmcv/ops/csrc/tensorrt/plugins/`中添加cuda kernel文件`trt_roi_align_kernel.cu` + 4. 
在[trt_plugin.cpp](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp)中注册`roi_align`插件 - ```c++ - #include "trt_plugin.hpp" + ```c++ + #include "trt_plugin.hpp" - #include "trt_roi_align.hpp" + #include "trt_roi_align.hpp" - REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); - extern "C" { - bool initLibMMCVInferPlugins() { return true; } - } // extern "C" - ``` + extern "C" { + bool initLibMMCVInferPlugins() { return true; } + } // extern "C" + ``` 5. 在`tests/test_ops/test_tensorrt.py`中添加单元测试 diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..6cfb100c631b101fa0cff0650105a3cc7d735e7b --- /dev/null +++ b/docs/zh_cn/faq.md @@ -0,0 +1,91 @@ +## 常见问题 + +在这里我们列出了用户经常遇到的问题以及对应的解决方法。如果您遇到了其他常见的问题,并且知道可以帮到大家的解决办法, +欢迎随时丰富这个列表。 + +### 安装问题 + +- KeyError: "xxx: 'yyy is not in the zzz registry'" + + 只有模块所在的文件被导入时,注册机制才会被触发,所以您需要在某处导入该文件,更多详情请查看 [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974)。 + +- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" + + 1. 使用 `pip uninstall mmcv` 卸载您环境中的 mmcv + 2. 参考 [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) 或者 [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) 安装 mmcv-full + +- "invalid device function" 或者 "no kernel image is available for execution" + + 1. 检查 GPU 的 CUDA 计算能力 + 2. 运行 `python mmdet/utils/collect_env.py` 来检查 PyTorch、torchvision 和 MMCV 是否是针对正确的 GPU 架构构建的,您可能需要去设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV。兼容性问题可能会出现在使用旧版的 GPUs,如:colab 上的 Tesla K80 (3.7) + 3. 检查运行环境是否和 mmcv/mmdet 编译时的环境相同。例如,您可能使用 CUDA 10.0 编译 mmcv,但在 CUDA 9.0 的环境中运行它 + +- "undefined symbol" 或者 "cannot open xxx.so" + + 1. 如果符号和 CUDA/C++ 相关(例如:libcudart.so 或者 GLIBCXX),请检查 CUDA/GCC 运行时的版本是否和编译 mmcv 的一致 + 2. 如果符号和 PyTorch 相关(例如:符号包含 caffe、aten 和 TH),请检查 PyTorch 运行时的版本是否和编译 mmcv 的一致 + 3. 运行 `python mmdet/utils/collect_env.py` 以检查 PyTorch、torchvision 和 MMCV 构建和运行的环境是否相同 + +- "RuntimeError: CUDA error: invalid configuration argument" + + 这个错误可能是由于您的 GPU 性能不佳造成的。尝试降低 [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) + 的值并重新编译 mmcv。 + +- "RuntimeError: nms is not compiled with GPU support" + + 这个错误是由于您的 CUDA 环境没有正确安装。 + 您可以尝试重新安装您的 CUDA 环境,然后删除 mmcv/build 文件夹并重新编译 mmcv。 + +- "Segmentation fault" + + 1. 检查 GCC 的版本,通常是因为 PyTorch 版本与 GCC 版本不匹配 (例如 GCC \< 4.9 ),我们推荐用户使用 GCC 5.4,我们也不推荐使用 GCC 5.5, 因为有反馈 GCC 5.5 会导致 "segmentation fault" 并且切换到 GCC 5.4 就可以解决问题 + 2. 检查是否正确安装 CUDA 版本的 PyTorc。输入以下命令并检查是否返回 True + ```shell + python -c 'import torch; print(torch.cuda.is_available())' + ``` + 3. 如果 `torch` 安装成功,那么检查 MMCV 是否安装成功。输入以下命令,如果没有报错说明 mmcv-full 安装成。 + ```shell + python -c 'import mmcv; import mmcv.ops' + ``` + 4. 如果 MMCV 与 PyTorch 都安装成功了,则可以使用 `ipdb` 设置断点或者使用 `print` 函数,分析是哪一部分的代码导致了 `segmentation fault` + +- "libtorch_cuda_cu.so: cannot open shared object file" + + `mmcv-full` 依赖 `libtorch_cuda_cu.so` 文件,但程序运行时没能找到该文件。我们可以检查该文件是否存在 `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` 也可以尝试重装 PyTorch。 + +- "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" 
+ + 如果您在 Windows 上编译 mmcv-full 并且 CUDA 的版本是 9.2,您很可能会遇到这个问题 `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`,您可以尝试使用低版本的 Microsoft Visual Studio,例如 vs2017。 + +- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" + + 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.5.0,您很可能会遇到这个问题 `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`。解决这个问题的方法是将 `torch/csrc/jit/api/module.h` 文件中所有 `static constexpr bool all_slots = false;` 替换为 `static bool all_slots = false;`。更多细节可以查看 [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394)。 + +- "error: a member with an in-class initializer must be const" + + 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.6.0,您很可能会遇到这个问题 `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. 解决这个问题的方法是将 `torch/include\torch/csrc/jit/api/module.h` 文件中的所有 `CONSTEXPR_EXCEPT_WIN_CUDA ` 替换为 `const`。更多细节可以查看 [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575)。 + +- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" + + 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.7.0,您很可能会遇到这个问题 `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. 解决这个问题的方法是修改 PyTorch 中的几个文件: + + - 删除 `torch/include\torch/csrc/jit/ir/ir.h` 文件中的 `static constexpr Symbol Kind = ::c10::prim::profile;` 和 `tatic constexpr Symbol Kind = ::c10::prim::profile_optional;` + - 将 `torch\include\pybind11\cast.h` 文件中的 `explicit operator type&() { return *(this->value); }` 替换为 `explicit operator type&() { return *((type*)this->value); }` + - 将 `torch/include\torch/csrc/jit/api/module.h` 文件中的 所有 `CONSTEXPR_EXCEPT_WIN_CUDA` 替换为 `const` + + 更多细节可以查看 [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956)。 + +- MMCV 和 MMDetection 的兼容性问题;"ConvWS is already registered in conv layer" + + 请参考 [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 为您的 MMDetection 版本安装正确版本的 MMCV。 + +### 使用问题 + +- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" + + 1. 这个错误是因为有些参数没有参与 loss 的计算,可能是代码中存在多个分支,导致有些分支没有参与 loss 的计算。更多细节见 [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582)。 + 2. 
你可以设置 DDP 中的 `find_unused_parameters` 为 `True`,或者手动查找哪些参数没有用到。 + +- "RuntimeError: Trying to backward through the graph a second time" + + 不能同时设置 `GradientCumulativeOptimizerHook` 和 `OptimizerHook`,这会导致 `loss.backward()` 被调用两次,于是程序抛出 `RuntimeError`。我们只需设置其中的一个。更多细节见 [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379)。 diff --git a/docs_zh_CN/get_started/build.md b/docs/zh_cn/get_started/build.md similarity index 50% rename from docs_zh_CN/get_started/build.md rename to docs/zh_cn/get_started/build.md index 77fb86e9cf5c805bdca5fdaff6f22768cbfe8d3e..ec6ebb887946f115a7a7ac06e43da6b261e36d28 100644 --- a/docs_zh_CN/get_started/build.md +++ b/docs/zh_cn/get_started/build.md @@ -9,6 +9,12 @@ git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` +建议安装 `ninja` 以加快编译速度 + +```bash +pip install -r requirements/optional.txt +``` + 你可以安装 lite 版本 ```bash @@ -36,6 +42,7 @@ CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' MMCV_WITH_OPS=1 pip install -e . ```{note} 如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`,例如在一个很小的容器环境或者没有图形用户界面的服务器中,你可以先安装 `opencv-python-headless`,这样在安装 mmcv 依赖的过程中会跳过 `opencv-python` ``` + ### 在 Windows 上编译 MMCV 在 Windows 上编译 MMCV 比 Linux 复杂,本节将一步步介绍如何在 Windows 上编译 MMCV。 @@ -63,32 +70,38 @@ CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' MMCV_WITH_OPS=1 pip install -e . 2. 创建一个新的 Conda 环境 - ```shell - conda create --name mmcv python=3.7 # 经测试,3.6, 3.7, 3.8 也能通过 - conda activate mmcv # 确保做任何操作前先激活环境 - ``` + ```shell + conda create --name mmcv python=3.7 # 经测试,3.6, 3.7, 3.8 也能通过 + conda activate mmcv # 确保做任何操作前先激活环境 + ``` 3. 安装 PyTorch 时,可以根据需要安装支持 CUDA 或不支持 CUDA 的版本 - ```shell - # CUDA version - conda install pytorch torchvision cudatoolkit=10.2 -c pytorch - # CPU version - conda install pytorch torchvision cpuonly -c pytorch - ``` + ```shell + # CUDA version + conda install pytorch torchvision cudatoolkit=10.2 -c pytorch + # CPU version + conda install pytorch torchvision cpuonly -c pytorch + ``` 4. 准备 MMCV 源代码 - ```shell - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` + ```shell + git clone https://github.com/open-mmlab/mmcv.git + cd mmcv + ``` 5. 安装所需 Python 依赖包 - ```shell - pip3 install -r requirements.txt - ``` + ```shell + pip3 install -r requirements/runtime.txt + ``` + +6. 建议安装 `ninja` 以加快编译速度 + + ```bash + pip install -r requirements/optional.txt + ``` #### 编译与安装 MMCV @@ -96,33 +109,33 @@ MMCV 有三种安装的模式: 1. Lite 版本(不包含算子) - 这种方式下,没有算子被编译,这种模式的 mmcv 是原生的 python 包 + 这种方式下,没有算子被编译,这种模式的 mmcv 是原生的 python 包 2. Full 版本(只包含 CPU 算子) - 编译 CPU 算子,但只有 x86 将会被编译,并且编译版本只能在 CPU only 情况下运行 + 编译 CPU 算子,但只有 x86 将会被编译,并且编译版本只能在 CPU only 情况下运行 3. Full 版本(既包含 CPU 算子,又包含 CUDA 算子) - 同时编译 CPU 和 CUDA 算子,`ops` 模块的 x86 与 CUDA 的代码都可以被编译。同时编译的版本可以在 CUDA 上调用 GPU + 同时编译 CPU 和 CUDA 算子,`ops` 模块的 x86 与 CUDA 的代码都可以被编译。同时编译的版本可以在 CUDA 上调用 GPU ##### 通用步骤 1. 设置 MSVC 编译器 - 设置环境变量。添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`,则 `cl.exe` 可以在命令行中运行,如下所示。 + 设置环境变量。添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`,则 `cl.exe` 可以在命令行中运行,如下所示。 - ```none - (base) PS C:\Users\xxx> cl - Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 - Copyright (C) Microsoft Corporation. All rights reserved. + ```none + (base) PS C:\Users\xxx> cl + Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 + Copyright (C) Microsoft Corporation. All rights reserved. - usage: cl [ option... ] filename... 
[ / link linkoption... ] - ``` + usage: cl [ option... ] filename... [ / link linkoption... ] + ``` - 为了兼容性,我们使用 x86-hosted 以及 x64-targeted 版本,即路径中的 `Hostx86\x64` 。 + 为了兼容性,我们使用 x86-hosted 以及 x64-targeted 版本,即路径中的 `Hostx86\x64` 。 - 因为 PyTorch 将解析 `cl.exe` 的输出以检查其版本,只有 utf-8 将会被识别,你可能需要将系统语言更改为英语。控制面板 -> 地区-> 管理-> 非 Unicode 来进行语言转换。 + 因为 PyTorch 将解析 `cl.exe` 的输出以检查其版本,只有 utf-8 将会被识别,你可能需要将系统语言更改为英语。控制面板 -> 地区-> 管理-> 非 Unicode 来进行语言转换。 ##### 安装方式一:Lite version(不包含算子) @@ -145,20 +158,20 @@ pip list 2. 设置环境变量 - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 - ``` + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 + ``` 3. 编译安装 - ```shell - conda activate mmcv # 激活环境 - cd mmcv # 改变路径 - python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 - python setup.py develop # 安装 - pip list # 检查是否安装成功 - ``` + ```shell + conda activate mmcv # 激活环境 + cd mmcv # 改变路径 + python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 + python setup.py develop # 安装 + pip list # 检查是否安装成功 + ``` ##### 安装方式三:Full version(既编译 CPU 算子又编译 CUDA 算子) @@ -166,38 +179,38 @@ pip list 2. 设置环境变量 - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 - ``` + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 + ``` -3. 检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境变量已经存在在 `envs` 之中 +3. 检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境变量已经存在在 `envs` 之中 - ```none - (base) PS C:\Users\WRH> ls env: + ```none + (base) PS C:\Users\WRH> ls env: - Name Value - ---- ----- - CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 - CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - ``` + Name Value + ---- ----- + CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 + CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + ``` - 如果没有,你可以按照下面的步骤设置 + 如果没有,你可以按照下面的步骤设置 - ```shell - $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" - # 或者 - $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # CUDA_PATH_V10_2 已经在环境变量中 - ``` + ```shell + $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" + # 或者 + $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # CUDA_PATH_V10_2 已经在环境变量中 + ``` 4. 设置 CUDA 的目标架构 - ```shell - $env:TORCH_CUDA_ARCH_LIST="6.1" # 支持 GTX 1080 - # 或者用所有支持的版本,但可能会变得很慢 - $env:TORCH_CUDA_ARCH_LIST="3.5 3.7 5.0 5.2 6.0 6.1 7.0 7.5" - ``` + ```shell + $env:TORCH_CUDA_ARCH_LIST="6.1" # 支持 GTX 1080 + # 或者用所有支持的版本,但可能会变得很慢 + $env:TORCH_CUDA_ARCH_LIST="3.5 3.7 5.0 5.2 6.0 6.1 7.0 7.5" + ``` ```{note} 我们可以在 [here](https://developer.nvidia.com/cuda-gpus) 查看 GPU 的计算能力 @@ -205,15 +218,15 @@ pip list 5. 
编译安装 - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 - conda activate mmcv # 激活环境 - cd mmcv # 改变路径 - python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 - python setup.py develop # 安装 - pip list # 检查是否安装成功 - ``` + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 + conda activate mmcv # 激活环境 + cd mmcv # 改变路径 + python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 + python setup.py develop # 安装 + pip list # 检查是否安装成功 + ``` ```{note} 如果你的 PyTorch 版本是 1.6.0,你可能会遇到一些这个 [issue](https://github.com/pytorch/pytorch/issues/42467) 提到的错误,则可以参考这个 [pull request](https://github.com/pytorch/pytorch/pull/43380/files) 修改 本地环境的 PyTorch 源代码 diff --git a/docs_zh_CN/get_started/installation.md b/docs/zh_cn/get_started/installation.md similarity index 74% rename from docs_zh_CN/get_started/installation.md rename to docs/zh_cn/get_started/installation.md index 20e8cd59545fefb833b35195c1df7b4d3736b281..a6a20b054184623eea17a0852d37121d3fccea58 100644 --- a/docs_zh_CN/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -13,17 +13,17 @@ a. 安装完整版 在安装 mmcv-full 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 PyTorch 官方[文档](https://pytorch.org/)。 -我们提供了不同 PyTorch 和 CUDA 版本的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 +我们提供了 **Linux 和 Windows 平台** PyTorch 和 CUDA 版本组合的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 i. 安装最新版本 -如下是安装最新版 ``mmcv-full`` 的命令 +如下是安装最新版 `mmcv-full` 的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -请将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的最新版 ``mmcv-full``,使用如下替换过的命令 +请将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的最新版 `mmcv-full`,使用如下替换过的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -37,18 +37,18 @@ PyTorch 版本是 1.8.1、CUDA 版本是 11.1,你可以使用以下命令安 `pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html` ``` -如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 ``=={mmcv_version}`` 删去即可。 +如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 `=={mmcv_version}` 删去即可。 ii. 安装特定的版本 -如下是安装特定版本 ``mmcv-full`` 的命令 +如下是安装特定版本 `mmcv-full` 的命令 ```shell pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -首先请参考版本发布信息找到想要安装的版本号,将 ``{mmcv_version}`` 替换成该版本号,例如 ``1.3.9``。 -然后将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的 ``mmcv-full`` 1.3.9 版本,使用如下替换过的命令 +首先请参考版本发布信息找到想要安装的版本号,将 `{mmcv_version}` 替换成该版本号,例如 `1.3.9`。 +然后将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的 `mmcv-full` 1.3.9 版本,使用如下替换过的命令 ```shell pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -60,15 +60,27 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t CUDA - torch 1.10 - torch 1.9 - torch 1.8 - torch 1.7 - torch 1.6 - torch 1.5 + torch 1.11 + torch 1.10 + torch 1.9 + torch 1.8 + torch 1.7 + torch 1.6 + torch 1.5 + + + 11.5 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
+ + + + + + 11.3 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
@@ -78,6 +90,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 11.1 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
@@ -90,12 +103,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
@@ -107,6 +122,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 10.1 +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
@@ -117,12 +133,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
@@ -134,7 +152,11 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t ```{note} -以上提供的预编译包并不囊括所有的 mmcv-full 版本,我们可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./docs_zh_CN/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 +以上提供的预编译包并不囊括所有的 mmcv-full 版本,我们可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 +``` + +```{note} +mmcv-full 没有提供 Windows 平台 `cu102-torch1.8.0` 和 `cu92-torch*` 的预编译包。 ``` 除了使用预编译包之外,另一种方式是在本地进行编译,直接运行下述命令 diff --git a/docs_zh_CN/get_started/introduction.md b/docs/zh_cn/get_started/introduction.md similarity index 62% rename from docs_zh_CN/get_started/introduction.md rename to docs/zh_cn/get_started/introduction.md index 0082ae88a6a94fb09c76d9a821121ceb58b901a5..990713254928616f53240ca6f8926d9d1e5a8aec 100644 --- a/docs_zh_CN/get_started/introduction.md +++ b/docs/zh_cn/get_started/introduction.md @@ -2,16 +2,24 @@ MMCV 是一个面向计算机视觉的基础库,它支持了很多开源项目,例如: +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 MMCV 提供了如下众多功能: @@ -23,6 +31,12 @@ MMCV 提供了如下众多功能: - 多种 CNN 网络结构 - 高质量实现的常见 CUDA 算子 +MMCV 支持以下的系统: + +- Linux +- Windows +- macOS + 如想了解更多特性和使用,请参考[文档](https://mmcv.readthedocs.io/zh_CN/latest)。 ```{note} diff --git a/docs_zh_CN/get_started/previous_versions.md b/docs/zh_cn/get_started/previous_versions.md similarity index 93% rename from docs_zh_CN/get_started/previous_versions.md rename to docs/zh_cn/get_started/previous_versions.md index 
56679d48181290768f33d0da866b7399ca63e710..d543818752b51985169d4489bd46708725ce422d 100644 --- a/docs_zh_CN/get_started/previous_versions.md +++ b/docs/zh_cn/get_started/previous_versions.md @@ -1,11 +1,10 @@ - ## 其他版本的 PyTorch 我们不再提供在较低的 `PyTorch` 版本下编译的 `mmcv-full` 包,但为了您的方便,您可以在下面找到它们。 ### PyTorch 1.4 -| 1.0.0 <= mmcv_version <= 1.2.1 +| 1.0.0 \<= mmcv_version \<= 1.2.1 #### CUDA 10.1 @@ -27,7 +26,7 @@ pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dis ### PyTorch v1.3 -| 1.0.0 <= mmcv_version <= 1.3.16 +| 1.0.0 \<= mmcv_version \<= 1.3.16 #### CUDA 10.1 diff --git a/docs_zh_CN/index.rst b/docs/zh_cn/index.rst similarity index 100% rename from docs_zh_CN/index.rst rename to docs/zh_cn/index.rst diff --git a/docs_zh_CN/make.bat b/docs/zh_cn/make.bat similarity index 100% rename from docs_zh_CN/make.bat rename to docs/zh_cn/make.bat diff --git a/docs_zh_CN/mmcv-logo.png b/docs/zh_cn/mmcv-logo.png similarity index 100% rename from docs_zh_CN/mmcv-logo.png rename to docs/zh_cn/mmcv-logo.png diff --git a/docs/zh_cn/understand_mmcv/cnn.md b/docs/zh_cn/understand_mmcv/cnn.md new file mode 100644 index 0000000000000000000000000000000000000000..aa8584f72f3825080c8620dadaf947a591bed22a --- /dev/null +++ b/docs/zh_cn/understand_mmcv/cnn.md @@ -0,0 +1,570 @@ +## 卷积神经网络 + +我们为卷积神经网络提供了一些构建模块,包括层构建、模块组件和权重初始化。 + +### 网络层的构建 + +在运行实验时,我们可能需要尝试同属一种类型但不同配置的层,但又不希望每次都修改代码。于是我们提供一些层构建方法,可以从字典构建层,字典可以在配置文件中配置,也可以通过命令行参数指定。 + +#### 用法 + +一个简单的例子: + +```python +cfg = dict(type='Conv3d') +layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) +``` + +- `build_conv_layer`: 支持的类型包括 Conv1d、Conv2d、Conv3d、Conv (Conv是Conv2d的别名) +- `build_norm_layer`: 支持的类型包括 BN1d、BN2d、BN3d、BN (alias for BN2d)、SyncBN、GN、LN、IN1d、IN2d、IN3d、IN(IN是IN2d的别名) +- `build_activation_layer`:支持的类型包括 ReLU、LeakyReLU、PReLU、RReLU、ReLU6、ELU、Sigmoid、Tanh、GELU +- `build_upsample_layer`: 支持的类型包括 nearest、bilinear、deconv、pixel_shuffle +- `build_padding_layer`: 支持的类型包括 zero、reflect、replicate + +#### 拓展 + +我们还允许自定义层和算子来扩展构建方法。 + +1. 编写和注册自己的模块: + + ```python + from mmcv.cnn import UPSAMPLE_LAYERS + + @UPSAMPLE_LAYERS.register_module() + class MyUpsample: + + def __init__(self, scale_factor): + pass + + def forward(self, x): + pass + ``` + +2. 
在某处导入 `MyUpsample` (例如 `__init__.py` )然后使用它: + + ```python + cfg = dict(type='MyUpsample', scale_factor=2) + layer = build_upsample_layer(cfg) + ``` + +### 模块组件 + +我们还提供了常用的模块组件,以方便网络构建。 +卷积组件 `ConvModule` 由 convolution、normalization以及activation layers 组成,更多细节请参考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。 + +```python +# conv + bn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) +# conv + gn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) +# conv + relu +conv = ConvModule(3, 8, 2) +# conv +conv = ConvModule(3, 8, 2, act_cfg=None) +# conv + leaky relu +conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) +# bn + conv + relu +conv = ConvModule( + 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) +``` + +### Weight initialization + +> 实现细节可以在 [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py)中找到 + +在训练过程中,适当的初始化策略有利于加快训练速度或者获得更高的性能。 在MMCV中,我们提供了一些常用的方法来初始化模块,比如 `nn.Conv2d` 模块。当然,我们也提供了一些高级API,可用于初始化包含一个或多个模块的模型。 + +#### Initialization functions + +以函数的方式初始化 `nn.Module` ,例如 `nn.Conv2d` 、 `nn.Linear` 等。 + +我们提供以下初始化方法, + +- constant_init + + 使用给定常量值初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import constant_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # constant_init(module, val, bias=0) + >>> constant_init(conv1, 1, 0) + >>> conv1.weight + ``` + +- xavier_init + + 按照 [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) 描述的方法初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # xavier_init(module, gain=1, bias=0, distribution='normal') + >>> xavier_init(conv1, distribution='normal') + ``` + +- normal_init + + 使用正态分布(高斯分布)初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import normal_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # normal_init(module, mean=0, std=1, bias=0) + >>> normal_init(conv1, std=0.01, bias=0) + ``` + +- uniform_init + + 使用均匀分布初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import uniform_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # uniform_init(module, a=0, b=1, bias=0) + >>> uniform_init(conv1, a=0, b=1) + ``` + +- kaiming_init + + 按照 [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification - He, K. et al. 
(2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) 描述的方法来初始化模型参数。 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import kaiming_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') + >>> kaiming_init(conv1) + ``` + +- caffe2_xavier_init + + caffe2中实现的 `xavier initialization`,对应于 PyTorch中的 `kaiming_uniform_` + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import caffe2_xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # caffe2_xavier_init(module, bias=0) + >>> caffe2_xavier_init(conv1) + ``` + +- bias_init_with_prob + + 根据给定的概率初始化 `conv/fc`, 这在 [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf) 提出。 + + ```python + >>> from mmcv.cnn import bias_init_with_prob + >>> # bias_init_with_prob is proposed in Focal Loss + >>> bias = bias_init_with_prob(0.01) + >>> bias + -4.59511985013459 + ``` + +#### Initializers and configs + +在初始化方法的基础上,我们定义了相应的初始化类,并将它们注册到 `INITIALIZERS` 中,这样我们就可以使用 `config` 配置来初始化模型了。 + +我们提供以下初始化类: + +- ConstantInit +- XavierInit +- NormalInit +- UniformInit +- KaimingInit +- Caffe2XavierInit +- PretrainedInit + +接下来详细介绍 `initialize` 的使用方法 + +1. 通过关键字 `layer` 来初始化模型 + + 如果我们只定义了关键字 `layer` ,那么只初始化 `layer` 中包含的层。 + + 注意: 关键字 `layer` 支持的模块是带有 weights 和 bias 属性的 PyTorch 模块,所以不支持 `MultiheadAttention layer` + +- 定义关键字 `layer` 列表并使用相同相同配置初始化模块 + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1, 2) + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # 使用相同的配置初始化整个模块 + initialize(model, init_cfg) + # model.feat.weight + # Parameter containing: + # tensor([[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]], requires_grad=True) + ``` + +- 定义关键字 `layer` 用于初始化不同配置的层 + + ```python + import torch.nn as nn + from mmcv.cnn.utils import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1,2) + + model = FooNet() + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d 使用 dict(type='Constant', val=1) 初始化 + # nn.Conv2d 使用 dict(type='Constant', val=2) 初始化 + # nn.Linear 使用 dict(type='Constant', val=3) 初始化 + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +2. 
定义关键字`override`初始化模型 + +- 当用属性名初始化某个特定部分时, 我们可以使用关键字 `override`, 关键字 `override` 对应的Value会替代init_cfg中相应的值 + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + + # 如果我们想将模型的权重初始化为 1,将偏差初始化为 2 + # 但希望 `reg` 中的权重为 3,偏差为 4,则我们可以使用关键字override + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # 使用 dict(type='Constant', val=1, bias=2)来初始化 self.feat and self.cls + # 使用dict(type='Constant', val=3, bias=4)来初始化‘reg’模块。 + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]], + # ..., + # [[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]]]], requires_grad=True) + ``` + +- 如果 init_cfg 中的关键字`layer`为None,则只初始化在关键字override中的子模块,并且省略override中的 type 和其他参数 + + ```python + model = FooNet() + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + # self.feat 和 self.cls 使用pyTorch默认的初始化 + # 将使用 dict(type='Constant', val=1, bias=2) 初始化名为 'reg' 的模块 + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # ..., + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]]], requires_grad=True) + ``` + +- 如果我们没有定义关键字`layer`或`override` , 将不会初始化任何东西 + +- 关键字`override`的无效用法 + + ```python + # 没有重写任何子模块 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # 没有指定type,即便有其他参数,也是无效的。 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. 用预训练模型初始化 + + ```python + import torch.nn as nn + import torchvision.models as models + from mmcv.cnn import initialize + + # 使用预训练模型来初始化 + model = models.resnet50() + # model.conv1.weight + # Parameter containing: + # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, + # -1.8077e-03, 3.0338e-03], + # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, + # 1.1655e-02, 4.5889e-03], + # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, + # 1.7314e-02, -9.9998e-03], + # ..., + + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + initialize(model, init_cfg) + # model.conv1.weight + # Parameter containing: + # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, + # -4.3034e-02, -7.0755e-02], + # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, + # -2.0912e-02, -3.8517e-02], + # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, + # 6.2641e-02, 5.1977e-02], + # ..., + + # 使用关键字'prefix'用预训练模型的特定部分来初始化子模块权重 + model = models.resnet50() + url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + 'retinanet_r50_fpn_1x_coco/'\ + 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + initialize(model, init_cfg) + ``` + +4. 
初始化继承自BaseModule、Sequential、ModuleList、ModuleDict的模型 + + `BaseModule` 继承自 `torch.nn.Module`, 它们之间唯一的不同是 `BaseModule` 实现了 `init_weight` + + `Sequential` 继承自 `BaseModule` 和 `torch.nn.Sequential` + + `ModuleList` 继承自 `BaseModule` 和 `torch.nn.ModuleList` + + `ModuleDict` 继承自 `BaseModule` 和 `torch.nn.ModuleDict` + + ```python + import torch.nn as nn + from mmcv.runner import BaseModule, Sequential, ModuleList, ModuleDict + + class FooConv1d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv1d = nn.Conv1d(4, 1, 4) + + def forward(self, x): + return self.conv1d(x) + + class FooConv2d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv2d = nn.Conv2d(3, 1, 3) + + def forward(self, x): + return self.conv2d(x) + + # BaseModule + init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + model = FooConv1d(init_cfg) + model.init_weights() + # model.conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + + # Sequential + init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + seq_model = Sequential(model1, model2) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + seq_model = Sequential(model1, model2, init_cfg=init_cfg) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleList + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modellist = ModuleList([model1, model2]) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
+ modellist = ModuleList([model1, model2], init_cfg=init_cfg) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleDict + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modeldict = ModuleDict(dict(model1=model1, model2=model2)) + modeldict.init_weights() + # modeldict['model1'].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modeldict['model2'].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + modeldict = ModuleDict(dict(model1=model1, model2=model2), init_cfg=init_cfg) + modeldict.init_weights() + # modeldict['model1'].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modeldict['model2'].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +### Model Zoo + +除了`torchvision`的预训练模型,我们还提供以下 CNN 的预训练模型: + +- VGG Caffe +- ResNet Caffe +- ResNeXt +- ResNet with Group Normalization +- ResNet with Group Normalization and Weight Standardization +- HRNetV2 +- Res2Net +- RegNet + +#### Model URLs in JSON + +MMCV中的Model Zoo Link 由 JSON 文件管理。 json 文件由模型名称及其url或path的键值对组成,一个json文件可能类似于: + +```json +{ + "model_a": "https://example.com/models/model_a_9e5bac.pth", + "model_b": "pretrain/model_b_ab3ef2c.pth" +} +``` + +可以在[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json)找到托管在 OpenMMLab AWS 上的预训练模型的默认链接。 + +你可以通过将 `open-mmlab.json` 放在 `MMCV_HOME`下来覆盖默认链接,如果在环境中找不到`MMCV_HOME`,则默认使用 `~/.cache/mmcv`。当然你也可以使用命令 `export MMCV_HOME=/your/path`来设置自己的路径。 + +外部的json文件将被合并为默认文件,如果相同的键出现在外部`json`和默认`json`中,则将使用外部`json`。 + +#### Load Checkpoint + +`mmcv.load_checkpoint()`的参数`filename`支持以下类型: + +- filepath: `checkpoint`路径 +- `http://xxx` and `https://xxx`: 下载checkpoint的链接,文件名中必需包含`SHA256`后缀 +- `torchvision://xxx`: `torchvision.models`中的模型链接,更多细节参考 [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) +- `open-mmlab://xxx`: 默认和其他 json 文件中提供的模型链接或文件路径 diff --git a/docs_zh_CN/understand_mmcv/config.md b/docs/zh_cn/understand_mmcv/config.md similarity index 99% rename from docs_zh_CN/understand_mmcv/config.md rename to docs/zh_cn/understand_mmcv/config.md index c6da308833ebb3e1588d7dfb5ba66cc90fb5ee42..52d7ab37b4a375cf67a08fdb1ae7add4672c2d44 100644 --- a/docs_zh_CN/understand_mmcv/config.md +++ b/docs/zh_cn/understand_mmcv/config.md @@ -40,6 +40,7 @@ d = 'string' 这里是一个带有预定义变量的配置文件的例子。 `config_a.py` + ```python a = 1 b = './work_dir/{{ fileBasenameNoExtension }}' @@ -65,6 +66,7 @@ c = '{{ fileExtname }}' a = 1 b = dict(b1=[0, 1, 2], b2=None) ``` + ### 不含重复键值对从基类配置文件继承 `config_b.py` @@ -83,6 +85,7 @@ d = 
'string' ... c=(1, 2), ... d='string') ``` + 在`config_b.py`里的新字段与在`config_a.py`里的旧字段拼接 ### 含重复键值对从基类配置文件继承 diff --git a/docs_zh_CN/understand_mmcv/data_process.md b/docs/zh_cn/understand_mmcv/data_process.md similarity index 96% rename from docs_zh_CN/understand_mmcv/data_process.md rename to docs/zh_cn/understand_mmcv/data_process.md index 0885fe03353738d42b4503c9dddf4ec70883c5bb..98f00f1ed6a33f3dcbdb662008621474bb45b7ef 100644 --- a/docs_zh_CN/understand_mmcv/data_process.md +++ b/docs/zh_cn/understand_mmcv/data_process.md @@ -252,9 +252,9 @@ flow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1) mmcv.flowshow(flow) ``` -![progress](../../docs/_static/flow_visualization.png) +![progress](../../en/_static/flow_visualization.png) -3. 流变换 +1. 流变换 ```python img1 = mmcv.imread('img1.jpg') @@ -264,12 +264,12 @@ warpped_img2 = mmcv.flow_warp(img1, flow) img1 (左) and img2 (右) -![raw images](../../docs/_static/flow_raw_images.png) +![raw images](../../en/_static/flow_raw_images.png) 光流 (img2 -> img1) -![optical flow](../../docs/_static/flow_img2toimg1.png) +![optical flow](../../en/_static/flow_img2toimg1.png) 变换后的图像和真实图像的差异 -![warpped image](../../docs/_static/flow_warp_diff.png) +![warpped image](../../en/_static/flow_warp_diff.png) diff --git a/docs_zh_CN/understand_mmcv/io.md b/docs/zh_cn/understand_mmcv/io.md similarity index 99% rename from docs_zh_CN/understand_mmcv/io.md rename to docs/zh_cn/understand_mmcv/io.md index 0e5002f828f5489ee0447d65501de78e20d3f093..eb4fe14ba1102effa43acb906e23ffdd95ecf5c6 100644 --- a/docs_zh_CN/understand_mmcv/io.md +++ b/docs/zh_cn/understand_mmcv/io.md @@ -107,6 +107,7 @@ c d e ``` + #### 从硬盘读取 使用 `list_from_file` 读取 `a.txt` diff --git a/docs/zh_cn/understand_mmcv/ops.md b/docs/zh_cn/understand_mmcv/ops.md new file mode 100644 index 0000000000000000000000000000000000000000..82c9eb4fcabc42f2506f22cfc4b5cc5881ae939a --- /dev/null +++ b/docs/zh_cn/understand_mmcv/ops.md @@ -0,0 +1,60 @@ +## 算子 + +MMCV 提供了检测、分割等任务中常用的算子 + +| Device | CPU | CUDA | MLU | MPS | +| ---------------------------- | --- | ---- | --- | --- | +| ActiveRotatedFilter | √ | √ | | | +| AssignScoreWithK | | √ | | | +| BallQuery | | √ | | | +| BBoxOverlaps | | √ | √ | √ | +| BorderAlign | | √ | | | +| BoxIouRotated | √ | √ | | | +| CARAFE | | √ | | | +| ChamferDistance | | √ | | | +| CrissCrossAttention | | √ | | | +| ContourExpand | √ | | | | +| ConvexIoU | | √ | | | +| CornerPool | | √ | | | +| Correlation | | √ | | | +| Deformable Convolution v1/v2 | √ | √ | | | +| Deformable RoIPool | | √ | | | +| DiffIoURotated | | √ | | | +| DynamicScatter | | √ | | | +| FurthestPointSample | | √ | | | +| FurthestPointSampleWithDist | | √ | | | +| FusedBiasLeakyrelu | | √ | | | +| GatherPoints | | √ | | | +| GroupPoints | | √ | | | +| Iou3d | | √ | | | +| KNN | | √ | | | +| MaskedConv | | √ | | | +| MergeCells | | √ | | | +| MinAreaPolygon | | √ | | | +| ModulatedDeformConv2d | √ | √ | | | +| MultiScaleDeformableAttn | | √ | | | +| NMS | √ | √ | √ | | +| NMSRotated | √ | √ | | | +| PixelGroup | √ | | | | +| PointsInBoxes | √ | √ | | | +| PointsInPolygons | | √ | | | +| PSAMask | √ | √ | √ | | +| RotatedFeatureAlign | √ | √ | | | +| RoIPointPool3d | | √ | | | +| RoIPool | | √ | √ | | +| RoIAlignRotated | √ | √ | √ | | +| RiRoIAlignRotated | | √ | | | +| RoIAlign | √ | √ | √ | | +| RoIAwarePool3d | | √ | | | +| SAConv2d | | √ | | | +| SigmoidFocalLoss | | √ | √ | | +| SoftmaxFocalLoss | | √ | | | +| SoftNMS | | √ | | | +| Sparse Convolution | | √ | | | +| Synchronized BatchNorm | | √ | | | +| 
ThreeInterpolate | | √ | | |
+| ThreeNN | | √ | | |
+| TINShift | | √ | √ | |
+| UpFirDn2d | | √ | | |
+| Voxelization | √ | √ | | |
+| PrRoIPool | | √ | | |
diff --git a/docs_zh_CN/understand_mmcv/registry.md b/docs/zh_cn/understand_mmcv/registry.md
similarity index 76%
rename from docs_zh_CN/understand_mmcv/registry.md
rename to docs/zh_cn/understand_mmcv/registry.md
index 3afd0ab66e8e9787280ce54cdfb807e2acf60827..325baa41db36f13fc627ccb57759fb5210e696f9 100644
--- a/docs_zh_CN/understand_mmcv/registry.md
+++ b/docs/zh_cn/understand_mmcv/registry.md
@@ -1,11 +1,17 @@
## 注册器
+
MMCV 使用 [注册器](https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/registry.py) 来管理具有相似功能的不同模块, 例如, 检测器中的主干网络、头部、和模型颈部。
在 OpenMMLab 家族中的绝大部分开源项目使用注册器去管理数据集和模型的模块,例如 [MMDetection](https://github.com/open-mmlab/mmdetection), [MMDetection3D](https://github.com/open-mmlab/mmdetection3d), [MMClassification](https://github.com/open-mmlab/mmclassification), [MMEditing](https://github.com/open-mmlab/mmediting) 等。

+```{note}
+从 v1.5.1 版本开始支持注册函数的功能。
+```
+
### 什么是注册器
-在MMCV中,注册器可以看作类到字符串的映射。
-一个注册器中的类通常有相似的接口,但是可以实现不同的算法或支持不同的数据集。
-借助注册器,用户可以通过使用相应的字符串查找并实例化该类,并根据他们的需要实例化对应模块。
+
+在MMCV中,注册器可以看作类或函数到字符串的映射。
+一个注册器中的类或函数通常有相似的接口,但是可以实现不同的算法或支持不同的数据集。
+借助注册器,用户可以通过使用相应的字符串查找类或函数,并根据他们的需要实例化对应模块或调用函数获取结果。
一个典型的案例是,OpenMMLab 中的大部分开源项目的配置系统,这些系统通过配置文件来使用注册器创建钩子、执行器、模型和数据集。
可以在[这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.Registry)找到注册器接口使用文档。
@@ -15,7 +21,7 @@ MMCV 使用 [注册器](https://github.com/open-mmlab/mmcv/blob/master/mmcv/util
2. 创建注册器
3. 使用此注册器来管理模块

-`Registry`(注册器)的参数 `build_func`(构建函数) 用来自定以如何实例化类的实例,默认使用 [这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg)实现的`build_from_cfg`。
+`Registry`(注册器)的参数 `build_func`(构建函数) 用来自定义如何实例化类的实例或如何调用函数获取结果,默认使用 [这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg) 实现的`build_from_cfg`。

### 一个简单的例子
@@ -29,9 +35,10 @@ from mmcv.utils import Registry
CONVERTERS = Registry('converter')
```

-然后我们在包中可以实现不同的转换器(converter)。例如,在 `converters/converter1.py` 中实现 `Converter1`。
+然后我们在包中可以实现不同的转换器(converter),其可以为类或函数。例如,在 `converters/converter1.py` 中实现 `Converter1`,在 `converters/converter2.py` 中实现 `converter2`。

```python
+# converter1.py
from .builder import CONVERTERS

# 使用注册器管理模块
@@ -41,19 +48,39 @@ class Converter1(object):
        self.a = a
        self.b = b
```
-使用注册器管理模块的关键步骤是,将实现的模块注册到注册表 `CONVERTERS` 中。通过 `@CONVERTERS.register_module()` 装饰所实现的模块,字符串和类之间的映射就可以由 `CONVERTERS` 构建和维护,如下所示:
-通过这种方式,就可以通过 `CONVERTERS` 建立字符串与类之间的映射,如下所示:
+```python
+# converter2.py
+from .builder import CONVERTERS
+from .converter1 import Converter1
+
+# 使用注册器管理模块
+@CONVERTERS.register_module()
+def converter2(a, b):
+    return Converter1(a, b)
+```
+
+使用注册器管理模块的关键步骤是,将实现的模块注册到注册表 `CONVERTERS` 中。通过 `@CONVERTERS.register_module()` 装饰所实现的模块,字符串到类或函数之间的映射就可以由 `CONVERTERS` 构建和维护,如下所示:
+
+通过这种方式,就可以通过 `CONVERTERS` 建立字符串与类或函数之间的映射,如下所示:

```python
'Converter1' -> <class 'Converter1'>
+'converter2' -> <function converter2>
+```
+
+```{note}
+只有模块所在的文件被导入时,注册机制才会被触发,所以您需要在某处导入该文件。更多详情请查看 https://github.com/open-mmlab/mmdetection/issues/5974。
```

如果模块被成功注册了,你可以通过配置文件使用这个转换器(converter),如下所示:

```python
-converter_cfg = dict(type='Converter1', a=a_value, b=b_value)
-converter = CONVERTERS.build(converter_cfg)
+converter1_cfg = dict(type='Converter1', a=a_value, b=b_value)
+converter2_cfg = dict(type='converter2', a=a_value, b=b_value)
+converter1 = CONVERTERS.build(converter1_cfg)
+# returns the calling result
+result = CONVERTERS.build(converter2_cfg)
```

### 自定义构建函数
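下面给出一个自定义构建函数的简化示意(仅作示例草稿,假设沿用上文的 `CONVERTERS` 注册器,并假设配置中的类型键仍为 `type`;文档中省略的原始 `build_converter` 实现可能与此不同):

```python
# 一个简化的自定义构建函数示意(示例假设,非 MMCV 源码的原始实现)
from mmcv.utils import Registry


def build_converter(cfg, registry, *args, **kwargs):
    # 复制配置,取出 type 字段,并从注册器中查找对应的类或函数
    cfg_ = cfg.copy()
    converter_type = cfg_.pop('type')
    converter_cls = registry.get(converter_type)
    # 用剩余的配置字段实例化类(或调用函数)并返回结果
    return converter_cls(*args, **kwargs, **cfg_)


# 创建注册器时通过 build_func 传入自定义的构建逻辑
CONVERTERS = Registry('converter', build_func=build_converter)
```

之后仍然可以像前面的例子一样,通过 `CONVERTERS.build(dict(type='Converter1', a=a_value, b=b_value))` 构建对应的模块。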
@@ -84,7 +111,7 @@ CONVERTERS = Registry('converter', build_func=build_converter) 该功能类似于默认的`build_from_cfg`。在大多数情况下,默认就足够了。 ``` -`build_model_from_cfg`也实现了在`nn.Sequentail`中构建PyTorch模块,你可以直接使用它们。 +`build_model_from_cfg`也实现了在`nn.Sequential`中构建PyTorch模块,你可以直接使用它们。 ### 注册器层结构 diff --git a/docs_zh_CN/understand_mmcv/runner.md b/docs/zh_cn/understand_mmcv/runner.md similarity index 88% rename from docs_zh_CN/understand_mmcv/runner.md rename to docs/zh_cn/understand_mmcv/runner.md index 203a5dcacfd709772dce8c411a25bb8a623e0dd7..7098eb977f998ed67fc2a6fc66b0d436c47f3d75 100644 --- a/docs_zh_CN/understand_mmcv/runner.md +++ b/docs/zh_cn/understand_mmcv/runner.md @@ -8,7 +8,7 @@ ### EpochBasedRunner -顾名思义,`EpochBasedRunner` 是指以 epoch 为周期的工作流,例如设置 workflow = [('train', 2), ('val', 1)] 表示循环迭代地训练 2 个 epoch,然后验证 1 个 epoch。MMDetection 目标检测框架默认采用的是 `EpochBasedRunner`。 +顾名思义,`EpochBasedRunner` 是指以 epoch 为周期的工作流,例如设置 workflow = \[('train', 2), ('val', 1)\] 表示循环迭代地训练 2 个 epoch,然后验证 1 个 epoch。MMDetection 目标检测框架默认采用的是 `EpochBasedRunner`。 其抽象逻辑如下所示: @@ -25,6 +25,7 @@ while curr_epoch < max_epochs: for _ in range(epochs): epoch_runner(data_loaders[i], **kwargs) ``` + 目前支持训练和验证两个工作流,以训练函数为例,其抽象逻辑是: ```python @@ -40,7 +41,8 @@ def train(self, data_loader, **kwargs): ``` ### IterBasedRunner -不同于 `EpochBasedRunner`,`IterBasedRunner` 是指以 iter 为周期的工作流,例如设置 workflow = [('train', 2), ('val', 1)] 表示循环迭代的训练 2 个 iter,然后验证 1 个 iter,MMSegmentation 语义分割框架默认采用的是 `EpochBasedRunner`。 + +不同于 `EpochBasedRunner`,`IterBasedRunner` 是指以 iter 为周期的工作流,例如设置 workflow = \[('train', 2), ('val', 1)\] 表示循环迭代的训练 2 个 iter,然后验证 1 个 iter,MMSegmentation 语义分割框架默认采用的是 `IterBasedRunner`。 其抽象逻辑如下所示: @@ -59,6 +61,7 @@ while curr_iter < max_iters: for _ in range(iters): iter_runner(iter_loaders[i], **kwargs) ``` + 目前支持训练和验证两个工作流,以验证函数为例,其抽象逻辑是: ```python @@ -75,6 +78,7 @@ def val(self, data_loader, **kwargs): 除了上述基础功能外,`EpochBasedRunner` 和 `IterBasedRunner` 还提供了 resume 、 save_checkpoint 和注册 hook 功能。 ### 一个简单例子 + 以最常用的分类任务为例详细说明 `runner` 的使用方法。 开启任何一个训练任务,都需要包括如下步骤: **(1) dataloader、model 和优化器等类初始化** @@ -148,8 +152,8 @@ runner.run(data_loaders, cfg.workflow) 关于 workflow 设置,以 `EpochBasedRunner` 为例,详情如下: -- 假设只想运行训练工作流,则可以设置 workflow = [('train', 1)],表示只进行迭代训练 -- 假设想运行训练和验证工作流,则可以设置 workflow = [('train', 3), ('val', 1)],表示先训练 3 个 epoch ,然后切换到 val 工作流,运行 1 个 epoch,然后循环,直到训练 epoch 次数达到指定值 -- 工作流设置还自由定制,例如你可以先验证再训练 workflow = [('val', 1), ('train', 1)] +- 假设只想运行训练工作流,则可以设置 workflow = \[('train', 1)\],表示只进行迭代训练 +- 假设想运行训练和验证工作流,则可以设置 workflow = \[('train', 3), ('val', 1)\],表示先训练 3 个 epoch ,然后切换到 val 工作流,运行 1 个 epoch,然后循环,直到训练 epoch 次数达到指定值 +- 工作流设置还自由定制,例如你可以先验证再训练 workflow = \[('val', 1), ('train', 1)\] 上述代码都已经封装到了各个代码库的 train.py 中,用户只需要设置相应的配置即可,上述流程会自动运行。 diff --git a/docs_zh_CN/understand_mmcv/utils.md b/docs/zh_cn/understand_mmcv/utils.md similarity index 93% rename from docs_zh_CN/understand_mmcv/utils.md rename to docs/zh_cn/understand_mmcv/utils.md index 746c560039759df3e6f76ae665e63812ed3c9ed6..c02e5203a4cde69e9f9f332b047bfea25c151bb4 100644 --- a/docs_zh_CN/understand_mmcv/utils.md +++ b/docs/zh_cn/understand_mmcv/utils.md @@ -17,7 +17,7 @@ mmcv.track_progress(func, tasks) ``` 效果如下 -![progress](../../docs/_static/progress.*) +![progress](../../en/_static/progress.*) 如果你想可视化多进程任务的进度,你可以使用 `track_parallel_progress` 。 @@ -25,7 +25,7 @@ mmcv.track_progress(func, tasks) mmcv.track_parallel_progress(func, tasks, 8) # 8 workers ``` -![progress](../../docs/_static/parallel_progress.*) +![progress](../../_static/parallel_progress.*) 如果你想要迭代或枚举数据列表并可视化进度,你可以使用 
`track_iter_progress` 。 @@ -58,7 +58,6 @@ with mmcv.Timer(): 你也可以使用 `since_start()` 和 `since_last_check()` 。前者返回计时器启动后的运行时长,后者返回最近一次查看计时器后的运行时长。 - ```python timer = mmcv.Timer() # code block 1 here diff --git a/docs_zh_CN/understand_mmcv/visualization.md b/docs/zh_cn/understand_mmcv/visualization.md similarity index 100% rename from docs_zh_CN/understand_mmcv/visualization.md rename to docs/zh_cn/understand_mmcv/visualization.md diff --git a/docs_zh_CN/community/pr.md b/docs_zh_CN/community/pr.md deleted file mode 100644 index 219e01dd747827adedddd922310624f97ff10672..0000000000000000000000000000000000000000 --- a/docs_zh_CN/community/pr.md +++ /dev/null @@ -1,90 +0,0 @@ -## 拉取请求 - -### 什么是拉取请求? - -`拉取请求` (Pull Request), [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests)定义如下。 - ->拉取请求是一种通知机制。你修改了他人的代码,将你的修改通知原来作者,希望他合并你的修改。 - -### 基本的工作流: - -1. 获取最新的代码库 -2. 从主分支创建最新的分支进行开发 -3. 提交修改 -4. 推送你的修改并创建一个`拉取请求` -5. 讨论、审核代码 -6. 将开发分支合并到主分支 - -### 具体步骤 - -1. 获取最新的代码库 - + 当你第一次提 PR 时 - - 复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮即可 - ![avatar](../../docs/_static/community/1.png) - - - 克隆复刻的代码库到本地 - ```bash - git clone git@github.com:XXX/mmcv.git - ``` - - - 添加原代码库为上游代码库 - ```bash - git remote add upstream git@github.com:open-mmlab/mmcv - ``` - + 从第二个 PR 起 - - 检出本地代码库的主分支,然后从最新的原代码库的主分支拉取更新 - ```bash - git checkout master - git pull upstream master - ``` - -2. 从主分支创建一个新的开发分支 - ```bash - git checkout -b branchname - ``` - 注意:为了保证提交历史清晰可读,我们强烈推荐您先检出主分支 (master),再创建新的分支。 - -3. 提交你的修改 - ```bash - # coding - git add [files] - git commit -m 'messages' - ``` - -4. 推送你的修改到复刻的代码库,并创建一个`拉取请求` - + 推送当前分支到远端复刻的代码库 - ```bash - git push origin branchname - ``` - - + 创建一个`拉取请求` - ![avatar](../../docs/_static/community/2.png) - - + 修改`拉取请求`信息模板,描述修改原因和修改内容。还可以在 PR 描述中,手动关联到相关的`议题` (issue),(更多细节,请参考[官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。 - -5. 讨论并评审你的代码 - + 创建`拉取请求`时,可以关联给相关人员进行评审 - ![avatar](../../docs/_static/community/3.png) - - + 根据评审人员的意见修改代码,并推送修改 - -6. `拉取请求`合并之后删除该分支 -```bash -git branch -d branchname # delete local branch -git push origin --delete branchname # delete remote branch -``` - -### PR 规范 - -1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 -2. 一个PR对应一个短期分支 -3. 粒度要细,一个PR只做一件事情,避免超大的PR - >- Bad:实现Faster R-CNN - >- Acceptable:给 Faster R-CNN 添加一个 box head - >- Good:给 box head 增加一个参数来支持自定义的 conv 层数 -4. 每次 Commit 时需要提供清晰且有意义 commit 信息 -5. 提供清晰且有意义的`拉取请求`描述 - >- 标题写明白任务名称,一般格式:[Prefix] Short description of the pull request (Suffix) - >- prefix: 新增功能 [Feature], 修 bug [Fix], 文档相关 [Docs], 开发中 [WIP] (暂时不会被review) - >- 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 - >- 关联相关的`议题` (issue) 和其他`拉取请求` diff --git a/docs_zh_CN/faq.md b/docs_zh_CN/faq.md deleted file mode 100644 index e5d6395720e9e210771e10256efb926a0da5f4fa..0000000000000000000000000000000000000000 --- a/docs_zh_CN/faq.md +++ /dev/null @@ -1,37 +0,0 @@ -## 常见问题 - -在这里我们列出了用户经常遇到的问题以及对应的解决方法。如果您遇到了其他常见的问题,并且知道可以帮到大家的解决办法, -欢迎随时丰富这个列表。 - -- MMCV 和 MMDetection 的兼容性问题;"ConvWS is already registered in conv layer" - - 请按照上述说明为您的 MMDetection 版本安装正确版本的 MMCV。 - -- "No module named 'mmcv.ops'"; "No module named 'mmcv._ext'" - - 1. 使用 `pip uninstall mmcv` 卸载您环境中的 mmcv - 2. 按照上述说明安装 mmcv-full - -- "invalid device function" 或者 "no kernel image is available for execution" - - 1. 检查 GPU 的 CUDA 计算能力 - 2. 
运行 `python mmdet/utils/collect_env.py` 来检查 PyTorch、torchvision 和 MMCV 是否是针对正确的 GPU 架构构建的 - 您可能需要去设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV - 兼容性问题的可能会出现在使用旧版的 GPUs,如:colab 上的 Tesla K80 (3.7) - 3. 检查运行环境是否和 mmcv/mmdet 编译时的环境相同。例如,您可能使用 CUDA 10.0 编译 mmcv,但在 CUDA 9.0 的环境中运行它 - -- "undefined symbol" 或者 "cannot open xxx.so"。 - - 1. 如果符号和 CUDA/C++ 相关(例如:libcudart.so 或者 GLIBCXX),请检查 CUDA/GCC 运行时的版本是否和编译 mmcv 的一致 - 2. 如果符号和 PyTorch 相关(例如:符号包含 caffe、aten 和 TH),请检查 PyTorch 运行时的版本是否和编译 mmcv 的一致 - 3. 运行 `python mmdet/utils/collect_env.py` 以检查 PyTorch、torchvision 和 MMCV 构建和运行的环境是否相同 - -- "RuntimeError: CUDA error: invalid configuration argument"。 - - 这个错误可能是由于您的 GPU 性能不佳造成的。尝试降低[THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) - 的值并重新编译 mmcv。 - -- "RuntimeError: nms is not compiled with GPU support"。 - - 这个错误是由于您的 CUDA 环境没有正确安装。 - 您可以尝试重新安装您的 CUDA 环境,然后删除 mmcv/build 文件夹并重新编译 mmcv。 diff --git a/docs_zh_CN/understand_mmcv/cnn.md b/docs_zh_CN/understand_mmcv/cnn.md deleted file mode 100644 index 9027cf38dc48cbe342a48c3f4e658d629d2e0974..0000000000000000000000000000000000000000 --- a/docs_zh_CN/understand_mmcv/cnn.md +++ /dev/null @@ -1,525 +0,0 @@ -## 卷积神经网络 - -我们为卷积神经网络提供了一些构建模块,包括层构建、模块组件和权重初始化。 - -### 网络层的构建 - -在运行实验时,我们可能需要尝试同属一种类型但不同配置的层,但又不希望每次都修改代码。于是我们提供一些层构建方法,可以从字典构建层,字典可以在配置文件中配置,也可以通过命令行参数指定。 - -#### 用法 - -一个简单的例子: - -```python -cfg = dict(type='Conv3d') -layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) -``` - -- `build_conv_layer`: 支持的类型包括 Conv1d、Conv2d、Conv3d、Conv (Conv是Conv2d的别名) -- `build_norm_layer`: 支持的类型包括 BN1d、BN2d、BN3d、BN (alias for BN2d)、SyncBN、GN、LN、IN1d、IN2d、IN3d、IN(IN是IN2d的别名) -- `build_activation_layer`:支持的类型包括 ReLU、LeakyReLU、PReLU、RReLU、ReLU6、ELU、Sigmoid、Tanh、GELU -- `build_upsample_layer`: 支持的类型包括 nearest、bilinear、deconv、pixel_shuffle -- `build_padding_layer`: 支持的类型包括 zero、reflect、replicate - -#### 拓展 - -我们还允许自定义层和算子来扩展构建方法。 - -1. 编写和注册自己的模块: - - ```python - from mmcv.cnn import UPSAMPLE_LAYERS - - @UPSAMPLE_LAYERS.register_module() - class MyUpsample: - - def __init__(self, scale_factor): - pass - - def forward(self, x): - pass - ``` - -2. 
在某处导入 `MyUpsample` (例如 `__init__.py` )然后使用它: - - ```python - cfg = dict(type='MyUpsample', scale_factor=2) - layer = build_upsample_layer(cfg) - ``` - -### 模块组件 - -我们还提供了常用的模块组件,以方便网络构建。 -卷积组件 `ConvModule` 由 convolution、normalization以及activation layers 组成,更多细节请参考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。 - -```python -# conv + bn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) -# conv + gn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) -# conv + relu -conv = ConvModule(3, 8, 2) -# conv -conv = ConvModule(3, 8, 2, act_cfg=None) -# conv + leaky relu -conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) -# bn + conv + relu -conv = ConvModule( - 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) -``` - -### Weight initialization - -> 实现细节可以在 [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py)中找到 - -在训练过程中,适当的初始化策略有利于加快训练速度或者获得更高的性能。 在MMCV中,我们提供了一些常用的方法来初始化模块,比如 `nn.Conv2d` 模块。当然,我们也提供了一些高级API,可用于初始化包含一个或多个模块的模型。 - -#### Initialization functions - -以函数的方式初始化 `nn.Module` ,例如 `nn.Conv2d` 、 `nn.Linear` 等。 - -我们提供以下初始化方法, - -- constant_init - - 使用给定常量值初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import constant_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # constant_init(module, val, bias=0) - >>> constant_init(conv1, 1, 0) - >>> conv1.weight - ``` - -- xavier_init - - 按照 [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) 描述的方法初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # xavier_init(module, gain=1, bias=0, distribution='normal') - >>> xavier_init(conv1, distribution='normal') - ``` - -- normal_init - - 使用正态分布(高斯分布)初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import normal_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # normal_init(module, mean=0, std=1, bias=0) - >>> normal_init(conv1, std=0.01, bias=0) - ``` - -- uniform_init - - 使用均匀分布初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import uniform_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # uniform_init(module, a=0, b=1, bias=0) - >>> uniform_init(conv1, a=0, b=1) - ``` - -- kaiming_init - - 按照 [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification - He, K. et al. 
(2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) 描述的方法来初始化模型参数。 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import kaiming_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') - >>> kaiming_init(conv1) - ``` - -- caffe2_xavier_init - - caffe2中实现的 `xavier initialization`,对应于 PyTorch中的 `kaiming_uniform_` - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import caffe2_xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # caffe2_xavier_init(module, bias=0) - >>> caffe2_xavier_init(conv1) - ``` - -- bias_init_with_prob - - 根据给定的概率初始化 `conv/fc`, 这在 [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf) 提出。 - - ```python - >>> from mmcv.cnn import bias_init_with_prob - >>> # bias_init_with_prob is proposed in Focal Loss - >>> bias = bias_init_with_prob(0.01) - >>> bias - -4.59511985013459 - ``` - -#### Initializers and configs - -在初始化方法的基础上,我们定义了相应的初始化类,并将它们注册到 `INITIALIZERS` 中,这样我们就可以使用 `config` 配置来初始化模型了。 - -我们提供以下初始化类: - -- ConstantInit -- XavierInit -- NormalInit -- UniformInit -- KaimingInit -- Caffe2XavierInit -- PretrainedInit - -接下来详细介绍 `initialize` 的使用方法 - -1. 通过关键字 `layer` 来初始化模型 - - 如果我们只定义了关键字 `layer` ,那么只初始化 `layer` 中包含的层。 - - 注意: 关键字 `layer` 支持的模块是带有 weights 和 bias 属性的 PyTorch 模块,所以不支持 `MultiheadAttention layer` - -- 定义关键字 `layer` 列表并使用相同相同配置初始化模块 - - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1, 2) - - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) - # 使用相同的配置初始化整个模块 - initialize(model, init_cfg) - # model.feat.weight - # Parameter containing: - # tensor([[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]], requires_grad=True) - ``` - -- 定义关键字 `layer` 用于初始化不同配置的层 - - ```python - import torch.nn as nn - from mmcv.cnn.utils import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1,2) - - model = FooNet() - init_cfg = [dict(type='Constant', layer='Conv1d', val=1), - dict(type='Constant', layer='Conv2d', val=2), - dict(type='Constant', layer='Linear', val=3)] - # nn.Conv1d 使用 dict(type='Constant', val=1) 初始化 - # nn.Conv2d 使用 dict(type='Constant', val=2) 初始化 - # nn.Linear 使用 dict(type='Constant', val=3) 初始化 - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ``` - -2. 
定义关键字`override`初始化模型 - -- 当用属性名初始化某个特定部分时, 我们可以使用关键字 `override`, 关键字 `override` 对应的Value会替代init_cfg中相应的值 - - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) - - # 如果我们想将模型的权重初始化为 1,将偏差初始化为 2 - # 但希望 `cls` 中的权重为 3,偏差为 4,则我们可以使用关键字override - - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, - override=dict(type='Constant', name='reg', val=3, bias=4)) - # 使用 dict(type='Constant', val=1, bias=2)来初始化 self.feat and self.cls - # 使用dict(type='Constant', val=3, bias=4)来初始化‘reg’模块。 - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]], - # ..., - # [[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]]]], requires_grad=True) - ``` - -- 如果 init_cfg 中的关键字`layer`为None,则只初始化在关键字override中的子模块,并且省略override中的 type 和其他参数 - - ```python - model = FooNet() - init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) - # self.feat 和 self.cls 使用pyTorch默认的初始化 - # 将使用 dict(type='Constant', val=1, bias=2) 初始化名为 'reg' 的模块 - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]], - # ..., - # [[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]]], requires_grad=True) - ``` - -- 如果我们没有定义关键字`layer`或`override` , 将不会初始化任何东西 - -- 关键字`override`的无效用法 - - ```python - # 没有重写任何子模块 - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(type='Constant', val=3, bias=4)) - - # 没有指定type,即便有其他参数,也是无效的。 - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(name='reg', val=3, bias=4)) - ``` - -3. 用预训练模型初始化 - - ```python - import torch.nn as nn - import torchvision.models as models - from mmcv.cnn import initialize - - # 使用预训练模型来初始化 - model = models.resnet50() - # model.conv1.weight - # Parameter containing: - # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, - # -1.8077e-03, 3.0338e-03], - # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, - # 1.1655e-02, 4.5889e-03], - # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, - # 1.7314e-02, -9.9998e-03], - # ..., - - init_cfg = dict(type='Pretrained', - checkpoint='torchvision://resnet50') - initialize(model, init_cfg) - # model.conv1.weight - # Parameter containing: - # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, - # -4.3034e-02, -7.0755e-02], - # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, - # -2.0912e-02, -3.8517e-02], - # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, - # 6.2641e-02, 5.1977e-02], - # ..., - - # 使用关键字'prefix'用预训练模型的特定部分来初始化子模块权重 - model = models.resnet50() - url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ - 'retinanet_r50_fpn_1x_coco/'\ - 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' - init_cfg = dict(type='Pretrained', - checkpoint=url, prefix='backbone.') - initialize(model, init_cfg) - ``` - -4. 
初始化继承自BaseModule、Sequential、ModuleList的模型 - - `BaseModule` 继承自 `torch.nn.Module`, 它们之间唯一的不同是 `BaseModule` 实现了 `init_weight` - - `Sequential` 继承自 `BaseModule` 和 `torch.nn.Sequential` - - `ModuleList` 继承自 `BaseModule` 和 `torch.nn.ModuleList` - - `````python - import torch.nn as nn - from mmcv.runner import BaseModule, Sequential, ModuleList - - class FooConv1d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv1d = nn.Conv1d(4, 1, 4) - - def forward(self, x): - return self.conv1d(x) - - class FooConv2d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv2d = nn.Conv2d(3, 1, 3) - - def forward(self, x): - return self.conv2d(x) - - # BaseModule - init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - model = FooConv1d(init_cfg) - model.init_weights() - # model.conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - - # Sequential - init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - seq_model = Sequential(model1, model2) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) - seq_model = Sequential(model1, model2, init_cfg=init_cfg) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # ModuleList - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - modellist = ModuleList([model1, model2]) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
- modellist = ModuleList([model1, model2], init_cfg=init_cfg) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ````` - -### Model Zoo - -除了`torchvision`的预训练模型,我们还提供以下 CNN 的预训练模型: - -- VGG Caffe -- ResNet Caffe -- ResNeXt -- ResNet with Group Normalization -- ResNet with Group Normalization and Weight Standardization -- HRNetV2 -- Res2Net -- RegNet - -#### Model URLs in JSON - -MMCV中的Model Zoo Link 由 JSON 文件管理。 json 文件由模型名称及其url或path的键值对组成,一个json文件可能类似于: - -```json -{ - "model_a": "https://example.com/models/model_a_9e5bac.pth", - "model_b": "pretrain/model_b_ab3ef2c.pth" -} -``` - -可以在[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json)找到托管在 OpenMMLab AWS 上的预训练模型的默认链接。 - -你可以通过将 `open-mmlab.json` 放在 `MMCV_HOME`下来覆盖默认链接,如果在环境中找不到`MMCV_HOME`,则默认使用 `~/.cache/mmcv`。当然你也可以使用命令 `export MMCV_HOME=/your/path`来设置自己的路径。 - -外部的json文件将被合并为默认文件,如果相同的键出现在外部`json`和默认`json`中,则将使用外部`json`。 - -#### Load Checkpoint - -`mmcv.load_checkpoint()`的参数`filename`支持以下类型: - -- filepath: `checkpoint`路径 -- `http://xxx` and `https://xxx`: 下载checkpoint的链接,文件名中必需包含`SHA256`后缀 -- `torchvision://xxx`: `torchvision.models`中的模型链接,更多细节参考 [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) -- `open-mmlab://xxx`: 默认和其他 json 文件中提供的模型链接或文件路径 diff --git a/docs_zh_CN/understand_mmcv/ops.md b/docs_zh_CN/understand_mmcv/ops.md deleted file mode 100644 index a45bb14862ad0ec05d5fa4d66954ac1465bb668c..0000000000000000000000000000000000000000 --- a/docs_zh_CN/understand_mmcv/ops.md +++ /dev/null @@ -1,36 +0,0 @@ -## CUDA 算子 - -MMCV 提供了检测、分割等任务中常用的 CUDA 算子 - -- AssignScoreWithK -- BallQuery -- BBoxOverlaps -- CARAFE -- CrissCrossAttention -- ContextBlock -- CornerPool -- Deformable Convolution v1/v2 -- Deformable RoIPool -- DynamicScatter -- GatherPoints -- FurthestPointSample -- FurthestPointSampleWithDist -- GeneralizedAttention -- KNN -- MaskedConv -- NMS -- PSAMask -- RoIPointPool3d -- RoIPool -- RoIAlign -- RoIAwarePool3d -- SimpleRoIAlign -- SigmoidFocalLoss -- SoftmaxFocalLoss -- SoftNMS -- Synchronized BatchNorm -- Voxelization -- ThreeInterpolate -- ThreeNN -- Weight standardization -- Correlation diff --git a/examples/train.py b/examples/train.py index 2dbdfee40f049f55e07d7be1427fdd2da784a9f4..b08d36bf621747354d0df30bd6d787fd2c12faf1 100644 --- a/examples/train.py +++ b/examples/train.py @@ -14,7 +14,7 @@ from mmcv.utils import get_logger class Model(nn.Module): def __init__(self): - super(Model, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) diff --git a/mmcv/__init__.py b/mmcv/__init__.py index 210a2989138380559f23045b568d0fbbeb918c03..14c556acdf5832a1da569da6819a428f17adc328 100644 --- a/mmcv/__init__.py +++ b/mmcv/__init__.py @@ -13,3 +13,4 @@ from .visualization import * # - runner # - parallel # - op +# - device diff --git a/mmcv/arraymisc/quantization.py b/mmcv/arraymisc/quantization.py index 8e47a3545780cf071a1ef8195efb0b7b662c8186..6182710d51787061304cfc7304ec97d565822536 100644 --- a/mmcv/arraymisc/quantization.py +++ b/mmcv/arraymisc/quantization.py @@ -1,14 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Union + import numpy as np -def quantize(arr, min_val, max_val, levels, dtype=np.int64): +def quantize(arr: np.ndarray, + min_val: Union[int, float], + max_val: Union[int, float], + levels: int, + dtype=np.int64) -> tuple: """Quantize an array of (-inf, inf) to [0, levels-1]. Args: arr (ndarray): Input array. - min_val (scalar): Minimum value to be clipped. - max_val (scalar): Maximum value to be clipped. + min_val (int or float): Minimum value to be clipped. + max_val (int or float): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the quantized array. @@ -29,13 +35,17 @@ def quantize(arr, min_val, max_val, levels, dtype=np.int64): return quantized_arr -def dequantize(arr, min_val, max_val, levels, dtype=np.float64): +def dequantize(arr: np.ndarray, + min_val: Union[int, float], + max_val: Union[int, float], + levels: int, + dtype=np.float64) -> tuple: """Dequantize an array. Args: arr (ndarray): Input array. - min_val (scalar): Minimum value to be clipped. - max_val (scalar): Maximum value to be clipped. + min_val (int or float): Minimum value to be clipped. + max_val (int or float): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the dequantized array. diff --git a/mmcv/cnn/alexnet.py b/mmcv/cnn/alexnet.py index 89e36b8c7851f895d9ae7f07149f0e707456aab0..4d45d96d86bdcb52a51f095c4571b21c8421cbfa 100644 --- a/mmcv/cnn/alexnet.py +++ b/mmcv/cnn/alexnet.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import Optional +import torch import torch.nn as nn @@ -11,8 +13,8 @@ class AlexNet(nn.Module): num_classes (int): number of classes for classification. """ - def __init__(self, num_classes=-1): - super(AlexNet, self).__init__() + def __init__(self, num_classes: int = -1): + super().__init__() self.num_classes = num_classes self.features = nn.Sequential( nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), @@ -40,7 +42,7 @@ class AlexNet(nn.Module): nn.Linear(4096, num_classes), ) - def init_weights(self, pretrained=None): + def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint @@ -51,7 +53,7 @@ class AlexNet(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.features(x) if self.num_classes > 0: diff --git a/mmcv/cnn/bricks/activation.py b/mmcv/cnn/bricks/activation.py index 79f1988386cbf09a4a13e2c5a72222e22bcc6f7f..23e62722776d18b764cffe4a76e646e3103f8fb7 100644 --- a/mmcv/cnn/bricks/activation.py +++ b/mmcv/cnn/bricks/activation.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + import torch import torch.nn as nn import torch.nn.functional as F @@ -28,12 +30,12 @@ class Clamp(nn.Module): Default to 1. """ - def __init__(self, min=-1., max=1.): - super(Clamp, self).__init__() + def __init__(self, min: float = -1., max: float = 1.): + super().__init__() self.min = min self.max = max - def forward(self, x): + def forward(self, x) -> torch.Tensor: """Forward function. 
Args: @@ -67,7 +69,7 @@ class GELU(nn.Module): >>> output = m(input) """ - def forward(self, input): + def forward(self, input: torch.Tensor) -> torch.Tensor: return F.gelu(input) @@ -78,11 +80,12 @@ else: ACTIVATION_LAYERS.register_module(module=nn.GELU) -def build_activation_layer(cfg): +def build_activation_layer(cfg: Dict) -> nn.Module: """Build activation layer. Args: cfg (dict): The activation layer config, which should contain: + - type (str): Layer type. - layer args: Args needed to instantiate an activation layer. diff --git a/mmcv/cnn/bricks/context_block.py b/mmcv/cnn/bricks/context_block.py index d60fdb904c749ce3b251510dff3cc63cea70d42e..15669cab35dcdc98a95df006788f78f84b88dc44 100644 --- a/mmcv/cnn/bricks/context_block.py +++ b/mmcv/cnn/bricks/context_block.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + import torch from torch import nn @@ -6,7 +8,7 @@ from ..utils import constant_init, kaiming_init from .registry import PLUGIN_LAYERS -def last_zero_init(m): +def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None: if isinstance(m, nn.Sequential): constant_init(m[-1], val=0) else: @@ -34,11 +36,11 @@ class ContextBlock(nn.Module): _abbr_ = 'context_block' def __init__(self, - in_channels, - ratio, - pooling_type='att', - fusion_types=('channel_add', )): - super(ContextBlock, self).__init__() + in_channels: int, + ratio: float, + pooling_type: str = 'att', + fusion_types: tuple = ('channel_add', )): + super().__init__() assert pooling_type in ['avg', 'att'] assert isinstance(fusion_types, (list, tuple)) valid_fusion_types = ['channel_add', 'channel_mul'] @@ -82,7 +84,7 @@ class ContextBlock(nn.Module): if self.channel_mul_conv is not None: last_zero_init(self.channel_mul_conv) - def spatial_pool(self, x): + def spatial_pool(self, x: torch.Tensor) -> torch.Tensor: batch, channel, height, width = x.size() if self.pooling_type == 'att': input_x = x @@ -108,7 +110,7 @@ class ContextBlock(nn.Module): return context - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # [N, C, 1, 1] context = self.spatial_pool(x) diff --git a/mmcv/cnn/bricks/conv.py b/mmcv/cnn/bricks/conv.py index cf54491997a48ac3e7fadc4183ab7bf3e831024c..147517ef4ecdee16d26b535fa49c26a2fcbdd48e 100644 --- a/mmcv/cnn/bricks/conv.py +++ b/mmcv/cnn/bricks/conv.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional + from torch import nn from .registry import CONV_LAYERS @@ -9,7 +11,7 @@ CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d) CONV_LAYERS.register_module('Conv', module=nn.Conv2d) -def build_conv_layer(cfg, *args, **kwargs): +def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module: """Build convolution layer. Args: @@ -35,7 +37,7 @@ def build_conv_layer(cfg, *args, **kwargs): layer_type = cfg_.pop('type') if layer_type not in CONV_LAYERS: - raise KeyError(f'Unrecognized norm type {layer_type}') + raise KeyError(f'Unrecognized layer type {layer_type}') else: conv_layer = CONV_LAYERS.get(layer_type) diff --git a/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/mmcv/cnn/bricks/conv2d_adaptive_padding.py index b45e758ac6cf8dfb0382d072fe09125bc7e9b888..6a7a1d2844db097c21e5ecc55a579e0b9b95c816 100644 --- a/mmcv/cnn/bricks/conv2d_adaptive_padding.py +++ b/mmcv/cnn/bricks/conv2d_adaptive_padding.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import math +from typing import Tuple, Union +import torch from torch import nn from torch.nn import functional as F @@ -31,18 +33,18 @@ class Conv2dAdaptivePadding(nn.Conv2d): """ def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: img_h, img_w = x.size()[-2:] kernel_h, kernel_w = self.weight.size()[-2:] stride_h, stride_w = self.stride diff --git a/mmcv/cnn/bricks/conv_module.py b/mmcv/cnn/bricks/conv_module.py index 4f19f1d0cf4448179272ac53536e7ccf5fd860a3..b5d4a8c2760ea81656d3eefdad86e8dd43488447 100644 --- a/mmcv/cnn/bricks/conv_module.py +++ b/mmcv/cnn/bricks/conv_module.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import Dict, Optional, Tuple, Union +import torch import torch.nn as nn from mmcv.utils import _BatchNorm, _InstanceNorm @@ -68,22 +70,22 @@ class ConvModule(nn.Module): _abbr_ = 'conv_block' def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias='auto', - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - inplace=True, - with_spectral_norm=False, - padding_mode='zeros', - order=('conv', 'norm', 'act')): - super(ConvModule, self).__init__() + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: Union[bool, str] = 'auto', + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + act_cfg: Optional[Dict] = dict(type='ReLU'), + inplace: bool = True, + with_spectral_norm: bool = False, + padding_mode: str = 'zeros', + order: tuple = ('conv', 'norm', 'act')): + super().__init__() assert conv_cfg is None or isinstance(conv_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict) assert act_cfg is None or isinstance(act_cfg, dict) @@ -96,7 +98,7 @@ class ConvModule(nn.Module): self.with_explicit_padding = padding_mode not in official_padding_mode self.order = order assert isinstance(self.order, tuple) and len(self.order) == 3 - assert set(order) == set(['conv', 'norm', 'act']) + assert set(order) == {'conv', 'norm', 'act'} self.with_norm = norm_cfg is not None self.with_activation = act_cfg is not None @@ -143,21 +145,22 @@ class ConvModule(nn.Module): norm_channels = out_channels else: norm_channels = in_channels - self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + self.norm_name, norm = build_norm_layer( + norm_cfg, norm_channels) # type: ignore self.add_module(self.norm_name, norm) if self.with_bias: if isinstance(norm, (_BatchNorm, _InstanceNorm)): warnings.warn( 'Unnecessary conv bias before batch/instance norm') else: - self.norm_name = None + self.norm_name = None # type: ignore # build activation layer if self.with_activation: - act_cfg_ = act_cfg.copy() + act_cfg_ = act_cfg.copy() # type: ignore # nn.Tanh has no 'inplace' argument if act_cfg_['type'] not in [ - 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' + 'Tanh', 
'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU' ]: act_cfg_.setdefault('inplace', inplace) self.activate = build_activation_layer(act_cfg_) @@ -193,7 +196,10 @@ class ConvModule(nn.Module): if self.with_norm: constant_init(self.norm, 1, bias=0) - def forward(self, x, activate=True, norm=True): + def forward(self, + x: torch.Tensor, + activate: bool = True, + norm: bool = True) -> torch.Tensor: for layer in self.order: if layer == 'conv': if self.with_explicit_padding: diff --git a/mmcv/cnn/bricks/conv_ws.py b/mmcv/cnn/bricks/conv_ws.py index a3941e27874993418b3b5708d5a7485f175ff9c8..6569f920fea942a9345ff509c7dbdb6ace1f3741 100644 --- a/mmcv/cnn/bricks/conv_ws.py +++ b/mmcv/cnn/bricks/conv_ws.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + import torch import torch.nn as nn import torch.nn.functional as F @@ -6,14 +9,14 @@ import torch.nn.functional as F from .registry import CONV_LAYERS -def conv_ws_2d(input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - eps=1e-5): +def conv_ws_2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + eps: float = 1e-5) -> torch.Tensor: c_in = weight.size(0) weight_flat = weight.view(c_in, -1) mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) @@ -26,16 +29,16 @@ def conv_ws_2d(input, class ConvWS2d(nn.Conv2d): def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - eps=1e-5): - super(ConvWS2d, self).__init__( + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True, + eps: float = 1e-5): + super().__init__( in_channels, out_channels, kernel_size, @@ -46,7 +49,7 @@ class ConvWS2d(nn.Conv2d): bias=bias) self.eps = eps - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.eps) @@ -76,14 +79,14 @@ class ConvAWS2d(nn.Conv2d): """ def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): super().__init__( in_channels, out_channels, @@ -98,7 +101,7 @@ class ConvAWS2d(nn.Conv2d): self.register_buffer('weight_beta', torch.zeros(self.out_channels, 1, 1, 1)) - def _get_weight(self, weight): + def _get_weight(self, weight: torch.Tensor) -> torch.Tensor: weight_flat = weight.view(weight.size(0), -1) mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) @@ -106,13 +109,16 @@ class ConvAWS2d(nn.Conv2d): weight = self.weight_gamma * weight + self.weight_beta return weight - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: weight = self._get_weight(self.weight) return F.conv2d(x, weight, self.bias, self.stride, self.padding, 
self.dilation, self.groups) - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): + def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str, + local_metadata: Dict, strict: bool, + missing_keys: List[str], + unexpected_keys: List[str], + error_msgs: List[str]) -> None: """Override default load function. AWS overrides the function _load_from_state_dict to recover @@ -124,7 +130,7 @@ class ConvAWS2d(nn.Conv2d): """ self.weight_gamma.data.fill_(-1) - local_missing_keys = [] + local_missing_keys: List = [] super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, local_missing_keys, unexpected_keys, error_msgs) diff --git a/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/mmcv/cnn/bricks/depthwise_separable_conv_module.py index 722d5d8d71f75486e2db3008907c4eadfca41d63..cf1fe4cad3812007573211fa2bede28b23822122 100644 --- a/mmcv/cnn/bricks/depthwise_separable_conv_module.py +++ b/mmcv/cnn/bricks/depthwise_separable_conv_module.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Tuple, Union + +import torch import torch.nn as nn from .conv_module import ConvModule @@ -46,27 +49,27 @@ class DepthwiseSeparableConvModule(nn.Module): """ def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - dw_norm_cfg='default', - dw_act_cfg='default', - pw_norm_cfg='default', - pw_act_cfg='default', + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + norm_cfg: Optional[Dict] = None, + act_cfg: Dict = dict(type='ReLU'), + dw_norm_cfg: Union[Dict, str] = 'default', + dw_act_cfg: Union[Dict, str] = 'default', + pw_norm_cfg: Union[Dict, str] = 'default', + pw_act_cfg: Union[Dict, str] = 'default', **kwargs): - super(DepthwiseSeparableConvModule, self).__init__() + super().__init__() assert 'groups' not in kwargs, 'groups should not be specified' # if norm/activation config of depthwise/pointwise ConvModule is not # specified, use default config. 
- dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg + dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg - pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg + pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg # depthwise convolution @@ -78,19 +81,19 @@ class DepthwiseSeparableConvModule(nn.Module): padding=padding, dilation=dilation, groups=in_channels, - norm_cfg=dw_norm_cfg, - act_cfg=dw_act_cfg, + norm_cfg=dw_norm_cfg, # type: ignore + act_cfg=dw_act_cfg, # type: ignore **kwargs) self.pointwise_conv = ConvModule( in_channels, out_channels, 1, - norm_cfg=pw_norm_cfg, - act_cfg=pw_act_cfg, + norm_cfg=pw_norm_cfg, # type: ignore + act_cfg=pw_act_cfg, # type: ignore **kwargs) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.depthwise_conv(x) x = self.pointwise_conv(x) return x diff --git a/mmcv/cnn/bricks/drop.py b/mmcv/cnn/bricks/drop.py index b0a026654ac2e3b994eb7a5248ca9faa277f8985..ea05221d854592a5d885efbef002cb673c65f778 100644 --- a/mmcv/cnn/bricks/drop.py +++ b/mmcv/cnn/bricks/drop.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, Optional + import torch import torch.nn as nn @@ -6,7 +8,9 @@ from mmcv import build_from_cfg from .registry import DROPOUT_LAYERS -def drop_path(x, drop_prob=0., training=False): +def drop_path(x: torch.Tensor, + drop_prob: float = 0., + training: bool = False) -> torch.Tensor: """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -36,11 +40,11 @@ class DropPath(nn.Module): drop_prob (float): Probability of the path to be zeroed. Default: 0.1 """ - def __init__(self, drop_prob=0.1): - super(DropPath, self).__init__() + def __init__(self, drop_prob: float = 0.1): + super().__init__() self.drop_prob = drop_prob - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return drop_path(x, self.drop_prob, self.training) @@ -56,10 +60,10 @@ class Dropout(nn.Dropout): inplace (bool): Do the operation inplace or not. Default: False. 
""" - def __init__(self, drop_prob=0.5, inplace=False): + def __init__(self, drop_prob: float = 0.5, inplace: bool = False): super().__init__(p=drop_prob, inplace=inplace) -def build_dropout(cfg, default_args=None): +def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any: """Builder for drop out layers.""" return build_from_cfg(cfg, DROPOUT_LAYERS, default_args) diff --git a/mmcv/cnn/bricks/generalized_attention.py b/mmcv/cnn/bricks/generalized_attention.py index 988d9adf2f289ef223bd1c680a5ae1d3387f0269..118e39c7ea2d9f24a97f22878dfbe753c4afef0b 100644 --- a/mmcv/cnn/bricks/generalized_attention.py +++ b/mmcv/cnn/bricks/generalized_attention.py @@ -45,16 +45,16 @@ class GeneralizedAttention(nn.Module): _abbr_ = 'gen_attention_block' def __init__(self, - in_channels, - spatial_range=-1, - num_heads=9, - position_embedding_dim=-1, - position_magnitude=1, - kv_stride=2, - q_stride=1, - attention_type='1111'): + in_channels: int, + spatial_range: int = -1, + num_heads: int = 9, + position_embedding_dim: int = -1, + position_magnitude: int = 1, + kv_stride: int = 2, + q_stride: int = 1, + attention_type: str = '1111'): - super(GeneralizedAttention, self).__init__() + super().__init__() # hard range means local range for non-local operation self.position_embedding_dim = ( @@ -131,7 +131,7 @@ class GeneralizedAttention(nn.Module): max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) local_constraint_map = np.ones( - (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int) + (max_len, max_len, max_len_kv, max_len_kv), dtype=int) for iy in range(max_len): for ix in range(max_len): local_constraint_map[ @@ -213,7 +213,7 @@ class GeneralizedAttention(nn.Module): return embedding_x, embedding_y - def forward(self, x_input): + def forward(self, x_input: torch.Tensor) -> torch.Tensor: num_heads = self.num_heads # use empirical_attention @@ -351,7 +351,7 @@ class GeneralizedAttention(nn.Module): repeat(n, 1, 1, 1) position_feat_x_reshape = position_feat_x.\ - view(n, num_heads, w*w_kv, self.qk_embed_dim) + view(n, num_heads, w * w_kv, self.qk_embed_dim) position_feat_y_reshape = position_feat_y.\ view(n, num_heads, h * h_kv, self.qk_embed_dim) diff --git a/mmcv/cnn/bricks/hsigmoid.py b/mmcv/cnn/bricks/hsigmoid.py index 30b1a3d6580cf0360710426fbea1f05acdf07b4b..5eb97e8ab13e76c6916a7ebba15cb50f8b846897 100644 --- a/mmcv/cnn/bricks/hsigmoid.py +++ b/mmcv/cnn/bricks/hsigmoid.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch import torch.nn as nn from .registry import ACTIVATION_LAYERS @@ -8,11 +11,15 @@ from .registry import ACTIVATION_LAYERS class HSigmoid(nn.Module): """Hard Sigmoid Module. Apply the hard sigmoid function: Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) - Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) + Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1) + + Note: + In MMCV v1.4.4, we modified the default value of args to align with + PyTorch official. Args: - bias (float): Bias of the input feature map. Default: 1.0. - divisor (float): Divisor of the input feature map. Default: 2.0. + bias (float): Bias of the input feature map. Default: 3.0. + divisor (float): Divisor of the input feature map. Default: 6.0. min_value (float): Lower bound value. Default: 0.0. max_value (float): Upper bound value. Default: 1.0. @@ -20,15 +27,25 @@ class HSigmoid(nn.Module): Tensor: The output tensor. 
""" - def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): - super(HSigmoid, self).__init__() + def __init__(self, + bias: float = 3.0, + divisor: float = 6.0, + min_value: float = 0.0, + max_value: float = 1.0): + super().__init__() + warnings.warn( + 'In MMCV v1.4.4, we modified the default value of args to align ' + 'with PyTorch official. Previous Implementation: ' + 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). ' + 'Current Implementation: ' + 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).') self.bias = bias self.divisor = divisor assert self.divisor != 0 self.min_value = min_value self.max_value = max_value - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = (x + self.bias) / self.divisor return x.clamp_(self.min_value, self.max_value) diff --git a/mmcv/cnn/bricks/hswish.py b/mmcv/cnn/bricks/hswish.py index 7e0c090ff037c99ee6c5c84c4592e87beae02208..6f6cc276c10a5c49bd9c0e30a1ffad4a1b6018d4 100644 --- a/mmcv/cnn/bricks/hswish.py +++ b/mmcv/cnn/bricks/hswish.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import torch import torch.nn as nn +from mmcv.utils import TORCH_VERSION, digit_version from .registry import ACTIVATION_LAYERS -@ACTIVATION_LAYERS.register_module() class HSwish(nn.Module): """Hard Swish Module. @@ -21,9 +22,18 @@ class HSwish(nn.Module): Tensor: The output tensor. """ - def __init__(self, inplace=False): - super(HSwish, self).__init__() + def __init__(self, inplace: bool = False): + super().__init__() self.act = nn.ReLU6(inplace) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x * self.act(x + 3) / 6 + + +if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.7')): + # Hardswish is not supported when PyTorch version < 1.6. + # And Hardswish in PyTorch 1.6 does not support inplace. + ACTIVATION_LAYERS.register_module(module=HSwish) +else: + ACTIVATION_LAYERS.register_module(module=nn.Hardswish, name='HSwish') diff --git a/mmcv/cnn/bricks/non_local.py b/mmcv/cnn/bricks/non_local.py index 92d00155ef275c1201ea66bba30470a1785cc5d7..159db245e80950d9b94e2744361bca2a09e67c13 100644 --- a/mmcv/cnn/bricks/non_local.py +++ b/mmcv/cnn/bricks/non_local.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta +from typing import Dict, Optional import torch import torch.nn as nn @@ -33,14 +34,14 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): """ def __init__(self, - in_channels, - reduction=2, - use_scale=True, - conv_cfg=None, - norm_cfg=None, - mode='embedded_gaussian', + in_channels: int, + reduction: int = 2, + use_scale: bool = True, + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + mode: str = 'embedded_gaussian', **kwargs): - super(_NonLocalNd, self).__init__() + super().__init__() self.in_channels = in_channels self.reduction = reduction self.use_scale = use_scale @@ -61,7 +62,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): self.inter_channels, kernel_size=1, conv_cfg=conv_cfg, - act_cfg=None) + act_cfg=None) # type: ignore self.conv_out = ConvModule( self.inter_channels, self.in_channels, @@ -96,7 +97,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): self.init_weights(**kwargs) - def init_weights(self, std=0.01, zeros_init=True): + def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None: if self.mode != 'gaussian': for m in [self.g, self.theta, self.phi]: normal_init(m.conv, std=std) @@ -113,7 +114,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): else: normal_init(self.conv_out.norm, std=std) - def gaussian(self, theta_x, phi_x): + def gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -121,7 +123,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight - def embedded_gaussian(self, theta_x, phi_x): + def embedded_gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -132,7 +135,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight - def dot_product(self, theta_x, phi_x): + def dot_product(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -140,7 +144,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight /= pairwise_weight.shape[-1] return pairwise_weight - def concatenation(self, theta_x, phi_x): + def concatenation(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -157,7 +162,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): return pairwise_weight - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # Assume `reduction = 1`, then `inter_channels = C` # or `inter_channels = C` when `mode="gaussian"` @@ -224,12 +229,11 @@ class NonLocal1d(_NonLocalNd): """ def __init__(self, - in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv1d'), + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv1d'), **kwargs): - super(NonLocal1d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample @@ -258,12 +262,11 @@ class NonLocal2d(_NonLocalNd): _abbr_ = 'nonlocal_block' def __init__(self, - 
in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv2d'), + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv2d'), **kwargs): - super(NonLocal2d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample @@ -289,12 +292,11 @@ class NonLocal3d(_NonLocalNd): """ def __init__(self, - in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv3d'), + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv3d'), **kwargs): - super(NonLocal3d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample if sub_sample: diff --git a/mmcv/cnn/bricks/norm.py b/mmcv/cnn/bricks/norm.py index cfb326bdb8ced3ec17ab5c3203cb6d6784ff2e78..b6281a7c697483fbdaaba5a37d88a00f3c259d31 100644 --- a/mmcv/cnn/bricks/norm.py +++ b/mmcv/cnn/bricks/norm.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import inspect +from typing import Dict, Tuple, Union import torch.nn as nn @@ -69,7 +70,9 @@ def infer_abbr(class_type): return 'norm_layer' -def build_norm_layer(cfg, num_features, postfix=''): +def build_norm_layer(cfg: Dict, + num_features: int, + postfix: Union[int, str] = '') -> Tuple[str, nn.Module]: """Build normalization layer. Args: @@ -83,9 +86,9 @@ def build_norm_layer(cfg, num_features, postfix=''): to create named layer. Returns: - (str, nn.Module): The first element is the layer name consisting of - abbreviation and postfix, e.g., bn1, gn. The second element is the - created norm layer. + tuple[str, nn.Module]: The first element is the layer name consisting + of abbreviation and postfix, e.g., bn1, gn. The second element is the + created norm layer. """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') @@ -119,7 +122,8 @@ def build_norm_layer(cfg, num_features, postfix=''): return name, layer -def is_norm(layer, exclude=None): +def is_norm(layer: nn.Module, + exclude: Union[type, tuple, None] = None) -> bool: """Check if a layer is a normalization layer. Args: diff --git a/mmcv/cnn/bricks/padding.py b/mmcv/cnn/bricks/padding.py index e4ac6b28a1789bd551c613a7d3e7b622433ac7ec..8412b0c6576fd220eca52382943ad5889f0dfd1f 100644 --- a/mmcv/cnn/bricks/padding.py +++ b/mmcv/cnn/bricks/padding.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + import torch.nn as nn from .registry import PADDING_LAYERS @@ -8,11 +10,11 @@ PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) -def build_padding_layer(cfg, *args, **kwargs): +def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module: """Build padding layer. Args: - cfg (None or dict): The padding layer config, which should contain: + cfg (dict): The padding layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate a padding layer. diff --git a/mmcv/cnn/bricks/plugin.py b/mmcv/cnn/bricks/plugin.py index 07c010d4053174dd41107aa654ea67e82b46a25c..095ef9234501d0bca54373d4422244b80f818341 100644 --- a/mmcv/cnn/bricks/plugin.py +++ b/mmcv/cnn/bricks/plugin.py @@ -1,15 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import inspect import platform +from typing import Dict, Tuple, Union + +import torch.nn as nn from .registry import PLUGIN_LAYERS if platform.system() == 'Windows': - import regex as re + import regex as re # type: ignore else: - import re + import re # type: ignore -def infer_abbr(class_type): +def infer_abbr(class_type: type) -> str: """Infer abbreviation from the class name. This method will infer the abbreviation to map class types to @@ -47,25 +51,27 @@ def infer_abbr(class_type): raise TypeError( f'class_type must be a type, but got {type(class_type)}') if hasattr(class_type, '_abbr_'): - return class_type._abbr_ + return class_type._abbr_ # type: ignore else: return camel2snack(class_type.__name__) -def build_plugin_layer(cfg, postfix='', **kwargs): +def build_plugin_layer(cfg: Dict, + postfix: Union[int, str] = '', + **kwargs) -> Tuple[str, nn.Module]: """Build plugin layer. Args: - cfg (None or dict): cfg should contain: - type (str): identify plugin layer type. - layer args: args needed to instantiate a plugin layer. + cfg (dict): cfg should contain: + + - type (str): identify plugin layer type. + - layer args: args needed to instantiate a plugin layer. postfix (int, str): appended into norm abbreviation to create named layer. Default: ''. Returns: - tuple[str, nn.Module]: - name (str): abbreviation + postfix - layer (nn.Module): created plugin layer + tuple[str, nn.Module]: The first one is the concatenation of + abbreviation and postfix. The second is the created plugin layer. """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') diff --git a/mmcv/cnn/bricks/scale.py b/mmcv/cnn/bricks/scale.py index c905fffcc8bf998d18d94f927591963c428025e2..dbd07c6a445e116bd6f32c96d8b52079ccf9b28a 100644 --- a/mmcv/cnn/bricks/scale.py +++ b/mmcv/cnn/bricks/scale.py @@ -13,9 +13,9 @@ class Scale(nn.Module): scale (float): Initial value of scale factor. Default: 1.0 """ - def __init__(self, scale=1.0): - super(Scale, self).__init__() + def __init__(self, scale: float = 1.0): + super().__init__() self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x * self.scale diff --git a/mmcv/cnn/bricks/swish.py b/mmcv/cnn/bricks/swish.py index e2ca8ed7b749413f011ae54aac0cab27e6f0b51f..b297adff068661859265a5057c1b2204ac8eefa7 100644 --- a/mmcv/cnn/bricks/swish.py +++ b/mmcv/cnn/bricks/swish.py @@ -19,7 +19,7 @@ class Swish(nn.Module): """ def __init__(self): - super(Swish, self).__init__() + super().__init__() - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x * torch.sigmoid(x) diff --git a/mmcv/cnn/bricks/transformer.py b/mmcv/cnn/bricks/transformer.py index ed32688af40c0744289d07cd991b17a0dcb1c29f..f7ba4d9f836609cec8526607db98c4b03ec4fee3 100644 --- a/mmcv/cnn/bricks/transformer.py +++ b/mmcv/cnn/bricks/transformer.py @@ -1,21 +1,26 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy +import math import warnings +from typing import Sequence import torch import torch.nn as nn +import torch.nn.functional as F -from mmcv import ConfigDict, deprecated_api_warning -from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, + build_norm_layer) from mmcv.runner.base_module import BaseModule, ModuleList, Sequential -from mmcv.utils import build_from_cfg +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) from .drop import build_dropout from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: - from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + from mmcv.ops.multi_scale_deform_attn import \ + MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' @@ -55,6 +60,349 @@ def build_transformer_layer_sequence(cfg, default_args=None): return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". + + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). 
+ + Returns: + Tensor: The tensor with adaptive padding + """ + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv2d". + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: 16. + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv2d', + kernel_size=16, + stride=16, + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. 
+ + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map ((used in Swin Transformer)). + Our implementation uses `nn.Unfold` to + merge patches, which is about 25% faster than the original + implementation. However, we need to modify pretrained + models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=2, + stride=None, + padding='corner', + dilation=1, + bias=False, + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of unfold + padding = 0 + else: + self.adaptive_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold( + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, input_size): + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). 
+ """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + + if self.adaptive_padding: + x = self.adaptive_padding(x) + H, W = x.shape[-2:] + + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + x = self.sampler(x) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size + + @ATTENTION.register_module() class MultiheadAttention(BaseModule): """A wrapper for ``torch.nn.MultiheadAttention``. @@ -87,12 +435,13 @@ class MultiheadAttention(BaseModule): init_cfg=None, batch_first=False, **kwargs): - super(MultiheadAttention, self).__init__(init_cfg) + super().__init__(init_cfg) if 'dropout' in kwargs: - warnings.warn('The arguments `dropout` in MultiheadAttention ' - 'has been deprecated, now you can separately ' - 'set `attn_drop`(float), proj_drop(float), ' - 'and `dropout_layer`(dict) ') + warnings.warn( + 'The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ', DeprecationWarning) attn_drop = kwargs['dropout'] dropout_layer['drop_prob'] = kwargs.pop('dropout') @@ -154,9 +503,9 @@ class MultiheadAttention(BaseModule): Returns: Tensor: forwarded results with shape - [num_queries, bs, embed_dims] - if self.batch_first is False, else - [bs, num_queries embed_dims]. + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. """ if key is None: @@ -241,7 +590,7 @@ class FFN(BaseModule): add_identity=True, init_cfg=None, **kwargs): - super(FFN, self).__init__(init_cfg) + super().__init__(init_cfg) assert num_fcs >= 2, 'num_fcs should be no less ' \ f'than 2. got {num_fcs}.' self.embed_dims = embed_dims @@ -342,15 +691,15 @@ class BaseTransformerLayer(BaseModule): f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' - f'to a dict named `ffn_cfgs`. ') + f'to a dict named `ffn_cfgs`. 
', DeprecationWarning) ffn_cfgs[new_name] = kwargs[ori_name] - super(BaseTransformerLayer, self).__init__(init_cfg) + super().__init__(init_cfg) self.batch_first = batch_first - assert set(operation_order) & set( - ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + assert set(operation_order) & { + 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all four operation type ' \ @@ -397,7 +746,7 @@ class BaseTransformerLayer(BaseModule): assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: - ffn_cfgs['embed_dims'] = self.embed_dims + ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( @@ -531,7 +880,7 @@ class TransformerLayerSequence(BaseModule): """ def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): - super(TransformerLayerSequence, self).__init__(init_cfg) + super().__init__(init_cfg) if isinstance(transformerlayers, dict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_layers) diff --git a/mmcv/cnn/bricks/upsample.py b/mmcv/cnn/bricks/upsample.py index a1a353767d0ce8518f0d7289bed10dba0178ed12..d86c5f54a22ed26b09f66bd59659ff7ab1f5b3d9 100644 --- a/mmcv/cnn/bricks/upsample.py +++ b/mmcv/cnn/bricks/upsample.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + +import torch import torch.nn as nn import torch.nn.functional as F @@ -24,9 +27,9 @@ class PixelShufflePack(nn.Module): channels. """ - def __init__(self, in_channels, out_channels, scale_factor, - upsample_kernel): - super(PixelShufflePack, self).__init__() + def __init__(self, in_channels: int, out_channels: int, scale_factor: int, + upsample_kernel: int): + super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.scale_factor = scale_factor @@ -41,13 +44,13 @@ class PixelShufflePack(nn.Module): def init_weights(self): xavier_init(self.upsample_conv, distribution='uniform') - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.upsample_conv(x) x = F.pixel_shuffle(x, self.scale_factor) return x -def build_upsample_layer(cfg, *args, **kwargs): +def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: """Build upsample layer. Args: @@ -55,7 +58,7 @@ def build_upsample_layer(cfg, *args, **kwargs): - type (str): Layer type. - scale_factor (int): Upsample ratio, which is not applicable to - deconv. + deconv. - layer args: Args needed to instantiate a upsample layer. args (argument list): Arguments passed to the ``__init__`` method of the corresponding conv layer. 
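The `AdaptivePadding`, `PatchEmbed`, and `PatchMerging` bricks added to `mmcv/cnn/bricks/transformer.py` above are typically chained when tokenizing an image for a Swin-style backbone. The snippet below is a minimal usage sketch, assuming an MMCV build that already contains these additions; the image size, embedding width, and patch sizes are illustrative choices, not values taken from the patch.

```python
import torch

from mmcv.cnn.bricks.transformer import PatchEmbed, PatchMerging

# Illustrative sizes only (assumed for this sketch, not part of the patch).
img = torch.rand(1, 3, 224, 224)

# 4x4 non-overlapping patches -> token sequence plus its spatial shape.
patch_embed = PatchEmbed(in_channels=3, embed_dims=96, kernel_size=4, stride=4)
tokens, hw_shape = patch_embed(img)   # tokens: (1, 56*56, 96), hw_shape: (56, 56)

# Merge 2x2 neighbouring patches and double the channel width, Swin-style.
patch_merging = PatchMerging(in_channels=96, out_channels=192)
tokens, hw_shape = patch_merging(tokens, hw_shape)   # (1, 28*28, 192), (28, 28)
```

Because both modules default to `padding='corner'`, inputs whose spatial size is not a multiple of the kernel size are zero-padded on the bottom/right by `AdaptivePadding` before the projection or unfold step, so the token grid always fully covers the input.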
diff --git a/mmcv/cnn/bricks/wrappers.py b/mmcv/cnn/bricks/wrappers.py index 8aebf67bf52355a513f21756ee74fe510902d075..a07eff00e49970c7692ee3f2625c7f7aba9d7b22 100644 --- a/mmcv/cnn/bricks/wrappers.py +++ b/mmcv/cnn/bricks/wrappers.py @@ -21,19 +21,19 @@ else: TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) -def obsolete_torch_version(torch_version, version_threshold): +def obsolete_torch_version(torch_version, version_threshold) -> bool: return torch_version == 'parrots' or torch_version <= version_threshold class NewEmptyTensorOp(torch.autograd.Function): @staticmethod - def forward(ctx, x, new_shape): + def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod - def backward(ctx, grad): + def backward(ctx, grad: torch.Tensor) -> tuple: shape = ctx.shape return NewEmptyTensorOp.apply(grad, shape), None @@ -41,7 +41,7 @@ class NewEmptyTensorOp(torch.autograd.Function): @CONV_LAYERS.register_module('Conv', force=True) class Conv2d(nn.Conv2d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, @@ -62,7 +62,7 @@ class Conv2d(nn.Conv2d): @CONV_LAYERS.register_module('Conv3d', force=True) class Conv3d(nn.Conv3d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, @@ -85,7 +85,7 @@ class Conv3d(nn.Conv3d): @UPSAMPLE_LAYERS.register_module('deconv', force=True) class ConvTranspose2d(nn.ConvTranspose2d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, @@ -108,7 +108,7 @@ class ConvTranspose2d(nn.ConvTranspose2d): @UPSAMPLE_LAYERS.register_module('deconv3d', force=True) class ConvTranspose3d(nn.ConvTranspose3d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, @@ -128,7 +128,7 @@ class ConvTranspose3d(nn.ConvTranspose3d): class MaxPool2d(nn.MaxPool2d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) @@ -146,7 +146,7 @@ class MaxPool2d(nn.MaxPool2d): class MaxPool3d(nn.MaxPool3d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) @@ -165,7 +165,7 @@ class MaxPool3d(nn.MaxPool3d): class Linear(torch.nn.Linear): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # empty tensor forward of Linear layer is supported in Pytorch 1.6 if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)): out_shape = [x.shape[0], self.out_features] diff --git a/mmcv/cnn/resnet.py b/mmcv/cnn/resnet.py index 
1cb3ac057ee2d52c46fc94685b5d4e698aad8d5f..fb29e6256280b671acfbf73fd9a01f079749b260 100644 --- a/mmcv/cnn/resnet.py +++ b/mmcv/cnn/resnet.py @@ -1,13 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import Optional, Sequence, Tuple, Union import torch.nn as nn import torch.utils.checkpoint as cp +from torch import Tensor from .utils import constant_init, kaiming_init -def conv3x3(in_planes, out_planes, stride=1, dilation=1): +def conv3x3(in_planes: int, + out_planes: int, + stride: int = 1, + dilation: int = 1): """3x3 convolution with padding.""" return nn.Conv2d( in_planes, @@ -23,14 +28,14 @@ class BasicBlock(nn.Module): expansion = 1 def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False): - super(BasicBlock, self).__init__() + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): + super().__init__() assert style in ['pytorch', 'caffe'] self.conv1 = conv3x3(inplanes, planes, stride, dilation) self.bn1 = nn.BatchNorm2d(planes) @@ -42,7 +47,7 @@ class BasicBlock(nn.Module): self.dilation = dilation assert not with_cp - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: residual = x out = self.conv1(x) @@ -65,19 +70,19 @@ class Bottleneck(nn.Module): expansion = 4 def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False): + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): """Bottleneck block. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. 
""" - super(Bottleneck, self).__init__() + super().__init__() assert style in ['pytorch', 'caffe'] if style == 'pytorch': conv1_stride = 1 @@ -107,7 +112,7 @@ class Bottleneck(nn.Module): self.dilation = dilation self.with_cp = with_cp - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: def _inner_forward(x): residual = x @@ -140,14 +145,14 @@ class Bottleneck(nn.Module): return out -def make_res_layer(block, - inplanes, - planes, - blocks, - stride=1, - dilation=1, - style='pytorch', - with_cp=False): +def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + with_cp: bool = False) -> nn.Module: downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( @@ -208,22 +213,22 @@ class ResNet(nn.Module): } def __init__(self, - depth, - num_stages=4, - strides=(1, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(0, 1, 2, 3), - style='pytorch', - frozen_stages=-1, - bn_eval=True, - bn_frozen=False, - with_cp=False): - super(ResNet, self).__init__() + depth: int, + num_stages: int = 4, + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3), + style: str = 'pytorch', + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + with_cp: bool = False): + super().__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') assert num_stages >= 1 and num_stages <= 4 block, stage_blocks = self.arch_settings[depth] - stage_blocks = stage_blocks[:num_stages] + stage_blocks = stage_blocks[:num_stages] # type: ignore assert len(strides) == len(dilations) == num_stages assert max(out_indices) < num_stages @@ -234,7 +239,7 @@ class ResNet(nn.Module): self.bn_frozen = bn_frozen self.with_cp = with_cp - self.inplanes = 64 + self.inplanes: int = 64 self.conv1 = nn.Conv2d( 3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) @@ -255,14 +260,15 @@ class ResNet(nn.Module): dilation=dilation, style=self.style, with_cp=with_cp) - self.inplanes = planes * block.expansion + self.inplanes = planes * block.expansion # type: ignore layer_name = f'layer{i + 1}' self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) - self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1) + self.feat_dim = block.expansion * 64 * 2**( # type: ignore + len(stage_blocks) - 1) - def init_weights(self, pretrained=None): + def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint @@ -276,7 +282,7 @@ class ResNet(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x): + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: x = self.conv1(x) x = self.bn1(x) x = self.relu(x) @@ -292,8 +298,8 @@ class ResNet(nn.Module): else: return tuple(outs) - def train(self, mode=True): - super(ResNet, self).train(mode) + def train(self, mode: bool = True) -> None: + super().train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): diff --git a/mmcv/cnn/utils/flops_counter.py b/mmcv/cnn/utils/flops_counter.py index dceeb398bfc8a562d406136028381326ef55e0dc..150a55992a9561073626d26df503ba4ef37efa18 100644 --- a/mmcv/cnn/utils/flops_counter.py +++ b/mmcv/cnn/utils/flops_counter.py @@ -24,7 +24,9 @@ # SOFTWARE. 
import sys +import warnings from functools import partial +from typing import Any, Callable, Dict, Optional, TextIO, Tuple import numpy as np import torch @@ -33,13 +35,13 @@ import torch.nn as nn import mmcv -def get_model_complexity_info(model, - input_shape, - print_per_layer_stat=True, - as_strings=True, - input_constructor=None, - flush=False, - ost=sys.stdout): +def get_model_complexity_info(model: nn.Module, + input_shape: tuple, + print_per_layer_stat: bool = True, + as_strings: bool = True, + input_constructor: Optional[Callable] = None, + flush: bool = False, + ost: TextIO = sys.stdout) -> tuple: """Get complexity information of a model. This method can calculate FLOPs and parameter counts of a model with @@ -48,16 +50,16 @@ def get_model_complexity_info(model, Supported layers are listed as below: - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. - - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, - ``nn.ReLU6``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, + ``nn.LeakyReLU``, ``nn.ReLU6``. - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, - ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, - ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, - ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, - ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, - ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, - ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. - Linear: ``nn.Linear``. - Deconvolution: ``nn.ConvTranspose2d``. - Upsample: ``nn.Upsample``. @@ -78,8 +80,8 @@ def get_model_complexity_info(model, Returns: tuple[float | str]: If ``as_strings`` is set to True, it will return - FLOPs and parameter counts in a string format. otherwise, it will - return those in a float number format. + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. """ assert type(input_shape) is tuple assert len(input_shape) >= 1 @@ -115,7 +117,9 @@ def get_model_complexity_info(model, return flops_count, params_count -def flops_to_string(flops, units='GFLOPs', precision=2): +def flops_to_string(flops: float, + units: Optional[str] = 'GFLOPs', + precision: int = 2) -> str: """Convert FLOPs number into a string. Note that Here we take a multiply-add counts as one FLOP. @@ -158,7 +162,9 @@ def flops_to_string(flops, units='GFLOPs', precision=2): return str(flops) + ' FLOPs' -def params_to_string(num_params, units=None, precision=2): +def params_to_string(num_params: float, + units: Optional[str] = None, + precision: int = 2) -> str: """Convert parameter number into a string. Args: @@ -195,13 +201,13 @@ def params_to_string(num_params, units=None, precision=2): return str(num_params) -def print_model_with_flops(model, - total_flops, - total_params, - units='GFLOPs', - precision=3, - ost=sys.stdout, - flush=False): +def print_model_with_flops(model: nn.Module, + total_flops: float, + total_params: float, + units: Optional[str] = 'GFLOPs', + precision: int = 3, + ost: TextIO = sys.stdout, + flush: bool = False) -> None: """Print a model with FLOPs for each layer. 
Args: @@ -276,10 +282,10 @@ def print_model_with_flops(model, return ', '.join([ params_to_string( accumulated_num_params, units='M', precision=precision), - '{:.3%} Params'.format(accumulated_num_params / total_params), + f'{accumulated_num_params / total_params:.3%} Params', flops_to_string( accumulated_flops_cost, units=units, precision=precision), - '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), + f'{accumulated_flops_cost / total_flops:.3%} FLOPs', self.original_extra_repr() ]) @@ -304,7 +310,7 @@ def print_model_with_flops(model, model.apply(del_extra_repr) -def get_model_parameters_number(model): +def get_model_parameters_number(model: nn.Module) -> float: """Calculate parameter number of a model. Args: @@ -317,16 +323,16 @@ def get_model_parameters_number(model): return num_params -def add_flops_counting_methods(net_main_module): +def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module: # adding additional methods to the existing module object, # this is done this way so that each function has access to self object - net_main_module.start_flops_count = start_flops_count.__get__( + net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501 net_main_module) - net_main_module.stop_flops_count = stop_flops_count.__get__( + net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501 net_main_module) - net_main_module.reset_flops_count = reset_flops_count.__get__( + net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501 net_main_module) - net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501 net_main_module) net_main_module.reset_flops_count() @@ -334,7 +340,7 @@ def add_flops_counting_methods(net_main_module): return net_main_module -def compute_average_flops_cost(self): +def compute_average_flops_cost(self) -> Tuple[float, float]: """Compute average FLOPs cost. A method to compute average FLOPs cost, which will be available after @@ -352,7 +358,7 @@ def compute_average_flops_cost(self): return flops_sum / batches_count, params_sum -def start_flops_count(self): +def start_flops_count(self) -> None: """Activate the computation of mean flops consumption per image. A method to activate the computation of mean flops consumption per image. @@ -361,7 +367,7 @@ def start_flops_count(self): """ add_batch_counter_hook_function(self) - def add_flops_counter_hook_function(module): + def add_flops_counter_hook_function(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops_handle__'): return @@ -375,7 +381,7 @@ def start_flops_count(self): self.apply(partial(add_flops_counter_hook_function)) -def stop_flops_count(self): +def stop_flops_count(self) -> None: """Stop computing the mean flops consumption per image. A method to stop computing the mean flops consumption per image, which will @@ -386,7 +392,7 @@ def stop_flops_count(self): self.apply(remove_flops_counter_hook_function) -def reset_flops_count(self): +def reset_flops_count(self) -> None: """Reset statistics computed so far. 
A method to Reset computed statistics, which will be available after @@ -397,11 +403,13 @@ def reset_flops_count(self): # ---- Internal functions -def empty_flops_counter_hook(module, input, output): +def empty_flops_counter_hook(module: nn.Module, input: tuple, + output: Any) -> None: module.__flops__ += 0 -def upsample_flops_counter_hook(module, input, output): +def upsample_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: output_size = output[0] batch_size = output_size.shape[0] output_elements_count = batch_size @@ -410,39 +418,38 @@ def upsample_flops_counter_hook(module, input, output): module.__flops__ += int(output_elements_count) -def relu_flops_counter_hook(module, input, output): +def relu_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: active_elements_count = output.numel() module.__flops__ += int(active_elements_count) -def linear_flops_counter_hook(module, input, output): - input = input[0] +def linear_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: output_last_dim = output.shape[ -1] # pytorch checks dimensions, so here we don't care much - module.__flops__ += int(np.prod(input.shape) * output_last_dim) + module.__flops__ += int(np.prod(input[0].shape) * output_last_dim) -def pool_flops_counter_hook(module, input, output): - input = input[0] - module.__flops__ += int(np.prod(input.shape)) +def pool_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + module.__flops__ += int(np.prod(input[0].shape)) -def norm_flops_counter_hook(module, input, output): - input = input[0] - - batch_flops = np.prod(input.shape) +def norm_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + batch_flops = np.prod(input[0].shape) if (getattr(module, 'affine', False) or getattr(module, 'elementwise_affine', False)): batch_flops *= 2 module.__flops__ += int(batch_flops) -def deconv_flops_counter_hook(conv_module, input, output): +def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: # Can have multiple inputs, getting the first one - input = input[0] - - batch_size = input.shape[0] - input_height, input_width = input.shape[2:] + batch_size = input[0].shape[0] + input_height, input_width = input[0].shape[2:] kernel_height, kernel_width = conv_module.kernel_size in_channels = conv_module.in_channels @@ -458,17 +465,16 @@ def deconv_flops_counter_hook(conv_module, input, output): bias_flops = 0 if conv_module.bias is not None: output_height, output_width = output.shape[2:] - bias_flops = out_channels * batch_size * output_height * output_height + bias_flops = out_channels * batch_size * output_height * output_width overall_flops = overall_conv_flops + bias_flops conv_module.__flops__ += int(overall_flops) -def conv_flops_counter_hook(conv_module, input, output): +def conv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: # Can have multiple inputs, getting the first one - input = input[0] - - batch_size = input.shape[0] + batch_size = input[0].shape[0] output_dims = list(output.shape[2:]) kernel_dims = list(conv_module.kernel_size) @@ -495,25 +501,23 @@ def conv_flops_counter_hook(conv_module, input, output): conv_module.__flops__ += int(overall_flops) -def batch_counter_hook(module, input, output): +def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None: batch_size = 1 if len(input) > 0: # Can have multiple inputs, 
getting the first one - input = input[0] - batch_size = len(input) + batch_size = len(input[0]) else: - pass - print('Warning! No positional inputs found for a module, ' - 'assuming batch size is 1.') + warnings.warn('No positional inputs found for a module, ' + 'assuming batch size is 1.') module.__batch_counter__ += batch_size -def add_batch_counter_variables_or_reset(module): +def add_batch_counter_variables_or_reset(module: nn.Module) -> None: module.__batch_counter__ = 0 -def add_batch_counter_hook_function(module): +def add_batch_counter_hook_function(module: nn.Module) -> None: if hasattr(module, '__batch_counter_handle__'): return @@ -521,36 +525,36 @@ def add_batch_counter_hook_function(module): module.__batch_counter_handle__ = handle -def remove_batch_counter_hook_function(module): +def remove_batch_counter_hook_function(module: nn.Module) -> None: if hasattr(module, '__batch_counter_handle__'): module.__batch_counter_handle__.remove() del module.__batch_counter_handle__ -def add_flops_counter_variable_or_reset(module): +def add_flops_counter_variable_or_reset(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops__') or hasattr(module, '__params__'): - print('Warning: variables __flops__ or __params__ are already ' - 'defined for the module' + type(module).__name__ + - ' ptflops can affect your code!') + warnings.warn('variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') module.__flops__ = 0 module.__params__ = get_model_parameters_number(module) -def is_supported_instance(module): +def is_supported_instance(module: nn.Module) -> bool: if type(module) in get_modules_mapping(): return True return False -def remove_flops_counter_hook_function(module): +def remove_flops_counter_hook_function(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops_handle__'): module.__flops_handle__.remove() del module.__flops_handle__ -def get_modules_mapping(): +def get_modules_mapping() -> Dict: return { # convolutions nn.Conv1d: conv_flops_counter_hook, diff --git a/mmcv/cnn/utils/fuse_conv_bn.py b/mmcv/cnn/utils/fuse_conv_bn.py index cb7076f80bf37f7931185bf0293ffcc1ce19c8ef..6ccaab3bf1eb3ce615bad910d6dc45a467bb1fe4 100644 --- a/mmcv/cnn/utils/fuse_conv_bn.py +++ b/mmcv/cnn/utils/fuse_conv_bn.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn -def _fuse_conv_bn(conv, bn): +def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module: """Fuse conv and bn into one module. Args: @@ -24,7 +24,7 @@ def _fuse_conv_bn(conv, bn): return conv -def fuse_conv_bn(module): +def fuse_conv_bn(module: nn.Module) -> nn.Module: """Recursively fuse conv and bn in a module. During inference, the functionary of batch norm layers is turned off diff --git a/mmcv/cnn/utils/sync_bn.py b/mmcv/cnn/utils/sync_bn.py index 8a79ff4a4f8dc70cf931fa319287682d4189e1a2..c534fc0e17506dde31c20529ce7bef64eef87140 100644 --- a/mmcv/cnn/utils/sync_bn.py +++ b/mmcv/cnn/utils/sync_bn.py @@ -1,9 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch +import torch.nn as nn import mmcv -class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): +class _BatchNormXd(nn.modules.batchnorm._BatchNorm): """A general BatchNorm layer without input dimension check. Reproduced from @kapily's work: @@ -14,11 +16,11 @@ class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): SyncBatchNorm. 
""" - def _check_input_dim(self, input): + def _check_input_dim(self, input: torch.Tensor): return -def revert_sync_batchnorm(module): +def revert_sync_batchnorm(module: nn.Module) -> nn.Module: """Helper function to convert all `SyncBatchNorm` (SyncBN) and `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to `BatchNormXd` layers. diff --git a/mmcv/cnn/utils/weight_init.py b/mmcv/cnn/utils/weight_init.py index e1ac999e2470048ef05b3243b0d8b6959586785f..6e0d293ad4fb315462e34d5899ae6fccc4a7ba86 100644 --- a/mmcv/cnn/utils/weight_init.py +++ b/mmcv/cnn/utils/weight_init.py @@ -2,6 +2,7 @@ import copy import math import warnings +from typing import Dict, List, Optional, Union import numpy as np import torch @@ -13,7 +14,7 @@ from mmcv.utils import Registry, build_from_cfg, get_logger, print_log INITIALIZERS = Registry('initializer') -def update_init_info(module, init_info): +def update_init_info(module: nn.Module, init_info: str) -> None: """Update the `_params_init_info` in the module if the value of parameters are changed. @@ -45,14 +46,17 @@ def update_init_info(module, init_info): module._params_init_info[param]['tmp_mean_value'] = mean_value -def constant_init(module, val, bias=0): +def constant_init(module: nn.Module, val: float, bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: nn.init.constant_(module.weight, val) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) -def xavier_init(module, gain=1, bias=0, distribution='normal'): +def xavier_init(module: nn.Module, + gain: float = 1, + bias: float = 0, + distribution: str = 'normal') -> None: assert distribution in ['uniform', 'normal'] if hasattr(module, 'weight') and module.weight is not None: if distribution == 'uniform': @@ -63,7 +67,10 @@ def xavier_init(module, gain=1, bias=0, distribution='normal'): nn.init.constant_(module.bias, bias) -def normal_init(module, mean=0, std=1, bias=0): +def normal_init(module: nn.Module, + mean: float = 0, + std: float = 1, + bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: nn.init.normal_(module.weight, mean, std) if hasattr(module, 'bias') and module.bias is not None: @@ -82,19 +89,22 @@ def trunc_normal_init(module: nn.Module, nn.init.constant_(module.bias, bias) # type: ignore -def uniform_init(module, a=0, b=1, bias=0): +def uniform_init(module: nn.Module, + a: float = 0, + b: float = 1, + bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: nn.init.uniform_(module.weight, a, b) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) -def kaiming_init(module, - a=0, - mode='fan_out', - nonlinearity='relu', - bias=0, - distribution='normal'): +def kaiming_init(module: nn.Module, + a: float = 0, + mode: str = 'fan_out', + nonlinearity: str = 'relu', + bias: float = 0, + distribution: str = 'normal') -> None: assert distribution in ['uniform', 'normal'] if hasattr(module, 'weight') and module.weight is not None: if distribution == 'uniform': @@ -107,7 +117,7 @@ def kaiming_init(module, nn.init.constant_(module.bias, bias) -def caffe2_xavier_init(module, bias=0): +def caffe2_xavier_init(module: nn.Module, bias: float = 0) -> None: # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch # Acknowledgment to FAIR's internal code kaiming_init( @@ -119,19 +129,23 @@ def caffe2_xavier_init(module, bias=0): distribution='uniform') -def bias_init_with_prob(prior_prob): +def 
bias_init_with_prob(prior_prob: float) -> float: """initialize conv/fc bias value according to a given probability value.""" bias_init = float(-np.log((1 - prior_prob) / prior_prob)) return bias_init -def _get_bases_name(m): +def _get_bases_name(m: nn.Module) -> List[str]: return [b.__name__ for b in m.__class__.__bases__] -class BaseInit(object): +class BaseInit: - def __init__(self, *, bias=0, bias_prob=None, layer=None): + def __init__(self, + *, + bias: float = 0, + bias_prob: Optional[float] = None, + layer: Union[str, List, None] = None): self.wholemodule = False if not isinstance(bias, (int, float)): raise TypeError(f'bias must be a number, but got a {type(bias)}') @@ -154,7 +168,7 @@ class BaseInit(object): self.bias = bias self.layer = [layer] if isinstance(layer, str) else layer - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}, bias={self.bias}' return info @@ -172,11 +186,11 @@ class ConstantInit(BaseInit): Defaults to None. """ - def __init__(self, val, **kwargs): + def __init__(self, val: Union[int, float], **kwargs): super().__init__(**kwargs) self.val = val - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -191,7 +205,7 @@ class ConstantInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' return info @@ -214,12 +228,15 @@ class XavierInit(BaseInit): Defaults to None. """ - def __init__(self, gain=1, distribution='normal', **kwargs): + def __init__(self, + gain: float = 1, + distribution: str = 'normal', + **kwargs): super().__init__(**kwargs) self.gain = gain self.distribution = distribution - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -234,7 +251,7 @@ class XavierInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: gain={self.gain}, ' \ f'distribution={self.distribution}, bias={self.bias}' return info @@ -257,12 +274,12 @@ class NormalInit(BaseInit): """ - def __init__(self, mean=0, std=1, **kwargs): + def __init__(self, mean: float = 0, std: float = 1, **kwargs): super().__init__(**kwargs) self.mean = mean self.std = std - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -277,7 +294,7 @@ class NormalInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: mean={self.mean},' \ f' std={self.std}, bias={self.bias}' return info @@ -355,12 +372,12 @@ class UniformInit(BaseInit): Defaults to None. 
""" - def __init__(self, a=0, b=1, **kwargs): + def __init__(self, a: float = 0., b: float = 1., **kwargs): super().__init__(**kwargs) self.a = a self.b = b - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -375,7 +392,7 @@ class UniformInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: a={self.a},' \ f' b={self.b}, bias={self.bias}' return info @@ -409,10 +426,10 @@ class KaimingInit(BaseInit): """ def __init__(self, - a=0, - mode='fan_out', - nonlinearity='relu', - distribution='normal', + a: float = 0, + mode: str = 'fan_out', + nonlinearity: str = 'relu', + distribution: str = 'normal', **kwargs): super().__init__(**kwargs) self.a = a @@ -420,7 +437,7 @@ class KaimingInit(BaseInit): self.nonlinearity = nonlinearity self.distribution = distribution - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -437,7 +454,7 @@ class KaimingInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ f'nonlinearity={self.nonlinearity}, ' \ f'distribution ={self.distribution}, bias={self.bias}' @@ -456,12 +473,12 @@ class Caffe2XavierInit(KaimingInit): distribution='uniform', **kwargs) - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: super().__call__(module) @INITIALIZERS.register_module(name='Pretrained') -class PretrainedInit(object): +class PretrainedInit: """Initialize module by loading a pretrained model. Args: @@ -475,12 +492,15 @@ class PretrainedInit(object): map_location (str): map tensors into proper locations. """ - def __init__(self, checkpoint, prefix=None, map_location=None): + def __init__(self, + checkpoint: str, + prefix: Optional[str] = None, + map_location: Optional[str] = None): self.checkpoint = checkpoint self.prefix = prefix self.map_location = map_location - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: from mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint, load_state_dict) logger = get_logger('mmcv') @@ -503,12 +523,14 @@ class PretrainedInit(object): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: load from {self.checkpoint}' return info -def _initialize(module, cfg, wholemodule=False): +def _initialize(module: nn.Module, + cfg: Dict, + wholemodule: bool = False) -> None: func = build_from_cfg(cfg, INITIALIZERS) # wholemodule flag is for override mode, there is no layer key in override # and initializer will give init values for the whole module with the name @@ -517,7 +539,8 @@ def _initialize(module, cfg, wholemodule=False): func(module) -def _initialize_override(module, override, cfg): +def _initialize_override(module: nn.Module, override: Union[Dict, List], + cfg: Dict) -> None: if not isinstance(override, (dict, list)): raise TypeError(f'override must be a dict or a list of dict, \ but got {type(override)}') @@ -547,8 +570,8 @@ def _initialize_override(module, override, cfg): f'but init_cfg is {cp_override}.') -def initialize(module, init_cfg): - """Initialize a module. 
+def initialize(module: nn.Module, init_cfg: Union[Dict, List[dict]]) -> None: + r"""Initialize a module. Args: module (``torch.nn.Module``): the module will be initialized. @@ -556,6 +579,7 @@ def initialize(module, init_cfg): define initializer. OpenMMLab has implemented 6 initializers including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, ``Kaiming``, and ``Pretrained``. + Example: >>> module = nn.Linear(2, 3, bias=True) >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) diff --git a/mmcv/cnn/vgg.py b/mmcv/cnn/vgg.py index 8778b649561a45a9652b1a15a26c2d171e58f3e1..a1d9ba211eb4b0056eb4127e19159e9ed5d5251f 100644 --- a/mmcv/cnn/vgg.py +++ b/mmcv/cnn/vgg.py @@ -1,12 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import List, Optional, Sequence, Tuple, Union import torch.nn as nn +from torch import Tensor from .utils import constant_init, kaiming_init, normal_init -def conv3x3(in_planes, out_planes, dilation=1): +def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: """3x3 convolution with padding.""" return nn.Conv2d( in_planes, @@ -16,12 +18,12 @@ def conv3x3(in_planes, out_planes, dilation=1): dilation=dilation) -def make_vgg_layer(inplanes, - planes, - num_blocks, - dilation=1, - with_bn=False, - ceil_mode=False): +def make_vgg_layer(inplanes: int, + planes: int, + num_blocks: int, + dilation: int = 1, + with_bn: bool = False, + ceil_mode: bool = False) -> List[nn.Module]: layers = [] for _ in range(num_blocks): layers.append(conv3x3(inplanes, planes, dilation)) @@ -59,18 +61,18 @@ class VGG(nn.Module): } def __init__(self, - depth, - with_bn=False, - num_classes=-1, - num_stages=5, - dilations=(1, 1, 1, 1, 1), - out_indices=(0, 1, 2, 3, 4), - frozen_stages=-1, - bn_eval=True, - bn_frozen=False, - ceil_mode=False, - with_last_pool=True): - super(VGG, self).__init__() + depth: int, + with_bn: bool = False, + num_classes: int = -1, + num_stages: int = 5, + dilations: Sequence[int] = (1, 1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3, 4), + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + ceil_mode: bool = False, + with_last_pool: bool = True): + super().__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for vgg') assert num_stages >= 1 and num_stages <= 5 @@ -122,7 +124,7 @@ class VGG(nn.Module): nn.Linear(4096, num_classes), ) - def init_weights(self, pretrained=None): + def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint @@ -138,7 +140,7 @@ class VGG(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x): + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]: outs = [] vgg_layers = getattr(self, self.module_name) for i in range(len(self.stage_blocks)): @@ -156,8 +158,8 @@ class VGG(nn.Module): else: return tuple(outs) - def train(self, mode=True): - super(VGG, self).train(mode) + def train(self, mode: bool = True) -> None: + super().train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): diff --git a/mmcv/device/__init__.py b/mmcv/device/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba217b0771bcfada461d7c61a78f41a274e5aa6a --- /dev/null +++ b/mmcv/device/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . 
import ipu, mlu, mps +from .scatter_gather import scatter, scatter_kwargs +from .utils import get_device + +__all__ = ['mlu', 'ipu', 'mps', 'get_device', 'scatter', 'scatter_kwargs'] diff --git a/mmcv/device/_functions.py b/mmcv/device/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..462a7e4ddca14685047b7937e3054108e164cf91 --- /dev/null +++ b/mmcv/device/_functions.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch + +from mmcv.utils import deprecated_api_warning +from .utils import get_device + + +def scatter(input: Union[List, torch.Tensor], devices: List) -> List: + """scatter copies tensor to devices directly.""" + current_device = get_device() + if isinstance(input, list): + outputs = [scatter(_input, devices) for _input in input] + return outputs + elif isinstance(input, torch.Tensor): + output = input.contiguous() + return output.to(current_device) if devices != [-1] else output + else: + raise Exception(f'Unknown type {type(input)}.') + + +class Scatter: + + @staticmethod + @deprecated_api_warning({'target_mlus': 'target_devices'}, + cls_name='Scatter') + def forward(target_devices, input): + outputs = scatter(input, target_devices) + return tuple(outputs) if isinstance(outputs, list) else (outputs, ) diff --git a/mmcv/device/ipu/__init__.py b/mmcv/device/ipu/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..d550865ad20790f0eb79015abc866548c0f2f83b --- /dev/null +++ b/mmcv/device/ipu/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import IS_IPU_AVAILABLE + +if IS_IPU_AVAILABLE: + from .dataloader import IPUDataLoader + from .hook_wrapper import IPUFp16OptimizerHook + from .model_wrapper import ipu_model_wrapper + from .runner import IPUBaseRunner, IPUEpochBasedRunner, IPUIterBasedRunner + from .utils import cfg2options + __all__ = [ + 'cfg2options', 'ipu_model_wrapper', 'IPUFp16OptimizerHook', + 'IPUDataLoader', 'IPUBaseRunner', 'IPUEpochBasedRunner', + 'IPUIterBasedRunner' + ] diff --git a/mmcv/device/ipu/dataloader.py b/mmcv/device/ipu/dataloader.py new file mode 100755 index 0000000000000000000000000000000000000000..1485df2f31facff79238c70d89fdd9030fddcbce --- /dev/null +++ b/mmcv/device/ipu/dataloader.py @@ -0,0 +1,157 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Mapping, Sequence +from functools import partial + +import poptorch +from torch.utils.data.dataloader import default_collate + +from mmcv.parallel import DataContainer + + +def collate(batch, samples_per_gpu=1): + """Put each data field into a tensor/DataContainer with outer dimension + batch size. + + TODO support for + :type:`~mmcv.parallel.DataContainer`. Currently, it will be ignored. + There are 3 cases. + + 1. cpu_only = True, e.g., meta data. + 2. cpu_only = False, stack = True, e.g., images tensors. + 3. cpu_only = False, stack = False, e.g., gt bboxes. + """ + + if not isinstance(batch, Sequence): + raise TypeError( + f'`batch` should be a sequence, but got {type(batch)}.') + + if isinstance(batch[0], DataContainer): + # TODO `DataContainer` will be supported in the future. 
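For orientation, a minimal sketch of how the device-agnostic `scatter` helper introduced above is meant to behave. Import paths and semantics are taken from this diff; the device string returned by `get_device` depends on which backend (CUDA, MLU, MPS or CPU) is actually available, so the printed devices are only illustrative.

```python
import torch

from mmcv.device import get_device
from mmcv.device._functions import scatter

batch = [torch.ones(2, 3), torch.zeros(4)]
moved = scatter(batch, [0])    # tensors copied to the current device
kept = scatter(batch, [-1])    # devices == [-1]: returned contiguous, not moved
print(get_device(), [t.device for t in moved])
```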
+ raise TypeError('DataContainer is not supported in ipu data loader.') + elif isinstance(batch[0], Sequence): + transposed = zip(*batch) + collated_batch = [] + for samples in transposed: + if not isinstance(samples[0], DataContainer): + # At present, we will skip the processing of DataContainer, + # which will reduce the performance of the IPU DataLoader + collated_batch.append(collate(samples, samples_per_gpu)) + return collated_batch + elif isinstance(batch[0], Mapping): + collated_batch = {} + for key in batch[0]: + if not isinstance(batch[0][key], DataContainer): + # At present, we will skip the processing of DataContainer, + # which will reduce the performance of the IPU DataLoader + collated_batch[key] = collate([d[key] for d in batch]) + return collated_batch + else: + return default_collate(batch) + + +class IPUDataLoader(poptorch.DataLoader): + """Thin wrapper of `torch.utils.data.DataLoader`. + + Compared with the PyTorch DataLoader, this DataLoader changes the way the + batch size is calculated and adds the AsynchronousDataAccessor to + load and release data faster in CPU mode. + + If this data loader is used in a distributed execution environment, it will + ensure that each process uses a different subset of the dataset, provided + that you first call ``options.randomSeed(N)`` with an integer N which is the + same across all hosts. + + Args: + dataset (torch.utils.data.Dataset): The dataset to get the data from. + options (poptorch.Options): Options that will be used to compile + and run the model. + batch_size (int, optional): This is the batch size in the conventional + sense of being the size that runs through an operation in the model + at any given time. + shuffle (bool, optional): set to ``True`` to have the data reshuffled + at every epoch (default: ``False``). + num_workers (int, optional): how many subprocesses to use for data + loading. ``0`` means that the data will be loaded in the main + process. (default: ``0``) + drop_last (bool, optional): If True and the number of elements in the + dataset is not a multiple of the combined batch size then the + incomplete batch at the end will be dropped. + persistent_workers (bool, optional): Re-use workers between + iterations if True. + auto_distributed_partitioning (bool, optional): If True, partitions the + dataset for distributed execution automatically. Otherwise, it is + assumed that partitioning has been handled manually. + mode (poptorch.DataLoaderMode, optional): If `DataLoaderMode.Async`, + uses an :py:class:`~poptorch.AsynchronousDataAccessor` to access + the dataset. If `DataLoaderMode.Sync`, accesses the dataset + synchronously. + async_options (Dict[str, Any], optional): Options to pass to + :py:class:`~poptorch.AsynchronousDataAccessor`. + rebatched_worker_size (int, optional): When using AsyncRebatched: batch + size of the tensors loaded by the workers. + Defaults to the combined batch size. + If specified the ``rebatched_worker_size`` must be less than + or equal to the combined batch size. + kwargs (Dict[str, Any], optional): Other options to pass to PyTorch's + ``DataLoader`` constructor.
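A rough usage sketch of the lazily initialized `IPUDataLoader` described above (requires `poptorch`; `train_dataset` and `ipu_options` are placeholders, the options would normally come from `cfg2options` later in this diff):

```python
from mmcv.device.ipu import IPUDataLoader

# Built without options: the real poptorch.DataLoader is not created yet.
loader = IPUDataLoader(train_dataset, options=None, batch_size=2, mode='async')

# Later, once poptorch.Options are available (e.g. cfg2options(...)['training']):
loader.init(options=ipu_options['training'])
for data_batch in loader:
    ...  # batches are collated by the `collate` function above
```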
+ """ + + def __init__(self, + dataset, + options, + batch_size=1, + shuffle=False, + num_workers=0, + drop_last=True, + persistent_workers=True, + auto_distributed_partitioning=True, + mode='sync', + async_options=None, + rebatched_worker_size=None, + **kwargs): + """Lazy init: + + In many frameworks, the dataloader will be constructed before the + initialization of the ipu options, so the lazy init method is used + here, and the real initialization will not be done until the dataloader + needs to be used and the options are input. + """ + # lazy init: sometimes, we cannot get IPU options when build data + # loader + self.kwargs = { + 'dataset': dataset, + 'batch_size': batch_size, + 'shuffle': shuffle, + 'num_workers': num_workers, + 'drop_last': drop_last, + 'persistent_workers': persistent_workers, + 'auto_distributed_partitioning': auto_distributed_partitioning, + 'mode': mode, + 'collate_fn': partial(collate, samples_per_gpu=batch_size), + 'async_options': async_options, + 'rebatched_worker_size': rebatched_worker_size, + **kwargs + } + self.dataset = dataset + self.initialized = False + if options: + self.init(options=options) + + def init(self, options, **kwargs): + if not self.initialized: + kwargs = {**self.kwargs, **kwargs, 'options': options} + if kwargs['mode'] == 'sync': + kwargs['mode'] = poptorch.DataLoaderMode.Sync + elif kwargs['mode'] == 'async': + kwargs['mode'] = poptorch.DataLoaderMode.AsyncRebatched + if kwargs['async_options'] is None: + kwargs['async_options'] = { + 'load_indefinitely': True, + 'buffer_size': 8 + } + if kwargs['rebatched_worker_size'] is None: + kwargs['rebatched_worker_size'] = 128 + super().__init__(**kwargs) + self.initialized = True + + return self diff --git a/mmcv/device/ipu/hierarchical_data_manager.py b/mmcv/device/ipu/hierarchical_data_manager.py new file mode 100755 index 0000000000000000000000000000000000000000..a6f3b3cd2a139bcbc7852e7849071ab4b9fbb76f --- /dev/null +++ b/mmcv/device/ipu/hierarchical_data_manager.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch + +from mmcv.parallel import DataContainer + +# A customized None type for HierarchicalDataManager +HierarchicalDataNone = object() + + +class HierarchicalDataManager: + """A class manage all the tensors in the hierarchical data. + + At present, the input data structure accepted by IPU is limited, + when the input data structure of mmcv varies. + Here, an intermediate class is needed to get and update tensors + from the original data. + + HierarchicalDataManager will record a hierarchical input/output data in + self._hierarchical_data. For example, we have an input data: + {'img': tensorA, 'label': tensorB, 'img_metas': [tensorC, tensorD]} + To enable IPU to use the input, HierarchicalDataManager will collect + the torch tensors from self._hierarchical_data into a tuple like: + (tensorA, tensorB, tensorC, tensorD). + Meanwhile, the return of IPU is a tuple of tensors, HierarchicalDataManager + also have a function named update_all_tensors to update tensors in + self._hierarchical_data which is the output for upper calls. + + Args: + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. 
+ """ + + def __init__(self, logger=None): + self.atomic_types = (int, str, float, np.ndarray, type(None)) + self.warning = warnings.warn if logger is None else logger.warning + # enable or disable input data's shape and value check + self.quick_mode = False + self._hierarchical_data = None + + def quick(self): + self.quick_mode = True + + def compare_atomic_type(self, a, b): + """Compare data, supported datatypes are numpy array and python basic + types.""" + if isinstance(a, np.ndarray): + return np.all(a == b) + else: + return a == b + + def record_hierarchical_data(self, data): + """Record a hierarchical data.""" + if self._hierarchical_data is not None: + if isinstance(data, torch.Tensor): + assert isinstance(self._hierarchical_data, torch.Tensor), \ + 'original hierarchical data is not torch.tensor' + self._hierarchical_data = data + else: + self.update_hierarchical_data(data) + else: + self._hierarchical_data = data + + @property + def hierarchical_data(self): + return self._hierarchical_data + + def update_hierarchical_data(self, + dataA, + dataB=HierarchicalDataNone, + strict=True, + address='data'): + """Update dataB with dataA in-place. + + Args: + dataA (list or dict or tuple): New hierarchical data. + dataB (list or dict or tuple): hierarchical data to update. + if not specified, self.hierarchical_data will be updated then. + strict (bool, optional): If true, an error will be reported + when the following conditions occur: + 1. Non-torch.Tensor data changed. + 2. Torch.Tensor data shape changed. + address (str): Record the address of current data to be updated. + Default: 'data'. + """ + if dataB is HierarchicalDataNone: + dataB = self.hierarchical_data + + # Update with a da ta with the same structure + # but different values(tensors and basic python data types) + if isinstance(dataA, (tuple, list)): + for idx, node in enumerate(dataA): + new_address = '' + if not self.quick_mode: + new_address = address + f'[{str(idx)}]' + assert isinstance(node, type(dataB[idx])),\ + f'data structure changed: {new_address}' + if isinstance(node, torch.Tensor): + dataB[idx] = node + else: + self.update_hierarchical_data( + node, dataB[idx], strict, address=new_address) + elif isinstance(dataA, dict): + for k, v in dataA.items(): + new_address = '' + if not self.quick_mode: + new_address = address + f'[{str(k)}]' + assert isinstance(v, type(dataB[k])),\ + f'data structure changed: {new_address}' + if isinstance(v, torch.Tensor): + dataB[k] = v + else: + self.update_hierarchical_data( + v, dataB[k], strict, address=new_address) + elif isinstance(dataA, self.atomic_types): + if not self.quick_mode: + is_equal = self.compare_atomic_type(dataA, dataB) + if not is_equal: + if strict: + raise ValueError( + 'all data except torch.Tensor should be same, ' + f'but data({address}) is changed.') + else: + self.warning( + f'find a non-torch.Tensor data({type(dataA)}) ' + f'changed, and the address is {address}') + elif isinstance(dataA, DataContainer): + if not self.quick_mode: + assert isinstance(dataB, DataContainer) + new_address = address + '.data' + self.update_hierarchical_data( + dataA.data, dataB.data, False, address=new_address) + else: + raise NotImplementedError( + f'not supported datatype:{type(dataA)}, address is {address}') + + def collect_all_tensors(self, hierarchical_data=None): + """Collect torch.Tensor data from self.hierarchical_data to a list and + return.""" + # get a list of tensor from self._hierarchical_data + if hierarchical_data is None: + hierarchical_data = 
self._hierarchical_data + tensors = [] + if isinstance(hierarchical_data, torch.Tensor): + tensors = [hierarchical_data] + else: + self._collect_tensors(hierarchical_data, tensors) + return tensors + + def _collect_tensors(self, data, tensors): + if isinstance(data, (tuple, list)): + for node in data: + if isinstance(node, torch.Tensor): + tensors.append(node) + else: + self._collect_tensors(node, tensors) + elif isinstance(data, dict): + for v in data.values(): + if isinstance(v, torch.Tensor): + tensors.append(v) + else: + self._collect_tensors(v, tensors) + elif isinstance(data, self.atomic_types): + pass + elif isinstance(data, DataContainer): + self._collect_tensors(data.data, tensors) + else: + raise NotImplementedError(f'not supported datatype:{type(data)}') + + def update_all_tensors(self, tensors): + """Put tensors from tuple back to self.hierarchical_data.""" + if isinstance(self._hierarchical_data, torch.Tensor): + print(tensors, len(tensors)) + assert len(tensors) == 1 + assert isinstance(tensors[0], torch.Tensor) + self._hierarchical_data = tensors[0] + else: + # convert to list if tensors is tuple + tensors = list(tensors) + self._set_tensors(self._hierarchical_data, tensors) + return self.hierarchical_data + + def _set_tensors(self, data, tensors): + if isinstance(data, tuple): + data = list(data) + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = tensors.pop(0) + else: + self._set_tensors(data[idx], tensors) + data = tuple(data) + elif isinstance(data, list): + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = tensors.pop(0) + else: + self._set_tensors(data[idx], tensors) + elif isinstance(data, dict): + for k, v in data.items(): + if isinstance(v, torch.Tensor): + data[k] = tensors.pop(0) + else: + self._set_tensors(v, tensors) + elif isinstance(data, self.atomic_types): + pass + elif isinstance(data, DataContainer): + self._set_tensors(data.data, tensors) + else: + raise NotImplementedError(f'not supported datatype:{type(data)}') + + def clean_all_tensors(self): + """Delete tensors from self.hierarchical_data.""" + self._clean_tensors(self._hierarchical_data) + + def _clean_tensors(self, data): + if isinstance(data, tuple): + data = list(data) + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = None + else: + self._clean_tensors(data[idx]) + data = tuple(data) + elif isinstance(data, list): + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = None + else: + self._clean_tensors(data[idx]) + elif isinstance(data, dict): + for k, v in data.items(): + if isinstance(v, torch.Tensor): + data[k] = None + else: + self._clean_tensors(v) + elif isinstance(data, self.atomic_types): + pass + elif isinstance(data, DataContainer): + self._clean_tensors(data.data) + else: + raise NotImplementedError(f'not supported datatype:{type(data)}') diff --git a/mmcv/device/ipu/hook_wrapper.py b/mmcv/device/ipu/hook_wrapper.py new file mode 100755 index 0000000000000000000000000000000000000000..141afb86d05a42c06fb5c4355cb47cae18e9bb2f --- /dev/null +++ b/mmcv/device/ipu/hook_wrapper.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import HOOKS, LrUpdaterHook, OptimizerHook +from mmcv.utils import TORCH_VERSION, digit_version + + +def wrap_lr_updater_hook(lr_hook_class): + """A wrapper function to wrap any subclass of LrUpdaterHook. + + IPU needs extra operations to upload optimizer settings. 
This wrapper will + override function(_set_lr) of a subclass of LrUpdaterHook. + """ + assert issubclass(lr_hook_class, LrUpdaterHook) + + class ipu_lr_hook_class(lr_hook_class): + + def _set_lr(self, runner, *args, **kwargs): + super()._set_lr(runner, *args, **kwargs) + # convert torch optimizer to poptorch optimizer + runner.model.setOptimizer(runner.optimizer) + + return ipu_lr_hook_class + + +def wrap_optimizer_hook(optimizer_hook_class): + """A wrapper function to wrap OptimizerHook. + + This is an non-intrusive implementation of wrapping optimizer hook (or you + need to change every config file to use IPU optimizer hook) IPU's clip-norm + implementation is different from pytorch, so there should be an error + raised when using clip-norm. + """ + + class ipu_optimizer_hook_class(OptimizerHook): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.grad_clip is not None: + raise NotImplementedError('IPU does not support gradient clip') + + return ipu_optimizer_hook_class + + +if (TORCH_VERSION != 'parrots' + and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + + @HOOKS.register_module() + class IPUFp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook (using PyTorch's implementation). + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, + to take care of the optimization procedure. + + Args: + loss_scale (float | str | dict): Scale factor configuration. + If loss_scale is a float, static loss scaling will be used with + the specified scale. If loss_scale is a string, it must be + 'dynamic', then dynamic loss scaling will be used. + It can also be a dict containing arguments of GradScalar. + Defaults to 512. For Pytorch >= 1.6, mmcv uses official + implementation of GradScaler. If you use a dict version of + loss_scale to create GradScaler, please refer to: + https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler + for the parameters. + + Examples: + >>> loss_scale = dict( + ... init_scale=65536.0, + ... growth_factor=2.0, + ... backoff_factor=0.5, + ... growth_interval=2000 + ... ) + >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + assert grad_clip is None,\ + 'IPU mode does not support `grad_clip` currently' + assert coalesce,\ + 'implemented all reduce in distributed training currently' + assert bucket_size_mb == -1,\ + '`bucket_size_mb` should not be set in IPU mode' + self.distributed = distributed + self._scale_update_param = None + if loss_scale == 'dynamic': + raise NotImplementedError( + 'IPU mode does not support dynamic loss scale currently') + elif isinstance(loss_scale, float): + self.loss_scale = loss_scale + elif isinstance(loss_scale, dict): + raise NotImplementedError( + 'IPU mode supports single scale currently') + else: + raise ValueError( + f'loss_scale should be float, but got {loss_scale} ') + + def after_train_iter(self, runner): + pass + +else: + raise RuntimeError('The IPU mode only supports torch 1.6 and above') diff --git a/mmcv/device/ipu/model_wrapper.py b/mmcv/device/ipu/model_wrapper.py new file mode 100755 index 0000000000000000000000000000000000000000..c345537e29b27cf7fff740269da8643c9570cd36 --- /dev/null +++ b/mmcv/device/ipu/model_wrapper.py @@ -0,0 +1,721 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
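As a hedged configuration sketch, the fp16 hook defined just above could be instantiated like this; per the checks in its constructor, only a static `loss_scale` is accepted and `grad_clip` must stay `None`:

```python
from mmcv.device.ipu import IPUFp16OptimizerHook

# Static loss scaling only: dynamic scaling and dict configs raise
# NotImplementedError, and gradient clipping is not supported on IPU.
optimizer_config = IPUFp16OptimizerHook(loss_scale=512.0, distributed=False)
```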
+import copy +import inspect +from collections import OrderedDict +from typing import Optional, Union + +import poptorch +import torch +import torch.nn as nn +from poptorch import PoplarExecutor, __version__, identity_loss +from poptorch._args_parser import ArgsParser + +from mmcv.runner import auto_fp16 +from .hierarchical_data_manager import HierarchicalDataManager +from .utils import compare_ndarray, model_sharding, recomputation_checkpoint + + +class DictArgsParser(ArgsParser): + """A helper class for handling model input. + + Args: + inputs (list): Inputs of model. + """ + + def __init__(self, inputs): + # Combine args and kwargs: + self._has_variadic_arguments = True + self._varnames = list(inputs.keys()) + self._defaults = [inspect.Parameter.empty for _ in self._varnames] + self._warned_not_contiguous_input = False + + +class WrappedNet(nn.Module): + """A net wrapper for model conversion. + + This wrapper will make some changes and add some extra functions to + training/inference model. + + Args: + model (:obj:`nn.Module`): The model to run. + inputs_manager (:obj:`HierarchicalDataManager`): A parser + converting inputs from tuple to dictionary. + outputs_manager (:obj:`HierarchicalDataManager`): A parser + converting outputs from dictionary to tuple. + inter_outputs_in_cpu (dict): Specify the features to be + recorded. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + """ + + def __init__(self, + model, + inputs_manager, + outputs_manager, + inter_outputs_in_cpu, + modules_to_record=None): + super().__init__() + self.model = model + self.inputs_manager = inputs_manager + self.outputs_manager = outputs_manager + self.training = model.training + # Register a hook function to capture the intermediate features + # generated by the network to align the outputs between ipu and cpu + # Used to confirm whether the implementation of CPU is consistent + # with the implementation of IPU + self.inter_outputs_in_cpu = inter_outputs_in_cpu + if modules_to_record is None: + modules_to_record = [] + + for idx, (name, module) in enumerate(model.named_modules()): + if name in modules_to_record or idx in modules_to_record: + features_hook = self.get_input_output_hook( + name, idx, self.inter_outputs_in_cpu) + module.register_forward_hook(hook=features_hook) + + def get_input_output_hook(self, name, idx, save_dict): + + def input_output_hook(module, fea_in, fea_out): + if isinstance(fea_in, tuple): + fea_in = list(fea_in) + if isinstance(fea_out, tuple): + fea_out = list(fea_out) + save_dict[name] = { + 'fea_in': fea_in, + 'fea_out': fea_out, + 'idx': idx + } + return None + + return input_output_hook + + def forward(self, inputs_tuple): + """This function is used to be compiled to ipu, the inputs and outputs + need to be tuples, so here we need to restore the input back to a + dictionary and convert the output to a tuple.""" + self.inputs_manager.update_all_tensors(inputs_tuple) + kwargs = {**(self.inputs_manager.hierarchical_data)} + if self.training: + outputs = self.forward_train(kwargs) + # tell poptorch which loss will be used finally + identity_loss(outputs['loss'], reduction='none') + else: + outputs = self.forward_eval(kwargs) + + if isinstance(outputs, torch.Tensor): + # currently not support single tensor output, + # need to wrap it with a dictionary, + # use a keyword to identify this case + outputs = {'output of WrappedNet: single tensor': outputs} + 
+ # if there are some features need to be record, add extra outputs + for name in self.inter_outputs_in_cpu: + outputs[name] = self.inter_outputs_in_cpu[name] + + # record all the places of return tensors in the converting stage + # while in the real run stage, all the tensor are changed in-place + # that means the output can be obtained directly outside this function + self.outputs_manager.record_hierarchical_data(outputs) + plain_outputs = self.outputs_manager.collect_all_tensors() + return plain_outputs + + def forward_train(self, kwargs): + optimizer = kwargs.pop('optimizer') + outputs = self.train_step(kwargs, optimizer) + return outputs + + def train_step(self, data, optimizer=None, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating are also defined in + this method, such as GAN. + + Args: + data (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer`, optional): The + optimizer of runner is passed to ``train_step()``. This + argument is unused and reserved. + + Returns: + dict: Dict of outputs. The following fields are contained. + - loss (torch.Tensor): A tensor for back propagation, which \ + can be a weighted sum of multiple losses. + - log_vars (dict): Dict contains all the variables to be sent \ + to the logger. + - num_samples (int): Indicates the batch size (when the model \ + is DDP, it means the batch size on each GPU), which is \ + used for averaging the logs. + """ + losses = self.model(**data) + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) + + return outputs + + def _parse_losses(self, losses): + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(loss.mean() for loss in loss_value) + elif isinstance(loss_value, dict): + for name, value in loss_value.items(): + log_vars[name] = value + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(value for key, value in log_vars.items() if 'loss' in key) + log_vars['loss'] = loss + + return loss, log_vars + + def forward_eval(self, kwargs): + img = kwargs.pop('img') + img_metas = kwargs.pop('img_metas', None) + return_loss = kwargs.pop('return_loss') + assert not return_loss + # TODO Temporarily hard-code to close post_process, + # otherwise, in the third trace(_check_trace), + # post_process will convert output tensor to numpy array automatically, + # resulting in _check_trace failure + outputs = self.model( + img, + img_metas=img_metas, + return_loss=return_loss, + post_process=False) + return outputs + + +class MMPoplarExecutor(PoplarExecutor): + """An executor for inputs/outputs parsing, model compilation, data + alignment and IPU upload/download. + + Args: + model (:obj:`nn.Module`): The model to be compiled. + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. + training (bool): Model in training mode or eval mode. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. 
+ args (argument list): Arguments passed to the `__init__` + method of PoplarExecutor. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of PoplarExecutor. + """ + + def __init__(self, + model, + logger=None, + training=True, + modules_to_record=None, + *args, + **kwargs): + # self.model == self._user_model: input pytorch model + # self._model: wrapped model which is used to compile + # and update weights, these two models use same weights + # wrapped model only accept and output tuple, so + # HierarchicalDataManager will convert dictionary + # to tuple and convert them back + self.inputs_manager = HierarchicalDataManager(logger=logger) + self.outputs_manager = HierarchicalDataManager(logger=logger) + self.logger = logger + # the features calculated by CPU + self.inter_outputs_in_cpu = {} + # the features calculated by IPU + self.inter_outputs_in_ipu = {} + if modules_to_record is None: + # It is possible that the IPU implementation of some operators + # is inconsistent with the expected (CPU), here you can use + # this method to confirm whether there is a problem + self.compare_with_cpu = False + else: + self.compare_with_cpu = True + # move model.fp16_enabled to self.fp16_enabled, + # modify the position where the input is automatically casted to half + if getattr(model, 'fp16_enabled', False): + model.fp16_enabled = False + self.fp16_enabled = True + # make torch.jit.trace convert self._model + model = WrappedNet( + model, + self.inputs_manager, + self.outputs_manager, + self.inter_outputs_in_cpu, + modules_to_record=modules_to_record) + super().__init__(model, training=training, *args, **kwargs) + # overwrite self._args_parser in train_step or val_step + self._args_parser = None + if training: + assert self.training + else: + assert not self.training + + @property + def training(self): + # If trying to get the attribute(training) of self, + # since the class has no training attribute, + # it will automatically look for the training attribute of self.model. + # However, the real attribute we want to check is self._training, + # self.model.training and self._training are often inconsistent. 
+ # It is not clear whether it is a Poptorch bug or a special design, + # temporarily use this function to fix the problem + return self._training # comes from self.model._training + + @auto_fp16(supported_types=(PoplarExecutor, )) + def run_model(self, data_dict): + # this function is used to parse input_dict + # and convert to output_dict + if self.isCompiled(): + self.inputs_manager.record_hierarchical_data(data_dict) + inputs_tuple = tuple(self.inputs_manager.collect_all_tensors()) + else: + # get tensors out of data and put them in a tuple + self.inputs_manager.record_hierarchical_data(data_dict) + inputs_tuple = tuple(self.inputs_manager.collect_all_tensors()) + # turn logger in data manager off after compilation + self.inputs_manager.quick() + self.outputs_manager.quick() + + # parser args in the first iter + if self._args_parser is None: + self._args_parser = DictArgsParser({'args': inputs_tuple}) + + # run or convert model + # the plain_outputs will be used in converting stage + plain_outputs = self(inputs_tuple) + + self.inputs_manager.clean_all_tensors() + + # put list of tensors back to the output dict + # according to the same order + self.outputs_manager.update_all_tensors(plain_outputs) + # get the real output dictionary from self.outputs_manager + output_dict = self.outputs_manager.hierarchical_data + + # split output_dict into inter_outputs_in_ipu + # and output of the torch model + torch_model_output = {} + for name in output_dict: + if name in self.inter_outputs_in_cpu: + self.inter_outputs_in_ipu[name] = output_dict[name] + else: + torch_model_output[name] = output_dict[name] + + if 'output of WrappedNet: single tensor' in output_dict: + assert len(torch_model_output) == 1 + assert isinstance( + torch_model_output['output of WrappedNet: single tensor'], + torch.Tensor) + torch_model_output = \ + torch_model_output['output of WrappedNet: single tensor'] + + return torch_model_output + + def train_step(self, data, optimizer=None, **kwargs): + # arguments from mmcls/models/classifiers/base.py: + # BaseClassifier.train_step + assert self.training + assert len(kwargs) == 0 # TODO, support later if necessary + + # TODO support datacontainer as input + # currently, auto_fp16 and HierarchicalDataManager take too much + # time on traversing datacontainer + data['img_metas'] = None + num_samples = len(data['img'].data) + + # TODO we will ignore optimizer because it will not be used in model, + # support later if necessary + data['optimizer'] = None + output_dict = self.run_model(data) + + # outputs contained loss, log_vars, num_samples, + # only loss(torch.tensor) has been updated + # remove all unchanged vars, left torch.tensor + neat_output_dict = {'loss': output_dict['loss']} + + # re-parse outputs, get back log_vars and num_samples + loss, log_vars = self.model._parse_losses(neat_output_dict) + final_output_dict = dict( + loss=loss, log_vars=log_vars, num_samples=num_samples) + return final_output_dict + + def eval_call(self, img, img_metas=None, return_loss=True, **kwargs): + # arguments from mmdet/models/detectors/base.py:BaseDetector.forward + # tmp usssage for eval mode + assert not self.training + assert len(kwargs) == 0 # TODO, support later if necessary + assert not return_loss + data = {'img': img, 'img_metas': img_metas, 'return_loss': return_loss} + + output_dict = self.run_model(data) + + return output_dict + + def detachFromDevice(self): + if self.isCompiled() and self._is_attached: + super().detachFromDevice() + + def attachToDevice(self): + if self.isCompiled() 
and not self._is_attached: + super().attachToDevice() + + +class TrainEvalModel: + """A class maintaining training MMPoplarExecutor and inference + MMPoplarExecutor. + + Args: + train_model (:obj:`nn.Module`): The training model to be compiled. + ``train_model`` can be None if only executing validation. + eval_model (:obj:`nn.Module`): The inference model to be compiled. + options (mmcv.Config, dict): Options that will be used to compile + and run the model. + optimizer (:obj:`torch.optim.Optimizer`, optional): torch + optimizer, necessary if in training mode + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + """ + + def __init__(self, + train_model, + eval_model, + options, + optimizer, + modules_to_record=None, + logger=None): + if train_model is None: + self._train_executor = None + self.training = False + else: + self._train_executor = get_training_model( + train_model, + options=options['training'], + optimizer=optimizer, + logger=logger, + modules_to_record=modules_to_record) + self.training = True + self._eval_executor = get_inference_model( + eval_model, options=options['inference'], logger=logger) + + @property + def executor(self): + if self.training: + return self._train_executor + else: + return self._eval_executor + + def train(self, mode: bool = True): + """Sets the module in training mode. + + This has any effect only on certain modules. See documentations of + particular modules for details of their behaviors in + training/evaluation mode, if they are affected, + e.g. :class:`Dropout`, :class:`BatchNorm`, etc. + + Args: + mode (bool): whether to set training mode (``True``) or evaluation + mode (``False``). Default: ``True``. + + Returns: + Module: self + """ + if not isinstance(mode, bool): + raise ValueError('training mode is expected to be boolean, ' + f'but got {type(mode)}') + if self._train_executor is None and mode: + raise RuntimeError( + 'The train_executor is not initialized.' + 'If you want to initialize train_executor,' + 'you need to input optimizer when converting pytorch model') + + if mode == self.training: + self.model.train(mode) + return self + else: + if self.isCompiled(): + # copy weights from IPU to cpu before off-load current session + self.copyWeightsToHost() + # detach the current session before change the mode, + # if is training mode and weights are updated, + # poptorch will copy weights from IPU to host + self.detachFromDevice() + + self.training = mode # session will changed with mode changing + self.model.train(mode) + + # after changing mode, attach the current new session, + # and this function will copy weights of model to device + self.attachToDevice() + return self + + def eval(self): + """Sets the module in evaluation mode. + + This has any effect only on certain modules. + See documentations of particular modules + for details of their behaviors in training/evaluation mode, + if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. + + This is equivalent with :meth:`self.train(False) + `. + + See :ref:`locally-disable-grad-doc` for a comparison between + `.eval()` and several similar mechanisms that may be confused with it. 
+ + Returns: + Module: self + """ + return self.train(False) + + def compare_data_between_ipu_and_cpu(self, inter_outputs_in_cpu, + inter_outputs_in_ipu): + for key, val in inter_outputs_in_cpu.items(): + is_tensor = isinstance(val['fea_in'], torch.Tensor) + fea_in_cpu = val['fea_in'] + fea_in_cpu_list = [fea_in_cpu] if is_tensor else fea_in_cpu + fea_in_ipu = inter_outputs_in_ipu[key]['fea_in'] + fea_in_ipu_list = [fea_in_ipu] if is_tensor else fea_in_ipu + + is_tensor = isinstance(val['fea_out'], torch.Tensor) + fea_out_cpu = val['fea_out'] + fea_out_cpu_list = [fea_out_cpu] if is_tensor else fea_out_cpu + fea_out_ipu = inter_outputs_in_ipu[key]['fea_out'] + fea_out_ipu_list = [fea_out_ipu] if is_tensor else fea_out_ipu + + print('comparing layer:', key) + for idx, (featA, featB) in \ + enumerate(zip(fea_in_cpu_list, fea_in_ipu_list)): + print('fea_in, tensor ', idx) + compare_ndarray(featA.detach().numpy(), featB.detach().numpy()) + for idx, (featA, featB) in \ + enumerate(zip(fea_out_cpu_list, fea_out_ipu_list)): + print('fea_out, tensor', idx) + compare_ndarray(featA.detach().numpy(), featB.detach().numpy()) + + # TODO Unified training and eval interface, + # merge train_step(train) and __call__(eval) together + def train_step(self, data, optimizer=None, **kwargs): + assert self.training, 'not supported train_step on eval mode' + inter_outputs_in_cpu = {} + if (self._train_executor.isCompiled() + and self._train_executor.compare_with_cpu): + self.copyWeightsToHost() + # run in CPU mode + self._train_executor.model.train_step(data, optimizer, **kwargs) + inter_outputs_in_cpu = { + **(self._train_executor.inter_outputs_in_cpu) + } + # run in IPU mode + result = self._train_executor.train_step(data, optimizer, **kwargs) + if (self._train_executor.isCompiled() + and self._train_executor.compare_with_cpu + and len(inter_outputs_in_cpu) > 0): + self.compare_data_between_ipu_and_cpu( + inter_outputs_in_cpu, + self._train_executor.inter_outputs_in_ipu) + return result + + # TODO Unified training and eval interface, + # merge train_step(train) and __call__(eval) together + def __call__(self, *args, **kwargs): + if self.training: + raise NotImplementedError('use train_step rather than __call__') + else: + return self._eval_executor.eval_call(*args, **kwargs) + + def __getattr__(self, attr): + return getattr(self.executor, attr) + + +def get_training_model(model: nn.Module, + options: Optional[poptorch.Options] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + logger=None, + modules_to_record=None) -> poptorch.PoplarExecutor: + """Create a PopTorch training model from a PyTorch model, running on IPU + hardware in training mode. + + Note: + PopTorch makes a shallow copy of the model. Changes to the + parameters in the returned training model affect the original model + and vice versa. However, primitive variable types are not synced: for + example calling ``model.train()`` on the original model, which + changes the ``training`` bool of the model instance, will not alter the + model returned by this function. You may need to call ``model.train()`` + on your model before you call this function for correct behavior. + + Args: + model (:obj:`nn.Module`): The model to run. + options (poptorch.Options): Options that will be used to compile + and run the model. + optimizer (:obj:`torch.optim.Optimizer`, optional): The optimizers + to apply during training. + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. 
+ modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + + Returns: + The :class:`poptorch.PoplarExecutor` wrapper to use in place + of ``model``. + """ + # Create a copy of the original model in case it needs to be wrapped + maybe_wrapped_model = copy.copy(model) + + return MMPoplarExecutor( + model=maybe_wrapped_model, + logger=logger, + options=options, + training=True, + optimizer=optimizer, + user_model=model, + modules_to_record=modules_to_record, + poptorch_version=__version__) + + +def get_inference_model(model: Union[nn.Module, poptorch.PoplarExecutor], + options: Optional[poptorch.Options] = None, + logger=None) -> poptorch.PoplarExecutor: + """Create a PopTorch inference model from a PyTorch model, running on IPU + hardware in inference mode. + + Note: + PopTorch makes a shallow copy of the model. Changes to the + parameters in the returned inference model affect the original model + and vice versa. However, primitive variable types are not synced: for + example calling ``model.eval()`` on the original model will not alter + the model returned by this function. You may need to call + ``model.eval()`` on your model before you call this function for + correct behavior. + + Args: + model (:obj:`nn.Module`): The model to run. + options (poptorch.Options): Options that will be used to compile + and run the model. + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. + + Returns: + The :class:`poptorch.PoplarExecutor` wrapper to use in place of + ``model``. + """ + + return MMPoplarExecutor( + model=copy.copy(model), + logger=logger, + options=options, + training=False, + poptorch_version=__version__) + + +def ipu_model_wrapper(model, + options, + optimizer=None, + logger=None, + modules_to_record=None, + ipu_model_cfg=None, + fp16_cfg=None): + """Convert torch model to IPU model. + + Args: + model (nn.Module): The target model to be converted. + options (dict[str, poptorch.Options]): IPU options, generated + by :func:`cfg2options`. + optimizer (:obj:`torch.optim.Optimizer`, optional): torch + optimizer, necessary if in training mode + logger (:obj:`logging.Logger`): Logger used during training. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + ipu_model_cfg (dict): A dictionary contains train_split_edges and + train_ckpt_nodes, See details in :func:`model_sharding` and + :func:`recomputation_checkpoint` functions. + fp16_cfg (dict): Config for IPU fp16 training. Currently supports + configs: `loss_scale`, `velocity_accum_type` and `accum_type`. + See details in + https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html + + Returns: + TrainEvalModel: IPU wrapped model. 
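An illustrative (untested) call of `ipu_model_wrapper` using the helpers from this diff; `model`, `optimizer`, `logger` and `data_batch` are placeholders, and the empty `train_cfg`/`eval_cfg` dicts simply fall back to the default IPU options produced by `cfg2options`:

```python
from mmcv.device.ipu import cfg2options, ipu_model_wrapper

options = cfg2options(dict(train_cfg=dict(), eval_cfg=dict()))
ipu_model = ipu_model_wrapper(
    model, options, optimizer=optimizer, logger=logger)
outputs = ipu_model.train_step(data_batch, optimizer)  # batch from IPUDataLoader
```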
+ """ + if ipu_model_cfg is None: + ipu_model_cfg = {} + training = model.training if optimizer is not None else False + # set mixed-precision + if fp16_cfg is not None: + from mmcv.runner import wrap_fp16_model + loss_scale = fp16_cfg['loss_scale'] + wrap_fp16_model(model) + model.half() + # TODO tmp ussage to set loss scaling for torch original optimizer + if optimizer is not None: + optimizer.loss_scaling = loss_scale + if fp16_cfg.get('velocity_accum_type', False): + if fp16_cfg['velocity_accum_type'] == 'half': + optimizer.velocity_accum_type = torch.half + else: + optimizer.velocity_accum_type = torch.float32 + if fp16_cfg.get('accum_type', False): + if fp16_cfg['accum_type'] == 'half': + optimizer.accum_type = torch.half + else: + optimizer.accum_type = torch.float32 + # TODO support feature alignment for fp16 + if modules_to_record is not None: + raise NotImplementedError( + 'Feature alignment for fp16 is not implemented') + + # set model partition + if optimizer is None: + train_model = None + else: + # split model into multi-IPUs if specified + train_model = model_sharding( + copy.copy(model).train(), + ipu_model_cfg.get('train_split_edges', [])) + + recomputation_checkpoint(train_model, + ipu_model_cfg.get('train_ckpt_nodes', [])) + + # TODO support feature alignment for gradient accumulation mode + gradient_accumulation = \ + getattr(options['training'].Training, 'gradient_accumulation', 1) + if gradient_accumulation > 1: + assert modules_to_record is None, \ + 'Feature alignment for grad-accumulation mode not implemented' + + # TODO support feature alignment for multi-replica mode + replication_factor = \ + getattr(options['training'], 'replication_factor', 1) + if replication_factor > 1: + assert modules_to_record is None, \ + 'Feature alignment for multi-replica mode not implemented' + + # TODO supports different model partitions between train and eval mode + assert len(ipu_model_cfg.get('eval_split_edges', [])) == 0,\ + 'Currently, BeginBlock can only be used once on the same model' + eval_model = copy.copy(model).eval() + + # wrap model for compilation + model = TrainEvalModel( + train_model, + eval_model, + options=options, + optimizer=optimizer, + logger=logger, + modules_to_record=modules_to_record) + model.train(training) + return model diff --git a/mmcv/device/ipu/runner.py b/mmcv/device/ipu/runner.py new file mode 100755 index 0000000000000000000000000000000000000000..e2d4922677e08b2d6b5132a01034de8b043fa3f1 --- /dev/null +++ b/mmcv/device/ipu/runner.py @@ -0,0 +1,142 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmcv.runner import (HOOKS, RUNNERS, BaseRunner, EpochBasedRunner, + IterBasedRunner) +from mmcv.utils import IS_IPU_AVAILABLE + +if IS_IPU_AVAILABLE: + from .dataloader import IPUDataLoader + from .hook_wrapper import (IPUFp16OptimizerHook, wrap_lr_updater_hook, + wrap_optimizer_hook) + from .model_wrapper import ipu_model_wrapper + from .utils import build_from_cfg_with_wrapper, cfg2options + + +class IPUBaseRunner(BaseRunner): + """A base runner for IPU. + + This runner has some extra processes for IPU which are shown below: + + 1. Parse options for IPU + 2. wrap pytorch model for IPU + 3. Raise errors while encountering illegal usage + 4. Input IPU options and initialize dataloader if finding an instance + of IPUDataLoader + + Args: + model (:obj:`nn.Module`): The model to run. + options_cfg (mmcv.Config, dict): Options that will be used to compile + and run the model. 
+ modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + ipu_model_cfg (mmcv.Config, dict): Config of model partition and + recomputing checkpoint + fp16_cfg (mmcv.Config): Config for fp16 training. + batch_processor (callable): A callable method that process a data + batch. Should be None for IPU runner + kwargs (Dict[str, Any], optional): Keyword arguments will be passed to + ``base_runner.BaseRunner``. + """ + + def __init__(self, + model, + options_cfg=None, + modules_to_record=None, + ipu_model_cfg=None, + fp16_cfg=None, + batch_processor=None, + **kwargs): + assert hasattr(model, 'train_step') and batch_processor is None,\ + 'only support model with train_step' + + if options_cfg is None: + options_cfg = {} + # call BaseRunner.__init__() here + super().__init__(model, **kwargs) + + # process options of ipu + if IS_IPU_AVAILABLE: + self.options = cfg2options(options_cfg) + self.model = ipu_model_wrapper( + self.model, + self.options, + self.optimizer, + self.logger, + modules_to_record=modules_to_record, + ipu_model_cfg=ipu_model_cfg, + fp16_cfg=fp16_cfg) + else: + raise NotImplementedError('cpu mode on IPURunner is not supported') + + def register_lr_hook(self, lr_config): + if lr_config is None: + return + assert isinstance(lr_config, dict) + assert 'policy' in lr_config + policy_type = lr_config.pop('policy') + # If the type of policy is all in lower case, + # e.g., 'cyclic', then its first letter will be capitalized, + # e.g., to be 'Cyclic'. + # This is for the convenient usage of Lr updater. + # Since this is not applicable for ` + # CosineAnnealingLrUpdater`, the string will not be changed + # if it contains capital letters. + if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'LrUpdaterHook' + lr_config['type'] = hook_type + hook = build_from_cfg_with_wrapper(lr_config, HOOKS, + wrap_lr_updater_hook) + self.register_hook(hook, priority='VERY_HIGH') + + def register_optimizer_hook(self, optimizer_config): + if optimizer_config is None: + return + assert isinstance(optimizer_config, (dict, IPUFp16OptimizerHook)) + if isinstance(optimizer_config, dict): + optimizer_config.setdefault('type', 'OptimizerHook') + hook = build_from_cfg_with_wrapper(optimizer_config, HOOKS, + wrap_optimizer_hook) + else: + hook = optimizer_config + self.register_hook(hook, priority='ABOVE_NORMAL') + + def run(self, data_loaders, workflow, *args, **kwargs): + for i, flow in enumerate(workflow): + mode, _ = flow + # initialize IPU dataloader if not initialized + assert isinstance(data_loaders[i], IPUDataLoader),\ + 'IPU runner can only work with `IPUDataLoader`' + data_loaders[i].init(options=self.get_options(mode)) + + super().run(data_loaders, workflow, *args, **kwargs) + + def get_options(self, mode): + if mode == 'train': + return self.options['training'] + elif mode == 'val': + return self.options['inference'] + else: + raise ValueError(f'mode should be train or val but got {mode}') + + +@RUNNERS.register_module() +class IPUEpochBasedRunner(IPUBaseRunner, EpochBasedRunner): + """Epoch-based Runner for IPU. + + The Inheritance order(MRO) is: IPUEpochBasedRunner -> IPUBaseRunner -> + EpochBasedRunner -> BaseRunner This runner train models epoch by epoch. + """ + pass + + +@RUNNERS.register_module() +class IPUIterBasedRunner(IPUBaseRunner, IterBasedRunner): + """Iteration-based Runner for IPU. 
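For context, the registered IPU runners above would typically be created through mmcv's runner registry. A hedged sketch (an IPU environment with poptorch is required; `model`, `optimizer`, `logger` and the work dir are placeholders, and `options_cfg` follows the `cfg2options` format):

```python
from mmcv.runner import build_runner

runner = build_runner(
    dict(type='IPUEpochBasedRunner', max_epochs=12, options_cfg=dict()),
    default_args=dict(
        model=model,            # must implement train_step
        optimizer=optimizer,
        work_dir='./work_dirs/ipu_example',
        logger=logger))
```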
+ + The Inheritance order(MRO) is: IPUIterBasedRunner -> IPUBaseRunner -> + IterBasedRunner -> BaseRunner This runner train models iteration by + iteration. + """ + pass diff --git a/mmcv/device/ipu/utils.py b/mmcv/device/ipu/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..79709db1ee1282e8daa6614ceb23481d3cd58338 --- /dev/null +++ b/mmcv/device/ipu/utils.py @@ -0,0 +1,244 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect + +import numpy as np +import popart +import poptorch +import torch +import torch.nn as nn + +from mmcv.utils import Registry + + +def _options_assigner(cfg, options_node): + # set popart.options by config + # cfg: dict, python data type + # options_node: python module or function + if isinstance(cfg, dict): + for key in cfg: + _options_assigner(cfg[key], getattr(options_node, key)) + elif isinstance(cfg, (int, float, str, list)): + if callable(options_node): + options_node(cfg) + else: + error_msg = f'options_node type {type(options_node)} not supported' + raise NotImplementedError(error_msg) + else: + error_msg = f'cfg type {type(cfg)} not supported' + raise NotImplementedError(error_msg) + + +def cfg2options(cfg): + """Parse dictionary to ipu options. + + Args: + cfg (dict): A dictionary of ipu settings. + + Returns: + dict[str, poptorch.Options]: Training options and inference options + of IPU. + """ + # set ipu options for inference and training by config + train_cfg = cfg.pop('train_cfg', {}) + eval_cfg = cfg.pop('eval_cfg', {}) + eval_cfg['replicationFactor'] = 1 # eval mode only use one replica + eval_cfg['executionStrategy'] = 'ShardedExecution' + # overwrite default ipu cfg with specified train cfgs + training_ipu_cfg = {**cfg, **train_cfg} + # overwrite default ipu cfg with specified eval cfgs + inference_ipu_cfg = {**cfg, **eval_cfg} + + ipu_options = { + 'training': _cast_to_options(training_ipu_cfg), + 'inference': _cast_to_options(inference_ipu_cfg) + } + + # TODO configure these codes + ipu_options['training']._Popart.set('disableGradAccumulationTensorStreams', + True) + ipu_options['training']._Popart.set( + 'accumulateOuterFragmentSettings.schedule', + int(popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized)) + ipu_options['training'].Precision.enableStochasticRounding(True) + + return ipu_options + + +def _cast_to_options(cfg): + # If it cannot be directly assigned, use if statement to parse it, + # and if it can be directly assigned, use _options_assigner to assign + options = poptorch.Options() + + if 'availableMemoryProportion' in cfg: + available_memory_proportion = cfg.pop('availableMemoryProportion') + mem_props = {} + for i, mem_prop in enumerate(available_memory_proportion): + mem_props[f'IPU{i}'] = mem_prop + options.setAvailableMemoryProportion(mem_props) + + if 'executionStrategy' in cfg: + execution_strategy = cfg.pop('executionStrategy') + if execution_strategy == 'SameAsIpu': + options.setExecutionStrategy( + poptorch.PipelinedExecution( + getattr(poptorch.AutoStage, execution_strategy))) + elif execution_strategy == 'ShardedExecution': + options.setExecutionStrategy(poptorch.ShardedExecution()) + else: + raise NotImplementedError( + 'executionStrategy should be "SameAsIpu" or "ShardedExecution"' + f', but got {execution_strategy}') + + if 'partialsType' in cfg: + partials_type = cfg.pop('partialsType') + options.Precision.setPartialsType(getattr( + torch, partials_type)) # half or float + + _options_assigner(cfg, options) + return options + + +def model_sharding(model, split_edges): 
+ """split models in-place into multi-IPUs. + + Args: + model (nn.Module): The target model to be split. + split_edges (list of dict): Model layer names or layer numbers + of split edge. Each item of ``split_edges`` is a dictionary, + which may contain the following key-pairs: + + - layer_to_call: PyTorch module to assign to the block + - user_id (optional): A user defined identifier for the block. + - ipu_id: The id of the IPU to run on. + + Examples: + >>> split_edges = [ + ... dict(layer_to_call='model.conv1', ipu_id=0), + ... dict(layer_to_call='model.conv3', ipu_id=1)] + >>> sharding_model = model_sharding(torch_model, split_edges) + + Returns: + nn.Module: Split model. + """ + if len(split_edges) == 0: + return model + assert isinstance(split_edges, list) + spilt_edges_dict = {edge['layer_to_call']: edge for edge in split_edges} + + for idx, (name, module) in enumerate(model.named_modules()): + if idx in spilt_edges_dict and name in spilt_edges_dict: + raise ValueError( + 'The same layer is referenced twice while doing model' + f' partition: idx is {idx} and name is {name}') + + edge = spilt_edges_dict.pop(name, None) + edge = spilt_edges_dict.pop(idx, edge) + if edge is not None: + poptorch.BeginBlock(module, edge.get('user_id', name), + edge['ipu_id']) + + # ensure all split_edges are used + if len(spilt_edges_dict) > 0: + split_edge_names = list(spilt_edges_dict.keys()) + raise RuntimeError( + f'split_edges: {split_edge_names} are not contained in the model') + return model + + +def recomputation_checkpoint(model: nn.Module, module_names: list): + """Annotates the output of a module to be checkpointed instead of + recomputed. + + If recomputation mode is enabled, ipu will release the activations of + the middle layers to save memory. During the backward of gradient, + the activation of the middle layer will be recalculated again. + This function is used to declare the activations of some intermediate + layers that need to be saved in order to skip the recomputation of + some layers. + + Args: + model (nn.Module): The target model to apply recomputation + checkpoint. + module_names (list): Layer names of module. + """ + + def recompute_outputs(module, inputs, outputs): + if isinstance(outputs, tuple): + return tuple(poptorch.recomputationCheckpoint(y) for y in outputs) + else: + return poptorch.recomputationCheckpoint(outputs) + + for name, module in model.named_modules(): + if name in module_names: + module.register_forward_hook(recompute_outputs) + module_names.remove(name) + + # check all module_names are used + assert len(module_names) == 0,\ + f'recomputed nodes: {module_names} are not contained in the model' + + +def compare_ndarray(featA, featB, rtol=1e-3, atol=1e-5): + """Align data between two activations or weights.""" + try: + np.testing.assert_allclose(featA, featB, rtol=rtol, atol=atol) + except AssertionError as e: + print(e) + + +def build_from_cfg_with_wrapper(cfg, + registry, + wrapper_func=None, + default_args=None): + """Build a module from config dict and wrap module with "wrapper_func". + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + registry (:obj:`Registry`): The registry to search the type from. + default_args (dict, optional): Default initialization arguments. + wrapper_func (function): Used to wrap class + + Returns: + object: The constructed object. 
+ """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'type' not in cfg: + if default_args is None or 'type' not in default_args: + raise KeyError( + '`cfg` or `default_args` must contain the key "type", ' + f'but got {cfg}\n{default_args}') + if not isinstance(registry, Registry): + raise TypeError('registry must be an mmcv.Registry object, ' + f'but got {type(registry)}') + if not (isinstance(default_args, dict) or default_args is None): + raise TypeError('default_args must be a dict or None, ' + f'but got {type(default_args)}') + + args = cfg.copy() + + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + obj_type = args.pop('type') + if isinstance(obj_type, str): + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError( + f'{obj_type} is not in the {registry.name} registry') + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + if wrapper_func is None: + wrapped_obj_cls = obj_cls + else: + wrapped_obj_cls = wrapper_func(obj_cls) + try: + return wrapped_obj_cls(**args) + except Exception as e: + # Normal TypeError does not print class name. + raise type(e)(f'{wrapped_obj_cls.__name__}: {e}') diff --git a/mmcv/device/mlu/__init__.py b/mmcv/device/mlu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..77c71ccf3ce38f3cbc9911f1d9d4b05a531771f2 --- /dev/null +++ b/mmcv/device/mlu/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_parallel import MLUDataParallel +from .distributed import MLUDistributedDataParallel + +__all__ = ['MLUDataParallel', 'MLUDistributedDataParallel'] diff --git a/mmcv/device/mlu/_functions.py b/mmcv/device/mlu/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..75660fa9b3635fed049cb150639244a658534824 --- /dev/null +++ b/mmcv/device/mlu/_functions.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch + + +def scatter(input: Union[List, torch.Tensor], devices: List) -> List: + """scatter copies tensor to MLU directly.""" + if isinstance(input, list): + outputs = [scatter(_input, devices) for _input in input] + return outputs + elif isinstance(input, torch.Tensor): + output = input.contiguous() + return output.to('mlu') if devices != [-1] else output + else: + raise Exception(f'Unknown type {type(input)}.') + + +class Scatter: + + @staticmethod + def forward(target_mlus, input): + outputs = scatter(input, target_mlus) + return tuple(outputs) if isinstance(outputs, list) else (outputs, ) diff --git a/mmcv/device/mlu/data_parallel.py b/mmcv/device/mlu/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..ebe14c0a55c92f96ec7f782a591ac10b007942dc --- /dev/null +++ b/mmcv/device/mlu/data_parallel.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch + +from mmcv.parallel import MMDataParallel +from .scatter_gather import scatter_kwargs + + +class MLUDataParallel(MMDataParallel): + """The MLUDataParallel module that supports DataContainer. + + MLUDataParallel is a class inherited from MMDataParall, which supports + MLU training and inference only. + + The main differences with MMDataParallel: + + - It only supports single-card of MLU, and only use first card to + run training and inference. 
+
+    - It uses direct host-to-device copy instead of stream-background
+      scatter.
+
+    .. warning::
+        MLUDataParallel only supports single MLU training; if you need to
+        train with multiple MLUs, please use MLUDistributedDataParallel
+        instead. If you have multiple MLUs, you can set the environment
+        variable ``MLU_VISIBLE_DEVICES=0`` (or any other card number(s))
+        to specify the running device.
+
+    Args:
+        module (:class:`nn.Module`): Module to be encapsulated.
+        dim (int): Dimension used to scatter the data. Defaults to 0.
+    """
+
+    def __init__(self, *args, dim=0, **kwargs):
+        super().__init__(*args, dim=dim, **kwargs)
+        self.device_ids = [0]
+        self.src_device_obj = torch.device('mlu:0')
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/mlu/distributed.py b/mmcv/device/mlu/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..3768c754c908b219fd5a770d69e6ed5416781ba8
--- /dev/null
+++ b/mmcv/device/mlu/distributed.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.parallel import MMDistributedDataParallel
+from .scatter_gather import scatter_kwargs
+
+
+class MLUDistributedDataParallel(MMDistributedDataParallel):
+    """The DDP module that supports DataContainer.
+
+    MLUDDP has one difference from MMDDP: it moves data to MLU by copying
+    instead of scattering.
+    """
+
+    def to_kwargs(self, inputs, kwargs, device_id):
+        # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8
+        # to move all tensors to device_id
+        return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim)
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/mlu/scatter_gather.py b/mmcv/device/mlu/scatter_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b0c9b96f51252e4c510f66a2ec5fb7522716e29
--- /dev/null
+++ b/mmcv/device/mlu/scatter_gather.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmcv.parallel.data_container import DataContainer
+from ._functions import Scatter
+
+
+def scatter(inputs, target_mlus, dim=0):
+    """Scatter inputs to target MLUs.
+
+    The only difference from original :func:`scatter` is to add support for
+    :type:`~mmcv.parallel.DataContainer`.
+    """
+
+    def scatter_map(obj):
+        if isinstance(obj, torch.Tensor):
+            if target_mlus != [-1]:
+                obj = obj.to('mlu')
+                return [obj]
+            else:
+                # for CPU inference we use self-implemented scatter
+                return Scatter.forward(target_mlus, obj)
+        if isinstance(obj, DataContainer):
+            if obj.cpu_only:
+                return obj.data
+            else:
+                return Scatter.forward(target_mlus, obj.data)
+        if isinstance(obj, tuple) and len(obj) > 0:
+            return list(zip(*map(scatter_map, obj)))
+        if isinstance(obj, list) and len(obj) > 0:
+            out = list(map(list, zip(*map(scatter_map, obj))))
+            return out
+        if isinstance(obj, dict) and len(obj) > 0:
+            out = list(map(type(obj), zip(*map(scatter_map, obj.items()))))
+            return out
+        return [obj for _ in target_mlus]
+
+    # After scatter_map is called, a scatter_map cell will exist. This cell
+    # has a reference to the actual function scatter_map, which has references
+    # to a closure that has a reference to the scatter_map cell (because the
+    # fn is recursive). To avoid this reference cycle, we set the function to
+    # None, clearing the cell
+    try:
+        return scatter_map(inputs)
+    finally:
+        scatter_map = None
+
+
+def scatter_kwargs(inputs, kwargs, target_mlus, dim=0):
+    """Scatter with support for kwargs dictionary."""
+    inputs = scatter(inputs, target_mlus, dim) if inputs else []
+    kwargs = scatter(kwargs, target_mlus, dim) if kwargs else []
+    if len(inputs) < len(kwargs):
+        inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
+    elif len(kwargs) < len(inputs):
+        kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
+    inputs = tuple(inputs)
+    kwargs = tuple(kwargs)
+    return inputs, kwargs
diff --git a/mmcv/device/mps/__init__.py b/mmcv/device/mps/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e28144ef0ae8cf65527cefc469d07c7ff854c688
--- /dev/null
+++ b/mmcv/device/mps/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_parallel import MPSDataParallel
+
+__all__ = ['MPSDataParallel']
diff --git a/mmcv/device/mps/data_parallel.py b/mmcv/device/mps/data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ae5396d24193376432ae98b792ec89fac678738
--- /dev/null
+++ b/mmcv/device/mps/data_parallel.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from mmcv.parallel import MMDataParallel
+from ..scatter_gather import scatter_kwargs
+
+
+class MPSDataParallel(MMDataParallel):
+    """The MPSDataParallel module that supports DataContainer.
+
+    MPSDataParallel is a class inherited from MMDataParallel, which
+    supports MPS training and inference only.
+
+    The main differences with MMDataParallel:
+
+    - It only supports a single MPS device, and only the first device is
+      used to run training and inference.
+
+    - It uses direct host-to-device copy instead of stream-background
+      scatter.
+
+    Args:
+        module (:class:`nn.Module`): Module to be encapsulated.
+        dim (int): Dimension used to scatter the data. Defaults to 0.
+    """
+
+    def __init__(self, *args, dim=0, **kwargs):
+        super().__init__(*args, dim=dim, **kwargs)
+        self.device_ids = [0]
+        self.src_device_obj = torch.device('mps:0')
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/scatter_gather.py b/mmcv/device/scatter_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..744b0ca51e9de4cb7c43d60a986621461519f781
--- /dev/null
+++ b/mmcv/device/scatter_gather.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmcv.parallel.data_container import DataContainer
+from mmcv.utils import deprecated_api_warning
+from ._functions import Scatter
+from .utils import get_device
+
+
+@deprecated_api_warning({'target_mlus': 'target_devices'})
+def scatter(inputs, target_devices, dim=0):
+    """Scatter inputs to target devices.
+
+    The only difference from original :func:`scatter` is to add support for
+    :type:`~mmcv.parallel.DataContainer`.
+ """ + current_device = get_device() + + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + if target_devices != [-1]: + obj = obj.to(current_device) + return [obj] + else: + # for CPU inference we use self-implemented scatter + return Scatter.forward(target_devices, obj) + if isinstance(obj, DataContainer): + if obj.cpu_only: + return obj.data + else: + return Scatter.forward(target_devices, obj.data) + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + out = list(map(list, zip(*map(scatter_map, obj)))) + return out + if isinstance(obj, dict) and len(obj) > 0: + out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return out + return [obj for _ in target_devices] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + return scatter_map(inputs) + finally: + scatter_map = None + + +@deprecated_api_warning({'target_mlus': 'target_devices'}) +def scatter_kwargs(inputs, kwargs, target_devices, dim=0): + """Scatter with support for kwargs dictionary.""" + inputs = scatter(inputs, target_devices, dim) if inputs else [] + kwargs = scatter(kwargs, target_devices, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/mmcv/device/utils.py b/mmcv/device/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e2adec08dd98ad83cce3a9c28d3a6651808f7112 --- /dev/null +++ b/mmcv/device/utils.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE + + +def get_device() -> str: + """Returns the currently existing device type. + + Returns: + str: cuda | mlu | mps | cpu. + """ + if IS_CUDA_AVAILABLE: + return 'cuda' + elif IS_MLU_AVAILABLE: + return 'mlu' + elif IS_MPS_AVAILABLE: + return 'mps' + else: + return 'cpu' diff --git a/mmcv/engine/test.py b/mmcv/engine/test.py index f236b1cda2f39517bda3e4cce9badc19c6cbf190..83546caec47fb11952fd820b342c71b83b74fac2 100644 --- a/mmcv/engine/test.py +++ b/mmcv/engine/test.py @@ -4,15 +4,18 @@ import pickle import shutil import tempfile import time +from typing import Optional import torch import torch.distributed as dist +import torch.nn as nn +from torch.utils.data import DataLoader import mmcv from mmcv.runner import get_dist_info -def single_gpu_test(model, data_loader): +def single_gpu_test(model: nn.Module, data_loader: DataLoader) -> list: """Test model with a single gpu. This method tests model with a single gpu and displays test progress bar. @@ -41,7 +44,10 @@ def single_gpu_test(model, data_loader): return results -def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): +def multi_gpu_test(model: nn.Module, + data_loader: DataLoader, + tmpdir: Optional[str] = None, + gpu_collect: bool = False) -> Optional[list]: """Test model with multiple gpus. 
This method tests model with multiple gpus and collects the results @@ -82,13 +88,15 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): # collect results from all ranks if gpu_collect: - results = collect_results_gpu(results, len(dataset)) + result_from_ranks = collect_results_gpu(results, len(dataset)) else: - results = collect_results_cpu(results, len(dataset), tmpdir) - return results + result_from_ranks = collect_results_cpu(results, len(dataset), tmpdir) + return result_from_ranks -def collect_results_cpu(result_part, size, tmpdir=None): +def collect_results_cpu(result_part: list, + size: int, + tmpdir: Optional[str] = None) -> Optional[list]: """Collect results under cpu mode. On cpu mode, this function will save the results on different gpus to @@ -126,7 +134,8 @@ def collect_results_cpu(result_part, size, tmpdir=None): else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir - mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + part_file = osp.join(tmpdir, f'part_{rank}.pkl') # type: ignore + mmcv.dump(result_part, part_file) dist.barrier() # collect all parts if rank != 0: @@ -135,7 +144,7 @@ def collect_results_cpu(result_part, size, tmpdir=None): # load results of all parts from tmp dir part_list = [] for i in range(world_size): - part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_file = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore part_result = mmcv.load(part_file) # When data is severely insufficient, an empty part_result # on a certain gpu could makes the overall outputs empty. @@ -148,11 +157,11 @@ def collect_results_cpu(result_part, size, tmpdir=None): # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir - shutil.rmtree(tmpdir) + shutil.rmtree(tmpdir) # type: ignore return ordered_results -def collect_results_gpu(result_part, size): +def collect_results_gpu(result_part: list, size: int) -> Optional[list]: """Collect results under gpu mode. 
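The updated `multi_gpu_test` above now carries explicit type hints and returns `Optional[list]`, because only rank 0 receives the collected results. A minimal calling-pattern sketch, assuming `model`, `data_loader` and the distributed environment are already set up elsewhere:

```python
from mmcv.engine import multi_gpu_test
from mmcv.runner import get_dist_info

# gpu_collect=True gathers results via collect_results_gpu; with False a
# temporary directory and collect_results_cpu are used instead.
results = multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=True)

rank, _ = get_dist_info()
if rank == 0:
    # Only rank 0 receives the ordered results; other ranks get None.
    print(len(results))
```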
On gpu mode, this function will encode results to gpu tensors and use gpu @@ -200,3 +209,5 @@ def collect_results_gpu(result_part, size): # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results + else: + return None diff --git a/mmcv/fileio/file_client.py b/mmcv/fileio/file_client.py index b2d622868cdd006dc7446bcde0dc54731c17116a..ee7c3164e2c631c546dfe3345c45f8b8394a9995 100644 --- a/mmcv/fileio/file_client.py +++ b/mmcv/fileio/file_client.py @@ -8,7 +8,7 @@ import warnings from abc import ABCMeta, abstractmethod from contextlib import contextmanager from pathlib import Path -from typing import Iterable, Iterator, Optional, Tuple, Union +from typing import Any, Generator, Iterator, Optional, Tuple, Union from urllib.request import urlopen import mmcv @@ -64,7 +64,8 @@ class CephBackend(BaseStorageBackend): raise ImportError('Please install ceph to enable CephBackend.') warnings.warn( - 'CephBackend will be deprecated, please use PetrelBackend instead') + 'CephBackend will be deprecated, please use PetrelBackend instead', + DeprecationWarning) self._client = ceph.S3Client() assert isinstance(path_mapping, dict) or path_mapping is None self.path_mapping = path_mapping @@ -209,9 +210,9 @@ class PetrelBackend(BaseStorageBackend): """ if not has_method(self._client, 'delete'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `delete` method, please use a higher version or dev' - ' branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `delete` method, please use a higher version or dev' + ' branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -229,9 +230,9 @@ class PetrelBackend(BaseStorageBackend): if not (has_method(self._client, 'contains') and has_method(self._client, 'isdir')): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `contains` and `isdir` methods, please use a higher' - 'version or dev branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `contains` and `isdir` methods, please use a higher' + 'version or dev branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -246,13 +247,13 @@ class PetrelBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. + ``False`` otherwise. """ if not has_method(self._client, 'isdir'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `isdir` method, please use a higher version or dev' - ' branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `isdir` method, please use a higher version or dev' + ' branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -266,13 +267,13 @@ class PetrelBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. + otherwise. 
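The `LmdbBackend` rewrite just below stops opening the LMDB environment in `__init__` and instead connects lazily on the first `get`, which keeps construction cheap when the client is created before dataloader workers are spawned. A small sketch of the resulting behaviour, using a hypothetical database path and key and assuming the `lmdb` backend name remains registered with `FileClient`:

```python
from mmcv.fileio import FileClient

# Constructing the client no longer calls lmdb.open.
client = FileClient(backend='lmdb', db_path='./example.lmdb')  # hypothetical path

# The environment is opened on the first read, inside whichever process
# actually performs it.
value_buf = client.get('example_key')  # hypothetical key
```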
""" if not has_method(self._client, 'contains'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `contains` method, please use a higher version or ' - 'dev branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `contains` method, please use a higher version or ' + 'dev branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -297,7 +298,10 @@ class PetrelBackend(BaseStorageBackend): return '/'.join(formatted_paths) @contextmanager - def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + def get_local_path( + self, + filepath: Union[str, + Path]) -> Generator[Union[str, Path], None, None]: """Download a file from ``filepath`` and return a temporary path. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It @@ -362,9 +366,9 @@ class PetrelBackend(BaseStorageBackend): """ if not has_method(self._client, 'list'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `list` method, please use a higher version or dev' - ' branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `list` method, please use a higher version or dev' + ' branch instead.') dir_path = self._map_path(dir_path) dir_path = self._format_path(dir_path) @@ -473,17 +477,16 @@ class LmdbBackend(BaseStorageBackend): readahead=False, **kwargs): try: - import lmdb + import lmdb # NOQA except ImportError: raise ImportError('Please install lmdb to enable LmdbBackend.') self.db_path = str(db_path) - self._client = lmdb.open( - self.db_path, - readonly=readonly, - lock=lock, - readahead=readahead, - **kwargs) + self.readonly = readonly + self.lock = lock + self.readahead = readahead + self.kwargs = kwargs + self._client = None def get(self, filepath): """Get values according to the filepath. @@ -491,14 +494,29 @@ class LmdbBackend(BaseStorageBackend): Args: filepath (str | obj:`Path`): Here, filepath is the lmdb key. """ - filepath = str(filepath) + if self._client is None: + self._client = self._get_client() + with self._client.begin(write=False) as txn: - value_buf = txn.get(filepath.encode('ascii')) + value_buf = txn.get(str(filepath).encode('utf-8')) return value_buf def get_text(self, filepath, encoding=None): raise NotImplementedError + def _get_client(self): + import lmdb + + return lmdb.open( + self.db_path, + readonly=self.readonly, + lock=self.lock, + readahead=self.readahead, + **self.kwargs) + + def __del__(self): + self._client.close() + class HardDiskBackend(BaseStorageBackend): """Raw hard disks storage backend.""" @@ -531,7 +549,7 @@ class HardDiskBackend(BaseStorageBackend): Returns: str: Expected text reading from ``filepath``. """ - with open(filepath, 'r', encoding=encoding) as f: + with open(filepath, encoding=encoding) as f: value_buf = f.read() return value_buf @@ -598,7 +616,7 @@ class HardDiskBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. + ``False`` otherwise. """ return osp.isdir(filepath) @@ -610,7 +628,7 @@ class HardDiskBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. + otherwise. 
""" return osp.isfile(filepath) @@ -631,7 +649,9 @@ class HardDiskBackend(BaseStorageBackend): @contextmanager def get_local_path( - self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: + self, + filepath: Union[str, + Path]) -> Generator[Union[str, Path], None, None]: """Only for unified API and do nothing.""" yield filepath @@ -700,7 +720,8 @@ class HTTPBackend(BaseStorageBackend): return value_buf.decode(encoding) @contextmanager - def get_local_path(self, filepath: str) -> Iterable[str]: + def get_local_path( + self, filepath: str) -> Generator[Union[str, Path], None, None]: """Download a file from ``filepath``. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It @@ -770,19 +791,16 @@ class FileClient: 'petrel': PetrelBackend, 'http': HTTPBackend, } - # This collection is used to record the overridden backends, and when a - # backend appears in the collection, the singleton pattern is disabled for - # that backend, because if the singleton pattern is used, then the object - # returned will be the backend before overwriting - _overridden_backends = set() + _prefix_to_backends = { 's3': PetrelBackend, 'http': HTTPBackend, 'https': HTTPBackend, } - _overridden_prefixes = set() - _instances = {} + _instances: dict = {} + + client: Any def __new__(cls, backend=None, prefix=None, **kwargs): if backend is None and prefix is None: @@ -802,10 +820,7 @@ class FileClient: for key, value in kwargs.items(): arg_key += f':{key}:{value}' - # if a backend was overridden, it will create a new object - if (arg_key in cls._instances - and backend not in cls._overridden_backends - and prefix not in cls._overridden_prefixes): + if arg_key in cls._instances: _instance = cls._instances[arg_key] else: # create a new object and put it to _instance @@ -839,8 +854,8 @@ class FileClient: 's3' Returns: - str | None: Return the prefix of uri if the uri contains '://' - else ``None``. + str | None: Return the prefix of uri if the uri contains '://' else + ``None``. """ assert is_filepath(uri) uri = str(uri) @@ -899,7 +914,9 @@ class FileClient: 'add "force=True" if you want to override it') if name in cls._backends and force: - cls._overridden_backends.add(name) + for arg_key, instance in list(cls._instances.items()): + if isinstance(instance.client, cls._backends[name]): + cls._instances.pop(arg_key) cls._backends[name] = backend if prefixes is not None: @@ -911,7 +928,12 @@ class FileClient: if prefix not in cls._prefix_to_backends: cls._prefix_to_backends[prefix] = backend elif (prefix in cls._prefix_to_backends) and force: - cls._overridden_prefixes.add(prefix) + overridden_backend = cls._prefix_to_backends[prefix] + if isinstance(overridden_backend, list): + overridden_backend = tuple(overridden_backend) + for arg_key, instance in list(cls._instances.items()): + if isinstance(instance.client, overridden_backend): + cls._instances.pop(arg_key) cls._prefix_to_backends[prefix] = backend else: raise KeyError( @@ -987,7 +1009,7 @@ class FileClient: Returns: bytes | memoryview: Expected bytes object or a memory view of the - bytes object. + bytes object. """ return self.client.get(filepath) @@ -1060,7 +1082,7 @@ class FileClient: Returns: bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. + ``False`` otherwise. """ return self.client.isdir(filepath) @@ -1072,7 +1094,7 @@ class FileClient: Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. + otherwise. 
""" return self.client.isfile(filepath) @@ -1092,7 +1114,10 @@ class FileClient: return self.client.join_path(filepath, *filepaths) @contextmanager - def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + def get_local_path( + self, + filepath: Union[str, + Path]) -> Generator[Union[str, Path], None, None]: """Download data from ``filepath`` and write the data to local path. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It diff --git a/mmcv/fileio/handlers/base.py b/mmcv/fileio/handlers/base.py index 288878bc57282fbb2f12b32290152ca8e9d3cab0..0c9cc15b67cbf7d320c2b9c6cbd441a5d5adf235 100644 --- a/mmcv/fileio/handlers/base.py +++ b/mmcv/fileio/handlers/base.py @@ -21,10 +21,10 @@ class BaseFileHandler(metaclass=ABCMeta): def dump_to_str(self, obj, **kwargs): pass - def load_from_path(self, filepath, mode='r', **kwargs): + def load_from_path(self, filepath: str, mode: str = 'r', **kwargs): with open(filepath, mode) as f: return self.load_from_fileobj(f, **kwargs) - def dump_to_path(self, obj, filepath, mode='w', **kwargs): + def dump_to_path(self, obj, filepath: str, mode: str = 'w', **kwargs): with open(filepath, mode) as f: self.dump_to_fileobj(obj, f, **kwargs) diff --git a/mmcv/fileio/handlers/pickle_handler.py b/mmcv/fileio/handlers/pickle_handler.py index b37c79bed4ef9fd8913715e62dbe3fc5cafdc3aa..073856fd25a731b42f3cd19269ad95744b20598f 100644 --- a/mmcv/fileio/handlers/pickle_handler.py +++ b/mmcv/fileio/handlers/pickle_handler.py @@ -12,8 +12,7 @@ class PickleHandler(BaseFileHandler): return pickle.load(file, **kwargs) def load_from_path(self, filepath, **kwargs): - return super(PickleHandler, self).load_from_path( - filepath, mode='rb', **kwargs) + return super().load_from_path(filepath, mode='rb', **kwargs) def dump_to_str(self, obj, **kwargs): kwargs.setdefault('protocol', 2) @@ -24,5 +23,4 @@ class PickleHandler(BaseFileHandler): pickle.dump(obj, file, **kwargs) def dump_to_path(self, obj, filepath, **kwargs): - super(PickleHandler, self).dump_to_path( - obj, filepath, mode='wb', **kwargs) + super().dump_to_path(obj, filepath, mode='wb', **kwargs) diff --git a/mmcv/fileio/handlers/yaml_handler.py b/mmcv/fileio/handlers/yaml_handler.py index c5aa2eea1e8c76f8baf753d1c8c959dee665e543..1c1b077943d634b3ddcf5ee470855179b8308e9c 100644 --- a/mmcv/fileio/handlers/yaml_handler.py +++ b/mmcv/fileio/handlers/yaml_handler.py @@ -2,9 +2,10 @@ import yaml try: - from yaml import CLoader as Loader, CDumper as Dumper + from yaml import CDumper as Dumper + from yaml import CLoader as Loader except ImportError: - from yaml import Loader, Dumper + from yaml import Loader, Dumper # type: ignore from .base import BaseFileHandler # isort:skip diff --git a/mmcv/fileio/io.py b/mmcv/fileio/io.py index aaefde58aa3ea5b58f86249ce7e1c40c186eb8dd..91192103cf331e8ceb970d6f1f5ac050137c0871 100644 --- a/mmcv/fileio/io.py +++ b/mmcv/fileio/io.py @@ -1,11 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from io import BytesIO, StringIO from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, TextIO, Union -from ..utils import is_list_of, is_str +from ..utils import is_list_of from .file_client import FileClient from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler +FileLikeObject = Union[TextIO, StringIO, BytesIO] + file_handlers = { 'json': JsonHandler(), 'yaml': YamlHandler(), @@ -15,7 +18,10 @@ file_handlers = { } -def load(file, file_format=None, file_client_args=None, **kwargs): +def load(file: Union[str, Path, FileLikeObject], + file_format: Optional[str] = None, + file_client_args: Optional[Dict] = None, + **kwargs): """Load data from json/yaml/pickle files. This method provides a unified api for loading data from serialized files. @@ -45,13 +51,14 @@ def load(file, file_format=None, file_client_args=None, **kwargs): """ if isinstance(file, Path): file = str(file) - if file_format is None and is_str(file): + if file_format is None and isinstance(file, str): file_format = file.split('.')[-1] if file_format not in file_handlers: raise TypeError(f'Unsupported format: {file_format}') handler = file_handlers[file_format] - if is_str(file): + f: FileLikeObject + if isinstance(file, str): file_client = FileClient.infer_client(file_client_args, file) if handler.str_like: with StringIO(file_client.get_text(file)) as f: @@ -66,7 +73,11 @@ def load(file, file_format=None, file_client_args=None, **kwargs): return obj -def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): +def dump(obj: Any, + file: Optional[Union[str, Path, FileLikeObject]] = None, + file_format: Optional[str] = None, + file_client_args: Optional[Dict] = None, + **kwargs): """Dump data to json/yaml/pickle strings or files. This method provides a unified api for dumping data as strings or to files, @@ -96,18 +107,18 @@ def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): if isinstance(file, Path): file = str(file) if file_format is None: - if is_str(file): + if isinstance(file, str): file_format = file.split('.')[-1] elif file is None: raise ValueError( 'file_format must be specified since file is None') if file_format not in file_handlers: raise TypeError(f'Unsupported format: {file_format}') - + f: FileLikeObject handler = file_handlers[file_format] if file is None: return handler.dump_to_str(obj, **kwargs) - elif is_str(file): + elif isinstance(file, str): file_client = FileClient.infer_client(file_client_args, file) if handler.str_like: with StringIO() as f: @@ -123,7 +134,8 @@ def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): raise TypeError('"file" must be a filename str or a file-object') -def _register_handler(handler, file_formats): +def _register_handler(handler: BaseFileHandler, + file_formats: Union[str, List[str]]) -> None: """Register a handler for some file extensions. Args: @@ -142,7 +154,7 @@ def _register_handler(handler, file_formats): file_handlers[ext] = handler -def register_handler(file_formats, **kwargs): +def register_handler(file_formats: Union[str, list], **kwargs) -> Callable: def wrap(cls): _register_handler(cls(**kwargs), file_formats) diff --git a/mmcv/fileio/parse.py b/mmcv/fileio/parse.py index f60f0d611b8d75692221d0edd7dc993b0a6445c9..f28e59119325a1bb68b38dd884c59b68dbed6508 100644 --- a/mmcv/fileio/parse.py +++ b/mmcv/fileio/parse.py @@ -1,16 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from io import StringIO +from pathlib import Path +from typing import Dict, List, Optional, Union from .file_client import FileClient -def list_from_file(filename, - prefix='', - offset=0, - max_num=0, - encoding='utf-8', - file_client_args=None): +def list_from_file(filename: Union[str, Path], + prefix: str = '', + offset: int = 0, + max_num: int = 0, + encoding: str = 'utf-8', + file_client_args: Optional[Dict] = None) -> List: """Load a text file and parse the content as a list of strings. Note: @@ -52,10 +54,10 @@ def list_from_file(filename, return item_list -def dict_from_file(filename, - key_type=str, - encoding='utf-8', - file_client_args=None): +def dict_from_file(filename: Union[str, Path], + key_type: type = str, + encoding: str = 'utf-8', + file_client_args: Optional[Dict] = None) -> Dict: """Load a text file and parse the content as a dict. Each line of the text file will be two or more columns split by diff --git a/mmcv/image/__init__.py b/mmcv/image/__init__.py index d0051d609d3de4e7562e3fe638335c66617c4d91..92ecec4046a6f5ee25b4ea07215ed7c7c810dcfa 100644 --- a/mmcv/image/__init__.py +++ b/mmcv/image/__init__.py @@ -9,10 +9,10 @@ from .geometric import (cutout, imcrop, imflip, imflip_, impad, from .io import imfrombytes, imread, imwrite, supported_backends, use_backend from .misc import tensor2imgs from .photometric import (adjust_brightness, adjust_color, adjust_contrast, - adjust_lighting, adjust_sharpness, auto_contrast, - clahe, imdenormalize, imequalize, iminvert, - imnormalize, imnormalize_, lut_transform, posterize, - solarize) + adjust_hue, adjust_lighting, adjust_sharpness, + auto_contrast, clahe, imdenormalize, imequalize, + iminvert, imnormalize, imnormalize_, lut_transform, + posterize, solarize) __all__ = [ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', @@ -24,5 +24,6 @@ __all__ = [ 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', - 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' + 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting', + 'adjust_hue' ] diff --git a/mmcv/image/colorspace.py b/mmcv/image/colorspace.py index 814533952fdfda23d67cb6a3073692d8c1156add..08f9952408c8e0bb38b17c10e2089e900ed418c2 100644 --- a/mmcv/image/colorspace.py +++ b/mmcv/image/colorspace.py @@ -1,9 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Callable, Union + import cv2 import numpy as np -def imconvert(img, src, dst): +def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray: """Convert an image from the src colorspace to dst colorspace. Args: @@ -19,7 +21,7 @@ def imconvert(img, src, dst): return out_img -def bgr2gray(img, keepdim=False): +def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: """Convert a BGR image to grayscale image. Args: @@ -36,7 +38,7 @@ def bgr2gray(img, keepdim=False): return out_img -def rgb2gray(img, keepdim=False): +def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: """Convert a RGB image to grayscale image. Args: @@ -53,7 +55,7 @@ def rgb2gray(img, keepdim=False): return out_img -def gray2bgr(img): +def gray2bgr(img: np.ndarray) -> np.ndarray: """Convert a grayscale image to BGR image. Args: @@ -67,7 +69,7 @@ def gray2bgr(img): return out_img -def gray2rgb(img): +def gray2rgb(img: np.ndarray) -> np.ndarray: """Convert a grayscale image to RGB image. 
Args: @@ -81,7 +83,7 @@ def gray2rgb(img): return out_img -def _convert_input_type_range(img): +def _convert_input_type_range(img: np.ndarray) -> np.ndarray: """Convert the type and range of the input image. It converts the input image to np.float32 type and range of [0, 1]. @@ -109,7 +111,8 @@ def _convert_input_type_range(img): return img -def _convert_output_type_range(img, dst_type): +def _convert_output_type_range( + img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray: """Convert the type and range of the image according to dst_type. It converts the image to desired type and range. If `dst_type` is np.uint8, @@ -140,7 +143,7 @@ def _convert_output_type_range(img, dst_type): return img.astype(dst_type) -def rgb2ycbcr(img, y_only=False): +def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: """Convert a RGB image to YCbCr image. This function produces the same results as Matlab's `rgb2ycbcr` function. @@ -160,7 +163,7 @@ def rgb2ycbcr(img, y_only=False): Returns: ndarray: The converted YCbCr image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) @@ -174,7 +177,7 @@ def rgb2ycbcr(img, y_only=False): return out_img -def bgr2ycbcr(img, y_only=False): +def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: """Convert a BGR image to YCbCr image. The bgr version of rgb2ycbcr. @@ -194,7 +197,7 @@ def bgr2ycbcr(img, y_only=False): Returns: ndarray: The converted YCbCr image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) @@ -208,7 +211,7 @@ def bgr2ycbcr(img, y_only=False): return out_img -def ycbcr2rgb(img): +def ycbcr2rgb(img: np.ndarray) -> np.ndarray: """Convert a YCbCr image to RGB image. This function produces the same results as Matlab's ycbcr2rgb function. @@ -227,7 +230,7 @@ def ycbcr2rgb(img): Returns: ndarray: The converted RGB image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 @@ -240,7 +243,7 @@ def ycbcr2rgb(img): return out_img -def ycbcr2bgr(img): +def ycbcr2bgr(img: np.ndarray) -> np.ndarray: """Convert a YCbCr image to BGR image. The bgr version of ycbcr2rgb. @@ -259,7 +262,7 @@ def ycbcr2bgr(img): Returns: ndarray: The converted BGR image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 @@ -272,11 +275,11 @@ def ycbcr2bgr(img): return out_img -def convert_color_factory(src, dst): +def convert_color_factory(src: str, dst: str) -> Callable: code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') - def convert_color(img): + def convert_color(img: np.ndarray) -> np.ndarray: out_img = cv2.cvtColor(img, code) return out_img diff --git a/mmcv/image/geometric.py b/mmcv/image/geometric.py index cf97c201cb4e43796c911919d03fb26a07ed817d..eecd795ea08127055cd8e90eb11c5e51fe586c18 100644 --- a/mmcv/image/geometric.py +++ b/mmcv/image/geometric.py @@ -37,15 +37,27 @@ cv2_interp_codes = { 'lanczos': cv2.INTER_LANCZOS4 } +# Pillow >=v9.1.0 use a slightly different naming scheme for filters. +# Set pillow_interp_codes according to the naming scheme used. 
if Image is not None: - pillow_interp_codes = { - 'nearest': Image.NEAREST, - 'bilinear': Image.BILINEAR, - 'bicubic': Image.BICUBIC, - 'box': Image.BOX, - 'lanczos': Image.LANCZOS, - 'hamming': Image.HAMMING - } + if hasattr(Image, 'Resampling'): + pillow_interp_codes = { + 'nearest': Image.Resampling.NEAREST, + 'bilinear': Image.Resampling.BILINEAR, + 'bicubic': Image.Resampling.BICUBIC, + 'box': Image.Resampling.BOX, + 'lanczos': Image.Resampling.LANCZOS, + 'hamming': Image.Resampling.HAMMING + } + else: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } def imresize(img, @@ -70,7 +82,7 @@ def imresize(img, Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = img.shape[:2] if backend is None: @@ -130,7 +142,7 @@ def imresize_to_multiple(img, Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = img.shape[:2] if size is not None and scale_factor is not None: @@ -145,7 +157,7 @@ def imresize_to_multiple(img, size = _scale_size((w, h), scale_factor) divisor = to_2tuple(divisor) - size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) + size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor)) resized_img, w_scale, h_scale = imresize( img, size, @@ -175,7 +187,7 @@ def imresize_like(img, Returns: tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = dst_img.shape[:2] return imresize(img, (w, h), return_scale, interpolation, backend=backend) @@ -460,18 +472,17 @@ def impad(img, areas when padding_mode is 'constant'. Default: 0. padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default: constant. - - constant: pads with a constant value, this value is specified - with pad_val. + with pad_val. - edge: pads with the last value at the edge of the image. - - reflect: pads with reflection of image without repeating the - last value on the edge. For example, padding [1, 2, 3, 4] - with 2 elements on both sides in reflect mode will result - in [3, 2, 1, 2, 3, 4, 3, 2]. - - symmetric: pads with reflection of image repeating the last - value on the edge. For example, padding [1, 2, 3, 4] with - 2 elements on both sides in symmetric mode will result in - [2, 1, 1, 2, 3, 4, 4, 3] + - reflect: pads with reflection of image without repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with 2 + elements on both sides in reflect mode will result in + [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last value + on the edge. For example, padding [1, 2, 3, 4] with 2 elements on + both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] Returns: ndarray: The padded image. @@ -479,7 +490,9 @@ def impad(img, assert (shape is not None) ^ (padding is not None) if shape is not None: - padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) + width = max(shape[1] - img.shape[1], 0) + height = max(shape[0] - img.shape[0], 0) + padding = (0, 0, width, height) # check pad_val if isinstance(pad_val, tuple): diff --git a/mmcv/image/io.py b/mmcv/image/io.py index d47aaa845256e4e991582a939733c45d62a4de38..ae81b561a84cccfa4923364679dce56d762db1bc 100644 --- a/mmcv/image/io.py +++ b/mmcv/image/io.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import io import os.path as osp +import warnings from pathlib import Path import cv2 @@ -8,7 +9,8 @@ import numpy as np from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, IMREAD_UNCHANGED) -from mmcv.utils import check_file_exist, is_str, mkdir_or_exist +from mmcv.fileio import FileClient +from mmcv.utils import is_filepath, is_str try: from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG @@ -137,9 +139,16 @@ def _pillow2array(img, flag='color', channel_order='bgr'): return array -def imread(img_or_path, flag='color', channel_order='bgr', backend=None): +def imread(img_or_path, + flag='color', + channel_order='bgr', + backend=None, + file_client_args=None): """Read an image. + Note: + In v1.4.1 and later, add `file_client_args` parameters. + Args: img_or_path (ndarray or str or Path): Either a numpy array or str or pathlib.Path. If it is a numpy array (loaded image), then @@ -157,44 +166,42 @@ def imread(img_or_path, flag='color', channel_order='bgr', backend=None): `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. + file_client_args (dict | None): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. Returns: ndarray: Loaded image array. + + Examples: + >>> import mmcv + >>> img_path = '/path/to/img.jpg' + >>> img = mmcv.imread(img_path) + >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb', + ... backend='cv2') + >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr', + ... backend='pillow') + >>> s3_img_path = 's3://bucket/img.jpg' + >>> # infer the file backend by the prefix s3 + >>> img = mmcv.imread(s3_img_path) + >>> # manually set the file backend petrel + >>> img = mmcv.imread(s3_img_path, file_client_args={ + ... 'backend': 'petrel'}) + >>> http_img_path = 'http://path/to/img.jpg' + >>> img = mmcv.imread(http_img_path) + >>> img = mmcv.imread(http_img_path, file_client_args={ + ... 'backend': 'http'}) """ - if backend is None: - backend = imread_backend - if backend not in supported_backends: - raise ValueError(f'backend: {backend} is not supported. Supported ' - "backends are 'cv2', 'turbojpeg', 'pillow'") if isinstance(img_or_path, Path): img_or_path = str(img_or_path) if isinstance(img_or_path, np.ndarray): return img_or_path elif is_str(img_or_path): - check_file_exist(img_or_path, - f'img file does not exist: {img_or_path}') - if backend == 'turbojpeg': - with open(img_or_path, 'rb') as in_file: - img = jpeg.decode(in_file.read(), - _jpegflag(flag, channel_order)) - if img.shape[-1] == 1: - img = img[:, :, 0] - return img - elif backend == 'pillow': - img = Image.open(img_or_path) - img = _pillow2array(img, flag, channel_order) - return img - elif backend == 'tifffile': - img = tifffile.imread(img_or_path) - return img - else: - flag = imread_flags[flag] if is_str(flag) else flag - img = cv2.imread(img_or_path, flag) - if flag == IMREAD_COLOR and channel_order == 'rgb': - cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) - return img + file_client = FileClient.infer_client(file_client_args, img_or_path) + img_bytes = file_client.get(img_or_path) + return imfrombytes(img_bytes, flag, channel_order, backend) else: raise TypeError('"img" must be a numpy array or a str or ' 'a pathlib.Path object') @@ -206,29 +213,45 @@ def imfrombytes(content, flag='color', channel_order='bgr', backend=None): Args: content (bytes): Image bytes got from files or other streams. 
flag (str): Same as :func:`imread`. + channel_order (str): The channel order of the output, candidates + are 'bgr' and 'rgb'. Default to 'bgr'. backend (str | None): The image decoding backend type. Options are - `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the - global imread_backend specified by ``mmcv.use_backend()`` will be - used. Default: None. + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is + None, the global imread_backend specified by ``mmcv.use_backend()`` + will be used. Default: None. Returns: ndarray: Loaded image array. + + Examples: + >>> img_path = '/path/to/img.jpg' + >>> with open(img_path, 'rb') as f: + >>> img_buff = f.read() + >>> img = mmcv.imfrombytes(img_buff) + >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb') + >>> img = mmcv.imfrombytes(img_buff, backend='pillow') + >>> img = mmcv.imfrombytes(img_buff, backend='cv2') """ if backend is None: backend = imread_backend if backend not in supported_backends: - raise ValueError(f'backend: {backend} is not supported. Supported ' - "backends are 'cv2', 'turbojpeg', 'pillow'") + raise ValueError( + f'backend: {backend} is not supported. Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'") if backend == 'turbojpeg': img = jpeg.decode(content, _jpegflag(flag, channel_order)) if img.shape[-1] == 1: img = img[:, :, 0] return img elif backend == 'pillow': - buff = io.BytesIO(content) - img = Image.open(buff) - img = _pillow2array(img, flag, channel_order) + with io.BytesIO(content) as buff: + img = Image.open(buff) + img = _pillow2array(img, flag, channel_order) + return img + elif backend == 'tifffile': + with io.BytesIO(content) as buff: + img = tifffile.imread(buff) return img else: img_np = np.frombuffer(content, np.uint8) @@ -239,20 +262,53 @@ def imfrombytes(content, flag='color', channel_order='bgr', backend=None): return img -def imwrite(img, file_path, params=None, auto_mkdir=True): +def imwrite(img, + file_path, + params=None, + auto_mkdir=None, + file_client_args=None): """Write image to file. + Note: + In v1.4.1 and later, add `file_client_args` parameters. + + Warning: + The parameter `auto_mkdir` will be deprecated in the future and every + file clients will make directory automatically. + Args: img (ndarray): Image array to be written. file_path (str): Image file path. params (None or list): Same as opencv :func:`imwrite` interface. auto_mkdir (bool): If the parent folder of `file_path` does not exist, - whether to create it automatically. + whether to create it automatically. It will be deprecated. + file_client_args (dict | None): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. Returns: bool: Successful or not. + + Examples: + >>> # write to hard disk client + >>> ret = mmcv.imwrite(img, '/path/to/img.jpg') + >>> # infer the file backend by the prefix s3 + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg') + >>> # manually set the file backend petrel + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', file_client_args={ + ... 
'backend': 'petrel'}) """ - if auto_mkdir: - dir_name = osp.abspath(osp.dirname(file_path)) - mkdir_or_exist(dir_name) - return cv2.imwrite(file_path, img, params) + assert is_filepath(file_path) + file_path = str(file_path) + if auto_mkdir is not None: + warnings.warn( + 'The parameter `auto_mkdir` will be deprecated in the future and ' + 'every file clients will make directory automatically.') + file_client = FileClient.infer_client(file_client_args, file_path) + img_ext = osp.splitext(file_path)[-1] + # Encode image according to image suffix. + # For example, if image path is '/path/your/img.jpg', the encode + # format is '.jpg'. + flag, img_buff = cv2.imencode(img_ext, img, params) + file_client.put(img_buff.tobytes(), file_path) + return flag diff --git a/mmcv/image/misc.py b/mmcv/image/misc.py index dfc4a9c6e4c073a672a9a94a06bf0bf2a418c228..43934a689dd7ac6d35b772b7ce9921ff3b1fff50 100644 --- a/mmcv/image/misc.py +++ b/mmcv/image/misc.py @@ -9,18 +9,21 @@ except ImportError: torch = None -def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): - """Convert tensor to 3-channel images. +def tensor2imgs(tensor, mean=None, std=None, to_rgb=True): + """Convert tensor to 3-channel images or 1-channel gray images. Args: tensor (torch.Tensor): Tensor that contains multiple images, shape ( - N, C, H, W). - mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0). - std (tuple[float], optional): Standard deviation of images. - Defaults to (1, 1, 1). + N, C, H, W). :math:`C` can be either 3 or 1. + mean (tuple[float], optional): Mean of images. If None, + (0, 0, 0) will be used for tensor with 3-channel, + while (0, ) for tensor with 1-channel. Defaults to None. + std (tuple[float], optional): Standard deviation of images. If None, + (1, 1, 1) will be used for tensor with 3-channel, + while (1, ) for tensor with 1-channel. Defaults to None. to_rgb (bool, optional): Whether the tensor was converted to RGB format in the first place. If so, convert it back to BGR. - Defaults to True. + For the tensor with 1 channel, it must be False. Defaults to True. Returns: list[np.ndarray]: A list that contains multiple images. @@ -29,8 +32,14 @@ def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): if torch is None: raise RuntimeError('pytorch is not installed') assert torch.is_tensor(tensor) and tensor.ndim == 4 - assert len(mean) == 3 - assert len(std) == 3 + channels = tensor.size(1) + assert channels in [1, 3] + if mean is None: + mean = (0, ) * channels + if std is None: + std = (1, ) * channels + assert (channels == len(mean) == len(std) == 3) or \ + (channels == len(mean) == len(std) == 1 and not to_rgb) num_imgs = tensor.size(0) mean = np.array(mean, dtype=np.float32) diff --git a/mmcv/image/photometric.py b/mmcv/image/photometric.py index 5085d012019c0cbf56f66f421a378278c1a058ae..b41cea7172ae0ece858d868b73dc65deaea3510c 100644 --- a/mmcv/image/photometric.py +++ b/mmcv/image/photometric.py @@ -426,3 +426,46 @@ def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): clahe = cv2.createCLAHE(clip_limit, tile_grid_size) return clahe.apply(np.array(img, dtype=np.uint8)) + + +def adjust_hue(img: np.ndarray, hue_factor: float) -> np.ndarray: + """Adjust hue of an image. + + The image hue is adjusted by converting the image to HSV and cyclically + shifting the intensities in the hue channel (H). The image is then + converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. 
+ + Modified from + https://github.com/pytorch/vision/blob/main/torchvision/ + transforms/functional.py + + Args: + img (ndarray): Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + + Returns: + ndarray: Hue adjusted image. + """ + + if not (-0.5 <= hue_factor <= 0.5): + raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].') + if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})): + raise TypeError('img should be ndarray with dim=[2 or 3].') + + dtype = img.dtype + img = img.astype(np.uint8) + hsv_img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV_FULL) + h, s, v = cv2.split(hsv_img) + h = h.astype(np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + h += np.uint8(hue_factor * 255) + hsv_img = cv2.merge([h, s, v]) + return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2RGB_FULL).astype(dtype) diff --git a/mmcv/model_zoo/torchvision_0.12.json b/mmcv/model_zoo/torchvision_0.12.json new file mode 100644 index 0000000000000000000000000000000000000000..06defe67484dff91cf6f69109324cb1dd9d64bc3 --- /dev/null +++ b/mmcv/model_zoo/torchvision_0.12.json @@ -0,0 +1,57 @@ +{ + "alexnet": "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth", + "densenet121": "https://download.pytorch.org/models/densenet121-a639ec97.pth", + "densenet169": "https://download.pytorch.org/models/densenet169-b2777c0a.pth", + "densenet201": "https://download.pytorch.org/models/densenet201-c1103571.pth", + "densenet161": "https://download.pytorch.org/models/densenet161-8d451a50.pth", + "efficientnet_b0": "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth", + "efficientnet_b1": "https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth", + "efficientnet_b2": "https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth", + "efficientnet_b3": "https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth", + "efficientnet_b4": "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth", + "efficientnet_b5": "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth", + "efficientnet_b6": "https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth", + "efficientnet_b7": "https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth", + "googlenet": "https://download.pytorch.org/models/googlenet-1378be20.pth", + "inception_v3_google": "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth", + "mobilenet_v2": "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth", + "mobilenet_v3_large": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth", + "mobilenet_v3_small": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth", + "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-c65dace8.pth", + "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth", + "regnet_y_1_6gf": "https://download.pytorch.org/models/regnet_y_1_6gf-b11a554e.pth", + "regnet_y_3_2gf": "https://download.pytorch.org/models/regnet_y_3_2gf-b5a9779c.pth", + "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-d0d0e4a8.pth", + "regnet_y_16gf": 
"https://download.pytorch.org/models/regnet_y_16gf-9e6ed7dd.pth", + "regnet_y_32gf": "https://download.pytorch.org/models/regnet_y_32gf-4dee3f7a.pth", + "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth", + "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-ad17e45c.pth", + "regnet_x_1_6gf": "https://download.pytorch.org/models/regnet_x_1_6gf-e3633e7f.pth", + "regnet_x_3_2gf": "https://download.pytorch.org/models/regnet_x_3_2gf-f342aeae.pth", + "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-03ceed89.pth", + "regnet_x_16gf": "https://download.pytorch.org/models/regnet_x_16gf-2007eb11.pth", + "regnet_x_32gf": "https://download.pytorch.org/models/regnet_x_32gf-9d47f8d0.pth", + "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-0676ba61.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", + "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth", + "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth", + "shufflenetv2_x0.5": "https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth", + "shufflenetv2_x1.0": "https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth", + "shufflenetv2_x1.5": null, + "shufflenetv2_x2.0": null, + "squeezenet1_0": "https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth", + "squeezenet1_1": "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth", + "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth", + "vgg13": "https://download.pytorch.org/models/vgg13-19584684.pth", + "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth", + "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth", + "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth", + "vgg13_bn": "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth", + "vgg16_bn": "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth", + "vgg19_bn": "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth" +} diff --git a/mmcv/onnx/info.py b/mmcv/onnx/info.py index e599973689245ff7c279bed0640842a9f0891750..b8325a9c0d0dc3b48b77e9da307341059017ea28 100644 --- a/mmcv/onnx/info.py +++ b/mmcv/onnx/info.py @@ -1,10 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. import os +import warnings import torch -def is_custom_op_loaded(): +def is_custom_op_loaded() -> bool: + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This function will be deprecated in future. 
' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) + flag = False try: from ..tensorrt import is_tensorrt_plugin_loaded diff --git a/mmcv/onnx/onnx_utils/symbolic_helper.py b/mmcv/onnx/onnx_utils/symbolic_helper.py index a9a31eb4aeb24b6057acf9d4c352ee7e940377dd..cc9e96f8fbbb0cadec23411ddf93b31a90d049d0 100644 --- a/mmcv/onnx/onnx_utils/symbolic_helper.py +++ b/mmcv/onnx/onnx_utils/symbolic_helper.py @@ -59,7 +59,7 @@ def _parse_arg(value, desc): raise RuntimeError( "ONNX symbolic doesn't know to interpret ListConstruct node") - raise RuntimeError('Unexpected node type: {}'.format(value.node().kind())) + raise RuntimeError(f'Unexpected node type: {value.node().kind()}') def _maybe_get_const(value, desc): @@ -328,4 +328,4 @@ cast_pytorch_to_onnx = { # Global set to store the list of quantized operators in the network. # This is currently only used in the conversion of quantized ops from PT # -> C2 via ONNX. -_quantized_ops = set() +_quantized_ops: set = set() diff --git a/mmcv/onnx/symbolic.py b/mmcv/onnx/symbolic.py index 94cc1c620d01c4fa062cc4576fcb591f90923a65..3599b3f26683ea2d1907aa5e839e02e474791370 100644 --- a/mmcv/onnx/symbolic.py +++ b/mmcv/onnx/symbolic.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. """Modified from https://github.com/pytorch/pytorch.""" import os +import warnings import numpy as np import torch @@ -409,8 +410,8 @@ def cummin(g, input, dim): @parse_args('v', 'v', 'is') def roll(g, input, shifts, dims): - from torch.onnx.symbolic_opset9 import squeeze from packaging import version + from torch.onnx.symbolic_opset9 import squeeze input_shape = g.op('Shape', input) need_flatten = len(dims) == 0 @@ -467,6 +468,18 @@ def roll(g, input, shifts, dims): def register_extra_symbolics(opset=11): + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This function will be deprecated in future. ' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) + register_op('one_hot', one_hot, '', opset) register_op('im2col', im2col, '', opset) register_op('topk', topk, '', opset) diff --git a/mmcv/ops/__init__.py b/mmcv/ops/__init__.py old mode 100644 new mode 100755 index 999e090a458ee148ceca0649f1e3806a40e909bd..a65f14fff5f92039947d82a291fca09408f69f87 --- a/mmcv/ops/__init__.py +++ b/mmcv/ops/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .active_rotated_filter import active_rotated_filter from .assign_score_withk import assign_score_withk from .ball_query import ball_query from .bbox import bbox_overlaps @@ -6,7 +7,9 @@ from .border_align import BorderAlign, border_align from .box_iou_rotated import box_iou_rotated from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive from .cc_attention import CrissCrossAttention +from .chamfer_distance import chamfer_distance from .contour_expand import contour_expand +from .convex_iou import convex_giou, convex_iou from .corner_pool import CornerPool from .correlation import Correlation from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d @@ -16,6 +19,7 @@ from .deprecated_wrappers import Conv2d_deprecated as Conv2d from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d from .deprecated_wrappers import Linear_deprecated as Linear from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d +from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, sigmoid_focal_loss, softmax_focal_loss) from .furthest_point_sample import (furthest_point_sample, @@ -25,9 +29,11 @@ from .gather_points import gather_points from .group_points import GroupAll, QueryAndGroup, grouping_operation from .info import (get_compiler_version, get_compiling_cuda_version, get_onnxruntime_op_path) -from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev +from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d, + nms3d_normal, nms_bev, nms_normal_bev) from .knn import knn from .masked_conv import MaskedConv2d, masked_conv2d +from .min_area_polygons import min_area_polygons from .modulated_deform_conv import (ModulatedDeformConv2d, ModulatedDeformConv2dPack, modulated_deform_conv2d) @@ -38,15 +44,25 @@ from .point_sample import (SimpleRoIAlign, point_sample, rel_roi_point_to_rel_img_point) from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, points_in_boxes_part) +from .points_in_polygons import points_in_polygons from .points_sampler import PointsSampler +from .prroi_pool import PrRoIPool, prroi_pool from .psa_mask import PSAMask +from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated from .roi_align import RoIAlign, roi_align from .roi_align_rotated import RoIAlignRotated, roi_align_rotated from .roi_pool import RoIPool, roi_pool from .roiaware_pool3d import RoIAwarePool3d from .roipoint_pool3d import RoIPointPool3d +from .rotated_feature_align import rotated_feature_align from .saconv import SAConv2d from .scatter_points import DynamicScatter, dynamic_scatter +from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, + SparseConvTranspose3d, SparseInverseConv2d, + SparseInverseConv3d, SubMConv2d, SubMConv3d) +from .sparse_modules import SparseModule, SparseSequential +from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d +from .sparse_structure import SparseConvTensor, scatter_nd from .sync_bn import SyncBatchNorm from .three_interpolate import three_interpolate from .three_nn import three_nn @@ -70,12 +86,21 @@ __all__ = [ 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', + 'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', 
'contour_expand', 'three_nn', 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', 'gather_points', 'furthest_point_sample', 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', - 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization', - 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', - 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all' + 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev', + 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization', + 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d', + 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', + 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d', + 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', + 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part', + 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons', + 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou', + 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance', + 'PrRoIPool', 'prroi_pool' ] diff --git a/mmcv/ops/active_rotated_filter.py b/mmcv/ops/active_rotated_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..46c2aa7806ab62a6d0544f6dc1fb609af3a8a483 --- /dev/null +++ b/mmcv/ops/active_rotated_filter.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', + ['active_rotated_filter_forward', 'active_rotated_filter_backward']) + + +class ActiveRotatedFilterFunction(Function): + """Encoding the orientation information and generating orientation- + sensitive features. + + The details are described in the paper `Align Deep Features for Oriented + Object Detection _`. + """ + + @staticmethod + def forward(ctx, input: torch.Tensor, + indices: torch.Tensor) -> torch.Tensor: + """ + Args: + input (torch.Tensor): Input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. + indices (torch.Tensor): Indices with shape + [num_orientations, H, W, num_rotations]. + + Returns: + torch.Tensor: Refined features with shape [num_output_planes * + num_rotations, num_input_planes * num_orientations, H, W]. + """ + ctx.save_for_backward(input, indices) + op, ip, o, h, w = input.size() + o, h, w, r = indices.size() + output = input.new_zeros((op * r, ip * o, h, w)) + ext_module.active_rotated_filter_forward(input, indices, output) + + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]: + """ + Args: + grad_output (torch.Tensor): The gradiant of output features + with shape [num_output_planes * num_rotations, + num_input_planes * num_orientations, H, W]. + + Returns: + torch.Tensor: The gradiant of input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. 
+ """ + input, indices = ctx.saved_tensors + grad_in = torch.zeros_like(input) + ext_module.active_rotated_filter_backward(grad_out, indices, grad_in) + return grad_in, None + + +active_rotated_filter = ActiveRotatedFilterFunction.apply diff --git a/mmcv/ops/assign_score_withk.py b/mmcv/ops/assign_score_withk.py index 4906adaa2cffd1b46912fbe7d4f87ef2f9fa0012..deca0892bddc52b51e9d2543a9e893f0bd67ebdb 100644 --- a/mmcv/ops/assign_score_withk.py +++ b/mmcv/ops/assign_score_withk.py @@ -1,3 +1,6 @@ +from typing import Tuple + +import torch from torch.autograd import Function from ..utils import ext_loader @@ -27,11 +30,11 @@ class AssignScoreWithK(Function): @staticmethod def forward(ctx, - scores, - point_features, - center_features, - knn_idx, - aggregate='sum'): + scores: torch.Tensor, + point_features: torch.Tensor, + center_features: torch.Tensor, + knn_idx: torch.Tensor, + aggregate: str = 'sum') -> torch.Tensor: """ Args: scores (torch.Tensor): (B, npoint, K, M), predicted scores to @@ -78,15 +81,20 @@ class AssignScoreWithK(Function): return output @staticmethod - def backward(ctx, grad_out): + def backward( + ctx, grad_out: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]: """ Args: grad_out (torch.Tensor): (B, out_dim, npoint, K) Returns: - grad_scores (torch.Tensor): (B, npoint, K, M) - grad_point_features (torch.Tensor): (B, N, M, out_dim) - grad_center_features (torch.Tensor): (B, N, M, out_dim) + tuple[torch.Tensor]: A tuple contains five elements. The first one + is the gradient of ``scores`` whose shape is (B, npoint, K, M). The + second is the gradient of ``point_features`` whose shape is + (B, N, M, out_dim). The third is the gradient of + ``center_features`` with the shape of (B, N, M, out_dim). The last + two are ``None``. """ _, point_features, center_features, scores, knn_idx = ctx.saved_tensors diff --git a/mmcv/ops/ball_query.py b/mmcv/ops/ball_query.py index d0466847c6e5c1239e359a0397568413ebc1504a..d24e0446ca81a19a9e2d4b822cb32533f941d78f 100644 --- a/mmcv/ops/ball_query.py +++ b/mmcv/ops/ball_query.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch from torch.autograd import Function @@ -18,12 +20,13 @@ class BallQuery(Function): min_radius (float): minimum radius of the balls. max_radius (float): maximum radius of the balls. sample_num (int): maximum number of features in the balls. - xyz (Tensor): (B, N, 3) xyz coordinates of the features. - center_xyz (Tensor): (B, npoint, 3) centers of the ball query. + xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features. + center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball + query. Returns: - Tensor: (B, npoint, nsample) tensor with the indices of - the features that form the query balls. + torch.Tensor: (B, npoint, nsample) tensor with the indices of the + features that form the query balls. """ assert center_xyz.is_contiguous() assert xyz.is_contiguous() @@ -48,7 +51,7 @@ class BallQuery(Function): return idx @staticmethod - def backward(ctx, a=None): + def backward(ctx, a=None) -> Tuple[None, None, None, None]: return None, None, None, None diff --git a/mmcv/ops/bbox.py b/mmcv/ops/bbox.py index 0c4d58b6c91f652933974f519acd3403a833e906..bf6bd43bbb0adcb4b6d104a815f73ed2e5912069 100644 --- a/mmcv/ops/bbox.py +++ b/mmcv/ops/bbox.py @@ -1,10 +1,57 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import torch + from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps']) -def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): +def _bbox_overlaps_cpu(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + offset: int = 0) -> torch.Tensor: + assert mode in ['iou', 'iof'] + + if aligned: + lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2] + rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2] + + wh = (rb - lt + offset).clamp(min=0) # [rows, 2] + overlap = wh[:, 0] * wh[:, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( + bboxes1[:, 3] - bboxes1[:, 1] + offset) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( + bboxes2[:, 3] - bboxes2[:, 1] + offset) + ious = overlap / (area1 + area2 - overlap) + else: + ious = overlap / area1 + else: + lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2] + rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2] + + wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2] + overlap = wh[:, :, 0] * wh[:, :, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( + bboxes1[:, 3] - bboxes1[:, 1] + offset) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( + bboxes2[:, 3] - bboxes2[:, 1] + offset) + ious = overlap / (area1[:, None] + area2 - overlap) + else: + ious = overlap / (area1[:, None]) + + return ious + + +def bbox_overlaps(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + offset: int = 0) -> torch.Tensor: """Calculate overlap between two set of bboxes. If ``aligned`` is ``False``, then calculate the ious between each bbox @@ -12,14 +59,16 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): bboxes1 and bboxes2. Args: - bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty. - bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty. - If aligned is ``True``, then m and n must be equal. + bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or + empty. + bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or + empty. If aligned is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or iof (intersection over foreground). Returns: - ious(Tensor): shape (m, n) if aligned == False else shape (m, 1) + torch.Tensor: Return the ious between boxes. If ``aligned`` is + ``False``, the shape of ious is (m, n) else (m, 1).
Example: >>> bboxes1 = torch.FloatTensor([ @@ -63,10 +112,19 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): if rows * cols == 0: return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols) - if aligned: - ious = bboxes1.new_zeros(rows) + if bboxes1.device.type == 'cpu': + return _bbox_overlaps_cpu( + bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset) else: - ious = bboxes1.new_zeros((rows, cols)) - ext_module.bbox_overlaps( - bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) - return ious + if aligned: + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros((rows, cols)) + ext_module.bbox_overlaps( + bboxes1, + bboxes2, + ious, + mode=mode_flag, + aligned=aligned, + offset=offset) + return ious diff --git a/mmcv/ops/border_align.py b/mmcv/ops/border_align.py index ff305be328e9b0a15e1bbb5e6b41beb940f55c81..c09501b962cfce10b1da87e6b651d61911eb8406 100644 --- a/mmcv/ops/border_align.py +++ b/mmcv/ops/border_align.py @@ -2,6 +2,8 @@ # modified from # https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py +from typing import Tuple + import torch import torch.nn as nn from torch.autograd import Function @@ -21,7 +23,8 @@ class BorderAlignFunction(Function): 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) @staticmethod - def forward(ctx, input, boxes, pool_size): + def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, + pool_size: int) -> torch.Tensor: ctx.pool_size = pool_size ctx.input_shape = input.size() @@ -45,7 +48,8 @@ class BorderAlignFunction(Function): @staticmethod @once_differentiable - def backward(ctx, grad_output): + def backward(ctx, + grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]: boxes, argmax_idx = ctx.saved_tensors grad_input = grad_output.new_zeros(ctx.input_shape) # complex head architecture may cause grad_output uncontiguous @@ -72,24 +76,25 @@ class BorderAlign(nn.Module): For each border line (e.g. top, left, bottom or right) of each box, border_align does the following: - 1. uniformly samples `pool_size`+1 positions on this line, involving \ - the start and end points. - 2. the corresponding features on these points are computed by \ - bilinear interpolation. - 3. max pooling over all the `pool_size`+1 positions are used for \ - computing pooled feature. + + 1. uniformly samples ``pool_size`` +1 positions on this line, involving + the start and end points. + 2. the corresponding features on these points are computed by bilinear + interpolation. + 3. max pooling over all the ``pool_size`` +1 positions are used for + computing pooled feature. Args: pool_size (int): number of positions sampled over the boxes' borders (e.g. top, bottom, left, right). - """ - def __init__(self, pool_size): - super(BorderAlign, self).__init__() + def __init__(self, pool_size: int): + super().__init__() self.pool_size = pool_size - def forward(self, input, boxes): + def forward(self, input: torch.Tensor, + boxes: torch.Tensor) -> torch.Tensor: """ Args: input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), @@ -98,8 +103,8 @@ class BorderAlign(nn.Module): boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). Returns: - Tensor: Pooled features with shape [N,C,H*W,4]. The order is - (top,left,bottom,right) for the last dimension. + torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is + (top,left,bottom,right) for the last dimension. 
""" return border_align(input, boxes, self.pool_size) diff --git a/mmcv/ops/box_iou_rotated.py b/mmcv/ops/box_iou_rotated.py index 2d78015e9c2a9e7a52859b4e18f84a9aa63481a0..2443af27c92146ed4328e8f94b1415c7e72c542b 100644 --- a/mmcv/ops/box_iou_rotated.py +++ b/mmcv/ops/box_iou_rotated.py @@ -1,10 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. +import torch + from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) -def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): +def box_iou_rotated(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + clockwise: bool = True) -> torch.Tensor: """Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in @@ -14,18 +20,110 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. - Arguments: - boxes1 (Tensor): rotated bboxes 1. \ - It has shape (N, 5), indicating (x, y, w, h, theta) for each row. - Note that theta is in radian. - boxes2 (Tensor): rotated bboxes 2. \ - It has shape (M, 5), indicating (x, y, w, h, theta) for each row. - Note that theta is in radian. + .. note:: + The operator assumes: + + 1) The positive direction along x axis is left -> right. + + 2) The positive direction along y axis is top -> down. + + 3) The w border is in parallel with x axis when angle = 0. + + However, there are 2 opposite definitions of the positive angular + direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports + both definitions and uses CW by default. + + Please set ``clockwise=False`` if you are using the CCW definition. + + The coordinate system when ``clockwise`` is ``True`` (default) + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha + \\\\ + y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + + The coordinate system when ``clockwise`` is ``False`` + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (-pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha + \\\\ + y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + Args: + boxes1 (torch.Tensor): rotated bboxes 1. 
It has shape (N, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. + boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. mode (str): "iou" (intersection over union) or iof (intersection over foreground). + clockwise (bool): flag indicating whether the positive angular + orientation is clockwise. default True. + `New in version 1.4.3.` Returns: - ious(Tensor): shape (N, M) if aligned == False else shape (N,) + torch.Tensor: Return the ious betweens boxes. If ``aligned`` is + ``False``, the shape of ious is (N, M) else (N,). """ assert mode in ['iou', 'iof'] mode_dict = {'iou': 0, 'iof': 1} @@ -35,7 +133,12 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): if aligned: ious = bboxes1.new_zeros(rows) else: - ious = bboxes1.new_zeros((rows * cols)) + ious = bboxes1.new_zeros(rows * cols) + if not clockwise: + flip_mat = bboxes1.new_ones(bboxes1.shape[-1]) + flip_mat[-1] = -1 + bboxes1 = bboxes1 * flip_mat + bboxes2 = bboxes2 * flip_mat bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_rotated( diff --git a/mmcv/ops/carafe.py b/mmcv/ops/carafe.py index 5154cb3abfccfbbe0a1b2daa67018dbf80aaf6d2..18230c08074f5309e791810a4774e294084c3f5b 100644 --- a/mmcv/ops/carafe.py +++ b/mmcv/ops/carafe.py @@ -1,7 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor from torch.autograd import Function from torch.nn.modules.module import Module @@ -17,7 +20,8 @@ ext_module = ext_loader.load_ext('_ext', [ class CARAFENaiveFunction(Function): @staticmethod - def symbolic(g, features, masks, kernel_size, group_size, scale_factor): + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: return g.op( 'mmcv::MMCVCARAFENaive', features, @@ -27,7 +31,8 @@ class CARAFENaiveFunction(Function): scale_factor_f=scale_factor) @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor @@ -50,12 +55,15 @@ class CARAFENaiveFunction(Function): group_size=group_size, scale_factor=scale_factor) - if features.requires_grad or masks.requires_grad: + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': ctx.save_for_backward(features, masks) return output @staticmethod - def backward(ctx, grad_output): + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: assert grad_output.is_cuda features, masks = ctx.saved_tensors @@ -83,8 +91,8 @@ carafe_naive = CARAFENaiveFunction.apply class CARAFENaive(Module): - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFENaive, self).__init__() + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) @@ -92,7 +100,7 @@ class CARAFENaive(Module): self.group_size = group_size self.scale_factor = scale_factor - def forward(self, features, masks): + def forward(self, features: Tensor, masks: Tensor) -> Tensor: return 
carafe_naive(features, masks, self.kernel_size, self.group_size, self.scale_factor) @@ -100,7 +108,8 @@ class CARAFENaive(Module): class CARAFEFunction(Function): @staticmethod - def symbolic(g, features, masks, kernel_size, group_size, scale_factor): + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: return g.op( 'mmcv::MMCVCARAFE', features, @@ -110,7 +119,8 @@ class CARAFEFunction(Function): scale_factor_f=scale_factor) @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor @@ -139,12 +149,15 @@ class CARAFEFunction(Function): group_size=group_size, scale_factor=scale_factor) - if features.requires_grad or masks.requires_grad: + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': ctx.save_for_backward(features, masks, rfeatures) return output @staticmethod - def backward(ctx, grad_output): + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: assert grad_output.is_cuda features, masks, rfeatures = ctx.saved_tensors @@ -180,7 +193,8 @@ carafe = CARAFEFunction.apply class CARAFE(Module): """ CARAFE: Content-Aware ReAssembly of FEatures - Please refer to https://arxiv.org/abs/1905.02188 for more details. + Please refer to `CARAFE: Content-Aware ReAssembly of FEatures + `_ for more details. Args: kernel_size (int): reassemble kernel size @@ -191,8 +205,8 @@ class CARAFE(Module): upsampled feature map """ - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFE, self).__init__() + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) @@ -200,7 +214,7 @@ class CARAFE(Module): self.group_size = group_size self.scale_factor = scale_factor - def forward(self, features, masks): + def forward(self, features: Tensor, masks: Tensor) -> Tensor: return carafe(features, masks, self.kernel_size, self.group_size, self.scale_factor) @@ -211,8 +225,8 @@ class CARAFEPack(nn.Module): compressor 2) content encoder 3) CARAFE op. Official implementation of ICCV 2019 paper - CARAFE: Content-Aware ReAssembly of FEatures - Please refer to https://arxiv.org/abs/1905.02188 for more details. + `CARAFE: Content-Aware ReAssembly of FEatures + `_. 
Args: channels (int): input feature channels @@ -228,14 +242,14 @@ class CARAFEPack(nn.Module): """ def __init__(self, - channels, - scale_factor, - up_kernel=5, - up_group=1, - encoder_kernel=3, - encoder_dilation=1, - compressed_channels=64): - super(CARAFEPack, self).__init__() + channels: int, + scale_factor: int, + up_kernel: int = 5, + up_group: int = 1, + encoder_kernel: int = 3, + encoder_dilation: int = 1, + compressed_channels: int = 64): + super().__init__() self.channels = channels self.scale_factor = scale_factor self.up_kernel = up_kernel @@ -261,7 +275,7 @@ class CARAFEPack(nn.Module): xavier_init(m, distribution='uniform') normal_init(self.content_encoder, std=0.001) - def kernel_normalizer(self, mask): + def kernel_normalizer(self, mask: Tensor) -> Tensor: mask = F.pixel_shuffle(mask, self.scale_factor) n, mask_c, h, w = mask.size() # use float division explicitly, @@ -274,11 +288,11 @@ class CARAFEPack(nn.Module): return mask - def feature_reassemble(self, x, mask): + def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) return x - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: compressed_x = self.channel_compressor(x) mask = self.content_encoder(compressed_x) mask = self.kernel_normalizer(mask) diff --git a/mmcv/ops/cc_attention.py b/mmcv/ops/cc_attention.py index ff8dd4c56849d504d265346316e2f8abb0a66598..9e5d3325263f18f6b5eb0bfbc522eeaef1999e3b 100644 --- a/mmcv/ops/cc_attention.py +++ b/mmcv/ops/cc_attention.py @@ -6,7 +6,7 @@ import torch.nn.functional as F from mmcv.cnn import PLUGIN_LAYERS, Scale -def NEG_INF_DIAG(n, device): +def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: """Returns a diagonal matrix of size [n, n]. The diagonal are all "-inf". This is for avoiding calculating the @@ -41,7 +41,7 @@ class CrissCrossAttention(nn.Module): in_channels (int): Channels of the input feature map. """ - def __init__(self, in_channels): + def __init__(self, in_channels: int) -> None: super().__init__() self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) @@ -49,14 +49,15 @@ class CrissCrossAttention(nn.Module): self.gamma = Scale(0.) self.in_channels = in_channels - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: """forward function of Criss-Cross Attention. Args: - x (Tensor): Input feature. \ - shape (batch_size, in_channels, height, width) + x (torch.Tensor): Input feature with the shape of + (batch_size, in_channels, height, width). + Returns: - Tensor: Output of the layer, with shape of \ + torch.Tensor: Output of the layer, with the shape of (batch_size, in_channels, height, width) """ B, C, H, W = x.size() @@ -77,7 +78,7 @@ class CrissCrossAttention(nn.Module): return out - def __repr__(self): + def __repr__(self) -> str: s = self.__class__.__name__ s += f'(in_channels={self.in_channels})' return s diff --git a/mmcv/ops/chamfer_distance.py b/mmcv/ops/chamfer_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..d68eafb47c85418c374a1eaf086478e3fc0cb1d1 --- /dev/null +++ b/mmcv/ops/chamfer_distance.py @@ -0,0 +1,95 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Sequence, Tuple + +import torch +from torch import Tensor +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward']) + + +class ChamferDistanceFunction(Function): + """This is an implementation of the 2D Chamfer Distance. + + It has been used in the paper `Oriented RepPoints for Aerial Object + Detection (CVPR 2022) _`. + """ + + @staticmethod + def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]: + """ + Args: + xyz1 (Tensor): Point set with shape (B, N, 2). + xyz2 (Tensor): Point set with shape (B, N, 2). + + Returns: + Sequence[Tensor]: + + - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with + shape (B, N). + - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with + shape (B, N). + - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) + with shape (B, N), which is used to compute the gradient. + - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1) + with shape (B, N), which is used to compute the gradient. + """ + batch_size, n, _ = xyz1.size() + _, m, _ = xyz2.size() + device = xyz1.device + xyz1 = xyz1.contiguous() + xyz2 = xyz2.contiguous() + + dist1 = torch.zeros(batch_size, n).to(device) + dist2 = torch.zeros(batch_size, m).to(device) + idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device) + idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device) + + ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1, + idx2) + ctx.save_for_backward(xyz1, xyz2, idx1, idx2) + return dist1, dist2, idx1, idx2 + + @staticmethod + @once_differentiable + def backward(ctx, grad_dist1: Tensor, grad_dist2: Tensor, + grad_idx1: Tensor, + grad_idx2: Tensor) -> Tuple[Tensor, Tensor]: + """ + + Args: + grad_dist1 (Tensor): Gradient of chamfer distance + (xyz1 to xyz2) with shape (B, N). + grad_dist2 (Tensor): Gradient of chamfer distance + (xyz2 to xyz1) with shape (B, N). + grad_idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) + with shape (B, N), which is used to compute the gradient. + grad_idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1) + with shape (B, N), which is used to compute the gradient. + + Returns: + Tuple[Tensor, Tensor]: + + - grad_xyz1 (Tensor): Gradient of the point set with shape \ + (B, N, 2). + - grad_xyz2 (Tensor): Gradient of the point set with shape \ + (B, N, 2). + """ + xyz1, xyz2, idx1, idx2 = ctx.saved_tensors + device = grad_dist1.device + grad_dist1 = grad_dist1.contiguous() + grad_dist2 = grad_dist2.contiguous() + grad_xyz1 = torch.zeros(xyz1.size()).to(device) + grad_xyz2 = torch.zeros(xyz2.size()).to(device) + + ext_module.chamfer_distance_backward(xyz1, xyz2, grad_xyz1, grad_xyz2, + grad_dist1, grad_dist2, idx1, + idx2) + return grad_xyz1, grad_xyz2 + + +chamfer_distance = ChamferDistanceFunction.apply diff --git a/mmcv/ops/contour_expand.py b/mmcv/ops/contour_expand.py index ea1111e1768b5f27e118bf7dbc0d9c70a7afd6d7..7184609ad9b64d421c17fdfe4a1a0dbeb62d64c8 100644 --- a/mmcv/ops/contour_expand.py +++ b/mmcv/ops/contour_expand.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union + import numpy as np import torch @@ -7,21 +9,22 @@ from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['contour_expand']) -def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, - kernel_num): +def contour_expand(kernel_mask: Union[np.array, torch.Tensor], + internal_kernel_label: Union[np.array, torch.Tensor], + min_kernel_area: int, kernel_num: int) -> list: """Expand kernel contours so that foreground pixels are assigned into instances. - Arguments: - kernel_mask (np.array or Tensor): The instance kernel mask with + Args: + kernel_mask (np.array or torch.Tensor): The instance kernel mask with size hxw. - internal_kernel_label (np.array or Tensor): The instance internal + internal_kernel_label (np.array or torch.Tensor): The instance internal kernel label with size hxw. min_kernel_area (int): The minimum kernel area. kernel_num (int): The instance kernel number. Returns: - label (list): The instance index map with size hxw. + list: The instance index map with size hxw. """ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) @@ -42,7 +45,7 @@ def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, internal_kernel_label, min_kernel_area=min_kernel_area, kernel_num=kernel_num) - label = label.tolist() + label = label.tolist() # type: ignore else: label = ext_module.contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, kernel_num) diff --git a/mmcv/ops/convex_iou.py b/mmcv/ops/convex_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..50050363ac5b08cfa8f86dd186ab7087fac6f48a --- /dev/null +++ b/mmcv/ops/convex_iou.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou']) + + +def convex_giou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return generalized intersection-over-union (Jaccard index) between point + sets and polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (N, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + tuple[torch.Tensor, torch.Tensor]: The first element is the gious + between point sets and polygons with the shape (N,). The second + element is the gradient of point sets with the shape (N, 18). + """ + output = pointsets.new_zeros((pointsets.size(0), 19)) + ext_module.convex_giou(pointsets, polygons, output) + convex_giou = output[:, -1] + points_grad = output[:, 0:-1] + return convex_giou, points_grad + + +def convex_iou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) between point sets and + polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (K, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + torch.Tensor: Return the ious between point sets and polygons with the + shape (N, K). 
+ """ + N, K = pointsets.size(0), polygons.size(0) + ious = pointsets.new_zeros((N, K)) + ext_module.convex_iou(pointsets, polygons, ious) + return ious diff --git a/mmcv/ops/corner_pool.py b/mmcv/ops/corner_pool.py index a33d798b43d405e4c86bee4cd6389be21ca9c637..17ce24952a3b229fb552f450429c948e70aefa19 100644 --- a/mmcv/ops/corner_pool.py +++ b/mmcv/ops/corner_pool.py @@ -1,101 +1,90 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch -from torch import nn +from torch import Tensor, nn from torch.autograd import Function -from ..utils import ext_loader +_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} -ext_module = ext_loader.load_ext('_ext', [ - 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward', - 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward', - 'right_pool_forward', 'right_pool_backward' -]) -_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} +def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: + size = x.size(dim) + output = x.clone() + + ind = 1 + while ind < size: + if flip: + cur_start = 0 + cur_len = size - ind + next_start = ind + next_len = size - ind + else: + cur_start = ind + cur_len = size - ind + next_start = 0 + next_len = size - ind + + # max_temp should be cloned for backward computation + max_temp = output.narrow(dim, cur_start, cur_len).clone() + cur_temp = output.narrow(dim, cur_start, cur_len) + next_temp = output.narrow(dim, next_start, next_len) + + cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp) + + ind = ind << 1 + + return output class TopPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top'])) return output @staticmethod - def forward(ctx, input): - output = ext_module.top_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.top_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 2, True) class BottomPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom'])) return output @staticmethod - def forward(ctx, input): - output = ext_module.bottom_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.bottom_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 2, False) class LeftPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left'])) return output @staticmethod - def forward(ctx, input): - output = ext_module.left_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.left_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 3, True) class RightPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right'])) return output 
@staticmethod - def forward(ctx, input): - output = ext_module.right_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.right_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 3, False) class CornerPool(nn.Module): @@ -104,11 +93,13 @@ class CornerPool(nn.Module): Corner Pooling is a new type of pooling layer that helps a convolutional network better localize corners of bounding boxes. - Please refer to https://arxiv.org/abs/1808.01244 for more details. + Please refer to `CornerNet: Detecting Objects as Paired Keypoints + `_ for more details. + Code is modified from https://github.com/princeton-vl/CornerNet-Lite. Args: - mode(str): Pooling orientation for the pooling layer + mode (str): Pooling orientation for the pooling layer - 'bottom': Bottom Pooling - 'left': Left Pooling @@ -133,13 +124,13 @@ class CornerPool(nn.Module): 'top': (2, True), } - def __init__(self, mode): - super(CornerPool, self).__init__() + def __init__(self, mode: str): + super().__init__() assert mode in self.pool_functions self.mode = mode - self.corner_pool = self.pool_functions[mode] + self.corner_pool: Function = self.pool_functions[mode] - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0': if torch.onnx.is_in_onnx_export(): assert torch.__version__ >= '1.7.0', \ @@ -158,4 +149,8 @@ class CornerPool(nn.Module): pool_tensor = pool_tensor.flip(dim) return pool_tensor else: - return self.corner_pool.apply(x) + if torch.onnx.is_in_onnx_export(): + return self.corner_pool.apply(x) + else: + dim, flip = self.cummax_dim_flip[self.mode] + return _corner_pool(x, dim, flip) diff --git a/mmcv/ops/correlation.py b/mmcv/ops/correlation.py index 3d0b79c301b29915dfaf4d2b1846c59be73127d3..319b7646782637e9ebaac4ef07b82d1f460031b5 100644 --- a/mmcv/ops/correlation.py +++ b/mmcv/ops/correlation.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch from torch import Tensor, nn from torch.autograd import Function @@ -15,14 +17,14 @@ class CorrelationFunction(Function): @staticmethod def forward(ctx, - input1, - input2, - kernel_size=1, - max_displacement=1, - stride=1, - padding=1, - dilation=1, - dilation_patch=1): + input1: Tensor, + input2: Tensor, + kernel_size: int = 1, + max_displacement: int = 1, + stride: int = 1, + padding: int = 1, + dilation: int = 1, + dilation_patch: int = 1) -> Tensor: ctx.save_for_backward(input1, input2) @@ -60,7 +62,9 @@ class CorrelationFunction(Function): @staticmethod @once_differentiable - def backward(ctx, grad_output): + def backward( + ctx, grad_output: Tensor + ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]: input1, input2 = ctx.saved_tensors kH, kW = ctx.kernel_size diff --git a/mmcv/ops/csrc/README.md b/mmcv/ops/csrc/README.md index 3bc02004017a0d607131b4de168b320c3beed23c..dbc82b534b1ab27593361b3053cb61e12fbd420e 100644 --- a/mmcv/ops/csrc/README.md +++ b/mmcv/ops/csrc/README.md @@ -13,11 +13,19 @@ This folder contains all non-python code for MMCV custom ops. Please follow the │ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cuda_helper.hpp │ ├── pytorch_device_registry.hpp -│   └── cuda -│   ├── common_cuda_helper.hpp -│   ├── parrots_cudawarpfunction.cuh -│   ├── ... 
-│   └── ops_cuda_kernel.cuh +│   ├── cuda +│   │ ├── common_cuda_helper.hpp +│   │ ├── parrots_cudawarpfunction.cuh +│   │ ├── ... +│   │ └── ops_cuda_kernel.cuh +|   ├── mps +│   │ ├── MPSLibrary.h +│   │ ├── ... +│   │ └── MPSUtils.h +|   ├── mlu +│   │ └── ... +|   └── utils +│   │ └── ... ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h @@ -41,9 +49,15 @@ This folder contains all non-python code for MMCV custom ops. Please follow the │   ├── cuda │   │   ├── ... │   │   └── ops_cuda.cu -│   └── cpu +│   ├── cpu +│   │   ├── ... +│   │   └── ops.cpp +│   ├── mps +│   │   ├── ... +│   |   └── op_mps.mm +│   └── mlu │      ├── ... -│      └── ops.cpp +│      └── op_mlu.cpp └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp @@ -63,108 +77,113 @@ This folder contains all non-python code for MMCV custom ops. Please follow the - `common`: This directory contains all tools and shared codes. - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax. -- `onnxruntime`: **ONNX Runtime** support for custom ops. + - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**. + - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device. + - `utils`: The kernels and utils of spconv. +- `onnxruntime`: **ONNX Runtime** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy). - `cpu`: CPU implementation of supported ops. - `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory. - `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory. - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops. - `cpu`: This directory contain cpu implementations of corresponding custom ops. -- `tensorrt`: **TensorRT** support for custom ops. + - `mlu`: This directory contain launchers of each MLU kernels. + - `mps`: MPS ops implementation and launchers. +- `tensorrt`: **TensorRT** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy). - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`. ## How to add new PyTorch ops? 1. (Optional) Add shared kernel in `common` to support special hardware platform. - ```c++ - // src/common/cuda/new_ops_cuda_kernel.cuh - - template - __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { - // forward here - } - - ``` - - Add cuda kernel launcher in `pytorch/cuda`. - - ```c++ - // src/pytorch/cuda - #include - - void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ - // initialize - at::cuda::CUDAGuard device_guard(input.device()); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - ... 
- AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { - new_ops_forward_cuda_kernel - <<>>( - input.data_ptr(), output.data_ptr(),...); - })); - AT_CUDA_CHECK(cudaGetLastError()); - } - ``` + ```c++ + // src/common/cuda/new_ops_cuda_kernel.cuh + + template + __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { + // forward here + } + + ``` + + Add cuda kernel launcher in `pytorch/cuda`. + + ```c++ + // src/pytorch/cuda + #include + + void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ + // initialize + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + ... + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { + new_ops_forward_cuda_kernel + <<>>( + input.data_ptr(), output.data_ptr(),...); + })); + AT_CUDA_CHECK(cudaGetLastError()); + } + ``` 2. Register implementation for different devices. - ```c++ - // src/pytorch/cuda/cudabind.cpp - ... + ```c++ + // src/pytorch/cuda/cudabind.cpp + ... - Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ - // implement cuda forward here - // use `NewOpsForwardCUDAKernelLauncher` here - } - // declare interface here. - Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); - // register the implementation for given device (CUDA here). - REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); - ``` + Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ + // implement cuda forward here + // use `NewOpsForwardCUDAKernelLauncher` here + } + // declare interface here. + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); + // register the implementation for given device (CUDA here). + REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); + ``` 3. Add ops implementation in `pytorch` directory. Select different implementations according to device type. - ```c++ - // src/pytorch/new_ops.cpp - Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ - // dispatch the implementation according to the device type of input. - DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); - } - ... + ```c++ + // src/pytorch/new_ops.cpp + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ + // dispatch the implementation according to the device type of input. + DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); + } + ... - Tensor new_ops_forward(Tensor input, Tensor output, ...){ - return new_ops_forward_impl(input, output, ...); - } - ``` + Tensor new_ops_forward(Tensor input, Tensor output, ...){ + return new_ops_forward_impl(input, output, ...); + } + ``` 4. Binding the implementation in `pytorch/pybind.cpp` - ```c++ - // src/pytorch/pybind.cpp + ```c++ + // src/pytorch/pybind.cpp - ... + ... - Tensor new_ops_forward(Tensor input, Tensor output, ...); + Tensor new_ops_forward(Tensor input, Tensor output, ...); - ... + ... - // bind with pybind11 - m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", - py::arg("input"), py::arg("output"), ...); + // bind with pybind11 + m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", + py::arg("input"), py::arg("output"), ...); - ... + ... - ``` + ``` 5. Build MMCV again. 
Enjoy new ops in python - ```python - from ..utils import ext_loader - ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) + ```python + from ..utils import ext_loader + ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) - ... + ... - ext_module.new_ops_forward(input, output, ...) + ext_module.new_ops_forward(input, output, ...) - ``` + ``` diff --git a/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp index 67190dc10eb245bb2bea23133ac984cd1c5a4888..243200e156f1384b625d6bac7fa4c68e533d9441 100644 --- a/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp +++ b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp @@ -220,6 +220,10 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], return temp > 0; } }); + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } #endif // Step 4: diff --git a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..36e41107ebd52d3cf5e9a71cffe6eddeed4f0765 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh @@ -0,0 +1,59 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu +#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH +#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void active_rotated_filter_forward_cuda_kernel( + const int nthreads, const scalar_t* weight_data, const int* indices_data, + const int num_input_planes, const int num_output_planes, + const int num_orientations, const int num_rotations, const int nEntry, + scalar_t* output_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t val = *(weight_data + index); + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t* target = output_data + + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx; + *target = val; + } + } +} + +template +__global__ void active_rotated_filter_backward_cuda_kernel( + const int nthreads, const scalar_t* gradWeight_data, + const int* indices_data, const int num_input_planes, + const int num_output_planes, const int num_orientations, + const int num_rotations, const int nEntry, scalar_t* weight_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t* val = weight_data + index; + *val = 0; + scalar_t tmp = 0; + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t target = + *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx); + tmp = tmp + target; + } + *val = tmp; + } +} +#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh index 
056d12334b555bbbf14253382736bd6329805559..9f9250844b9ceeca0df0377640c3d28e3f61cecc 100644 --- a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh @@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel( const int O, const int aggregate, const T* points, const T* centers, const T* scores, const int64_t* knn_idx, T* output) { // ----- parallel loop for B, N1, K and O --------- - long i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= B * N1 * K * O) return; - // ------- loop for M ---------- - const int b = (int)(i / (O * N1 * K)); - const int o = (int)(i % (O * N1 * K) / (N1 * K)); - const int n = (int)(i % (N1 * K) / K); - const int k = (int)(i % K); - const int cn = (int)knn_idx[b * K * N1 + n * K + - 0]; // The first neighbor is the center point - const int kn = (int)knn_idx[b * K * N1 + n * K + k]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - return; - } - assert(b < B); - assert(kn < N0); - assert(cn < N0); - assert(o < O); - assert(n < N1); - const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; - T val = output[out_idx]; - for (int m = 0; m < M; m++) { - val += points[b * N0 * M * O + kn * M * O + m * O + o] * - scores[b * N1 * K * M + n * K * M + k * M + m] - - centers[b * N0 * M * O + cn * M * O + m * O + o] * - scores[b * N1 * K * M + n * K * M + k * M + m]; + CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { + // ------- loop for M ---------- + const int b = (int)(i / (O * N1 * K)); + const int o = (int)(i % (O * N1 * K) / (N1 * K)); + const int n = (int)(i % (N1 * K) / K); + const int k = (int)(i % K); + const int cn = (int)knn_idx[b * K * N1 + n * K + + 0]; // The first neighbor is the center point + const int kn = (int)knn_idx[b * K * N1 + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + assert(b < B); + assert(kn < N0); + assert(cn < N0); + assert(o < O); + assert(n < N1); + const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; + T val = output[out_idx]; + for (int m = 0; m < M; m++) { + val += points[b * N0 * M * O + kn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m] - + centers[b * N0 * M * O + cn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m]; + } + output[out_idx] = val; } - output[out_idx] = val; } template @@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel( const int O, const int aggregate, const T* grad_out, const T* scores, const int64_t* knn_idx, T* grad_points, T* grad_centers) { // ----- parallel loop for B, M, O --------- - long i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= B * M * O) return; - int b = (int)(i / (M * O)); - int m = (int)(i % (M * O) / O); - int o = (int)(i % O); + CUDA_1D_KERNEL_LOOP(i, B * M * O) { + int b = (int)(i / (M * O)); + int m = (int)(i % (M * O) / O); + int o = (int)(i % O); - // ----- loop for N,K --------- - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - int kn = knn_idx[b * N * K + n * K + k]; - int cn = knn_idx[b * N * K + n * K + 0]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - continue; + // ----- loop for N,K --------- + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + int kn = knn_idx[b * N * K + n * K + k]; + int cn = knn_idx[b * N * K + n * K + 0]; + if (kn >= N0 || kn < 0) { // if index overflows, it is out of the + // neighborhood range + 
continue; + } + atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, + scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, + -scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); } - atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, - scores[b * N * K * M + n * K * M + k * M + m] * - grad_out[b * O * N * K + o * N * K + n * K + k]); - atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, - -scores[b * N * K * M + n * K * M + k * M + m] * - grad_out[b * O * N * K + o * N * K + n * K + k]); } } } @@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel( const int O, const int aggregate, const T* grad_out, const T* points, const T* centers, const int64_t* knn_idx, T* grad_scores) { // ----- parallel loop for B, N, K, M --------- - long i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= B * N * K * M) return; - const int b = (int)(i / (N * M * K)); - const int n = (int)(i % (N * M * K) / M / K); - const int k = (int)(i % (M * K) / M); - const int m = (int)(i % M); - const int cn = knn_idx[b * N * K + n * K + 0]; - const int kn = knn_idx[b * N * K + n * K + k]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - return; - } + CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { + const int b = (int)(i / (N * M * K)); + const int n = (int)(i % (N * M * K) / M / K); + const int k = (int)(i % (M * K) / M); + const int m = (int)(i % M); + const int cn = knn_idx[b * N * K + n * K + 0]; + const int kn = knn_idx[b * N * K + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } - // -------------- loop for O ------------------------ - const int out_idx = b * N * K * M + n * K * M + k * M + m; - T val = grad_scores[out_idx]; - for (int o = 0; o < O; o++) { - val += (points[b * N0 * M * O + kn * M * O + m * O + o] - - centers[b * N0 * M * O + cn * M * O + m * O + o]) * - grad_out[b * O * N * K + o * N * K + n * K + k]; + // -------------- loop for O ------------------------ + const int out_idx = b * N * K * M + n * K * M + k * M + m; + T val = grad_scores[out_idx]; + for (int o = 0; o < O; o++) { + val += (points[b * N0 * M * O + kn * M * O + m * O + o] - + centers[b * N0 * M * O + cn * M * O + m * O + o]) * + grad_out[b * O * N * K + o * N * K + n * K + k]; + } + grad_scores[out_idx] = val; } - grad_scores[out_idx] = val; } #endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh index ba2af01b5e4c67ec8498ac167e26a5116d853b62..632b5c4940b33a9d8d839fa3f3b92e7b6a2bd29e 100644 --- a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh @@ -21,35 +21,36 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m, // output: // idx: (B, M, nsample) int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= m) return; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; - new_xyz += bs_idx * m * 3 + pt_idx * 3; - xyz += bs_idx * n * 3; - idx += bs_idx * m * nsample + pt_idx * nsample; + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; - float max_radius2 = max_radius * max_radius; - float min_radius2 
= min_radius * min_radius; - T new_x = new_xyz[0]; - T new_y = new_xyz[1]; - T new_z = new_xyz[2]; + float max_radius2 = max_radius * max_radius; + float min_radius2 = min_radius * min_radius; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; - int cnt = 0; - for (int k = 0; k < n; ++k) { - T x = xyz[k * 3 + 0]; - T y = xyz[k * 3 + 1]; - T z = xyz[k * 3 + 2]; - T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + - (new_z - z) * (new_z - z); - if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { - if (cnt == 0) { - for (int l = 0; l < nsample; ++l) { - idx[l] = k; + int cnt = 0; + for (int k = 0; k < n; ++k) { + T x = xyz[k * 3 + 0]; + T y = xyz[k * 3 + 1]; + T z = xyz[k * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[l] = k; + } } + idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; } - idx[cnt] = k; - ++cnt; - if (cnt >= nsample) break; } } } diff --git a/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh index 249c9e85009d00af2bee5380a0013135f36c303b..15bd91eca629895d3a99dde3fe6614036ca31dc9 100644 --- a/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh @@ -8,6 +8,27 @@ #include "pytorch_cuda_helper.hpp" #endif +template +__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1, + T& y1, T& x2, T& y2) { + x1 = bbox[base]; + y1 = bbox[base + 1]; + x2 = bbox[base + 2]; + y2 = bbox[base + 3]; +} + +template <> +__device__ __forceinline__ void load_bbox(const float* bbox, + const int base, float& x1, + float& y1, float& x2, + float& y2) { + const float4 bbox_offset = reinterpret_cast(bbox + base)[0]; + x1 = bbox_offset.x; + y1 = bbox_offset.y; + x2 = bbox_offset.z; + y2 = bbox_offset.w; +} + template __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, T* ious, const int num_bbox1, @@ -16,69 +37,111 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, const int offset) { if (aligned) { CUDA_1D_KERNEL_LOOP(index, num_bbox1) { - int b1 = index; - int b2 = index; - - int base1 = b1 * 4; - T b1_x1 = bbox1[base1]; - T b1_y1 = bbox1[base1 + 1]; - T b1_x2 = bbox1[base1 + 2]; - T b1_y2 = bbox1[base1 + 3]; - T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); - - int base2 = b2 * 4; - T b2_x1 = bbox2[base2]; - T b2_y1 = bbox2[base2 + 1]; - T b2_x2 = bbox2[base2 + 2]; - T b2_y2 = bbox2[base2 + 3]; - T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); - - T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); - T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); - T width = fmaxf(right - left + offset, 0.f); - T height = fmaxf(bottom - top + offset, 0.f); - T interS = width * height; - T baseS = 1.0; - if (mode == 0) { - baseS = fmaxf(b1_area + b2_area - interS, T(offset)); - } else if (mode == 1) { - baseS = fmaxf(b1_area, T(offset)); - } + const int b1 = index; + const int b2 = index; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * 
(b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } else { CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { - int b1 = index / num_bbox2; - int b2 = index % num_bbox2; - - int base1 = b1 * 4; - T b1_x1 = bbox1[base1]; - T b1_y1 = bbox1[base1 + 1]; - T b1_x2 = bbox1[base1 + 2]; - T b1_y2 = bbox1[base1 + 3]; - T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); - - int base2 = b2 * 4; - T b2_x1 = bbox2[base2]; - T b2_y1 = bbox2[base2 + 1]; - T b2_x2 = bbox2[base2 + 2]; - T b2_y2 = bbox2[base2 + 3]; - T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); - - T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); - T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); - T width = fmaxf(right - left + offset, 0.f); - T height = fmaxf(bottom - top + offset, 0.f); - T interS = width * height; - T baseS = 1.0; - if (mode == 0) { - baseS = fmaxf(b1_area + b2_area - interS, T(offset)); - } else if (mode == 1) { - baseS = fmaxf(b1_area, T(offset)); - } + const int b1 = index / num_bbox2; + const int b2 = index % num_bbox2; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } } +#if __CUDA_ARCH__ >= 530 +__device__ __forceinline__ __half __half_area(const __half x1, const __half y1, + const __half x2, const __half y2, + const __half offset) { + const __half half_w = __hadd(__hsub(x2, x1), offset); + const __half half_h = __hadd(__hsub(y2, y1), offset); + return __hmul(half_w, half_h); +} + +__device__ __forceinline__ __half __half_max(const __half a, const __half b) { + return __hge(a, b) ? a : b; +} + +__device__ __forceinline__ __half __half_min(const __half a, const __half b) { + return __hle(a, b) ? a : b; +} + +// fp16 won't provide much increase when aligned==true. It is useful when +// aligned==false, which would give you ~40% bonus. +__device__ void bbox_overlaps_cuda_kernel_half( + const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1, + const int num_bbox2, const int mode, const bool aligned, const int offset) { + const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2; + const __half h_offset = __int2half_rn(offset); + CUDA_1D_KERNEL_LOOP(index, num_output) { + const int b1 = aligned ? index : index / num_bbox2; + const int b2 = aligned ? 
index : index % num_bbox2; + + const int base1 = b1 << 2; + __half b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset); + + const int base2 = b2 << 2; + __half b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset); + + const __half left = __half_max(b1_x1, b2_x1), + right = __half_min(b1_x2, b2_x2); + const __half top = __half_max(b1_y1, b2_y1), + bottom = __half_min(b1_y2, b2_y2); + const __half width = + __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f)); + const __half height = + __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f)); + const __half interS = __hmul(width, height); + + const __half baseS = __half_max( + mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area, + h_offset); + ious[index] = __hdiv(interS, baseS); + } +} +#endif // __CUDA_ARCH__ >= 530 + #endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh index 07beeda57f70389d067e16b549b1a6042780a624..e7fa990fea1849f626baa0b81a726564373216a8 100644 --- a/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh @@ -32,12 +32,12 @@ __device__ inline int Loc2Index(const int n, const int c, const int h, #ifndef HIP_DIFF /* TODO: move this to a common place */ template -__device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) { +__device__ inline scalar_t min(scalar_t a, scalar_t b) { return a < b ? a : b; } template -__device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) { +__device__ inline scalar_t max(scalar_t a, scalar_t b) { return a > b ? a : b; } #endif diff --git a/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..89feea4a546a5093967f26393ca6be3b9fe6ae05 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh @@ -0,0 +1,101 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+// Modified from +// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu +#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH +#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 + +template +__global__ void chamfer_distance_forward_cuda_kernel(int b, int n, + const scalar_t* xyz, int m, + const scalar_t* xyz2, + scalar_t* result, + int* result_i) { + __shared__ scalar_t buf[MAX_SHARED_SCALAR_T]; + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) { + int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2; + for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) { + buf[j] = xyz2[(i * m + k2) * 2 + j]; + } + __syncthreads(); + for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { + scalar_t x1 = xyz[(i * n + j) * 2 + 0]; + scalar_t y1 = xyz[(i * n + j) * 2 + 1]; + int best_i = 0; + scalar_t best = 1e10; + int end_ka = end_k & (~2); + if (end_ka == THREADS_PER_BLOCK) { + for (int k = 0; k < THREADS_PER_BLOCK; k += 4) { +#pragma unroll + for (int j = 0; j < 4; ++j) { + scalar_t x2 = buf[(k + j) * 2] - x1; + scalar_t y2 = buf[(k + j) * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (d < best) { + best = d; + best_i = k + k2 + j; + } + } + } + } else { + for (int k = 0; k < end_ka; k += 4) { +#pragma unroll + for (int j = 0; j < 4; ++j) { + scalar_t x2 = buf[(k + j) * 2] - x1; + scalar_t y2 = buf[(k + j) * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (d < best) { + best = d; + best_i = k + k2 + j; + } + } + } + } + for (int k = end_ka; k < end_k; k++) { + scalar_t x2 = buf[k * 2 + 0] - x1; + scalar_t y2 = buf[k * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (k == 0 || d < best) { + best = d; + best_i = k + k2; + } + } + if (k2 == 0 || result[(i * n + j)] > best) { + result[(i * n + j)] = best; + result_i[(i * n + j)] = best_i; + } + } + __syncthreads(); + } + } +} + +template +__global__ void chamfer_distance_backward_cuda_kernel( + int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2, + const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1, + scalar_t* grad_xyz2) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { + scalar_t x1 = xyz1[(i * n + j) * 2 + 0]; + scalar_t y1 = xyz1[(i * n + j) * 2 + 1]; + int j2 = idx1[i * n + j]; + scalar_t x2 = xyz2[(i * m + j2) * 2 + 0]; + scalar_t y2 = xyz2[(i * m + j2) * 2 + 1]; + scalar_t g = grad_dist1[i * n + j] * 2; + atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2)); + atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2)); + atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2))); + atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2))); + } + } +} +#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp index dc5df1730ee20f7f97c5cbf14c7f8da849820feb..b12aa9a26a2cc162fd89f68ccc97e17749090a41 100644 --- a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp +++ b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp @@ -7,12 +7,20 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) -#define THREADS_PER_BLOCK 512 +#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += 
blockDim.x * gridDim.x) \ + for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \ + j += blockDim.y * gridDim.y) + +#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \ + for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \ + for (size_t j = blockIdx.y; j < (m); j += gridDim.y) -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) +#define THREADS_PER_BLOCK 512 -inline int GET_BLOCKS(const int N) { - int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; +inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) { + int optimal_block_num = (N + num_threads - 1) / num_threads; int max_block_num = 4096; return min(optimal_block_num, max_block_num); } diff --git a/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2af96f7963ec347486ced942a5ef7cc4f187db8b --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh @@ -0,0 +1,831 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef CONVEX_IOU_CUDA_KERNEL_CUH +#define CONVEX_IOU_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAXN 100 +#define NMAX 512 +__device__ const double EPS = 1E-8; + +__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); } + +struct Point { + double x, y; + __device__ Point() {} + __device__ Point(double x, double y) : x(x), y(y) {} +}; + +__device__ inline bool point_same(Point& a, Point& b) { + return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0; +} + +__device__ inline void swap1(Point* a, Point* b) { + Point temp; + temp.x = a->x; + temp.y = a->y; + + a->x = b->x; + a->y = b->y; + + b->x = temp.x; + b->y = temp.y; +} + +__device__ inline void reverse1(Point* a, const int n) { + for (int i = 0; i < (n - 1) / 2.0; i++) { + Point* j = &(a[i]); + Point* k = &(a[n - 1 - i]); + swap1(j, k); + } +} + +__device__ inline double cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); +} + +__device__ inline double dis(Point a, Point b) { + return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); +} +__device__ inline double area(Point* ps, int n) { + ps[n] = ps[0]; + double res = 0; + for (int i = 0; i < n; i++) { + res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; + } + return res / 2.0; +} +__device__ inline double polygon_area_grad(Point* ps, int n, + int* polygon_to_pred_index, + int n_pred, double* grad_C) { + ps[n] = ps[0]; + double partion_grad[4 * 30 + 2]; + double res = 0; + for (int i = 0; i < n; i++) { + res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; + partion_grad[i * 4 + 2] = ps[i + 1].y; + partion_grad[i * 4 + 3] = -ps[i + 1].x; + if (i != n - 1) { + partion_grad[i * 4 + 4] = -ps[i].y; + partion_grad[i * 4 + 5] = ps[i].x; + } else { + partion_grad[0] = -ps[i].y; + partion_grad[1] = ps[i].x; + } + } + for (int i = 0; i < n; i++) { + for (int j = 0; j < n_pred; j++) { + if (i == polygon_to_pred_index[j]) { + grad_C[2 * polygon_to_pred_index[j + n_pred]] = + (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2; + break; + } + } + for (int j = 0; j < n_pred; j++) { + if (i == polygon_to_pred_index[j]) { + grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] = + (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2; + break; + } + } + } + + return res / 2.0; +} + +__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p, + double* cut_grad, int m, 
int n, int i) { + double s1, s2; + double s2_s1_2; + double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd; + double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd; + s1 = cross(a, b, c); + s2 = cross(a, b, d); + + ds1_dxc = -(b.y - a.y); + ds1_dyc = b.x - a.x; + ds2_dxd = ds1_dxc; + ds2_dyd = ds1_dyc; + s2_s1_2 = (s2 - s1) * (s2 - s1); + + if (sig(s1) == 0 && sig(s2) == 0) return 2; + if (sig(s2 - s1) == 0) return 0; + + dxp_dxc = + ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) / + (s2_s1_2); + dxp_dyc = + ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) / + (s2_s1_2); + dxp_dxd = + ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) / + (s2_s1_2); + dxp_dyd = + ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) / + (s2_s1_2); + + dyp_dxc = + ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) / + (s2_s1_2); + dyp_dyc = + ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) / + (s2_s1_2); + dyp_dxd = + ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) / + (s2_s1_2); + dyp_dyd = + ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) / + (s2_s1_2); + + p.x = (c.x * s2 - d.x * s1) / (s2 - s1); + p.y = (c.y * s2 - d.y * s1) / (s2 - s1); + if (i == n - 1) { + cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; + cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; + cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; + cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; + cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd; + cut_grad[4 * n * m + 1] = dyp_dxd; + cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd; + cut_grad[4 * n * m + 3] = dyp_dyd; + } else { + cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; + cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; + cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; + cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; + cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd; + cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd; + cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd; + cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd; + } + + return 1; +} +__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b, + double* cut_grad) { + Point pp[MAXN]; + double ccur_grad[MAXN] = {}; + int m = 0; + p[n] = p[0]; + int k = n; + for (int i = 0; i < n; i++) { + if (sig(cross(a, b, p[i])) > 0) { + pp[m] = p[i]; + ccur_grad[4 * n * m + 4 * i] = 1.0; + ccur_grad[4 * n * m + 4 * i + 3] = 1.0; + m++; + } + if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { + lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i); + m++; + } + } + + n = 0; + for (int i = 0; i < m; i++) { + if (!i || !(point_same(pp[i], pp[i - 1]))) { + p[n] = pp[i]; + for (int j = 0; j < 4 * k; j++) { + cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j]; + } + n++; + } + } + + while (n > 1 && point_same(p[n - 1], p[0])) n--; +} + +__device__ inline double intersectArea(Point a, Point b, Point c, Point d, + double* grad_AB, int order, + int convex_n) { + Point o(0, 0); + int res_flag = 0; + int s1 = sig(cross(o, a, b)); + int s2 = sig(cross(o, c, d)); + if (s1 == 0 || s2 == 0) return 0.0; + if (s1 == -1) { + Point* i = &a; + Point* j = &b; + swap1(i, j); + res_flag = 1; + } + if (s2 == -1) { + Point* i = &c; + Point* j = &d; + swap1(i, j); + } + Point p[10] = {o, a, b}; + int n = 3, n0 = 3, n1, n2, n3; + double cut_grad1[MAXN] = {}; + double cut_grad2[MAXN] = {}; + double cut_grad3[MAXN] = {}; + double 
p1_p_grad[10][10] = {}; + double p2_p1_grad[10][10] = {}; + double p3_p2_grad[10][10] = {}; + + double p3_p1_grad[10][10] = {}; + double p3_p_grad[10][10] = {}; + + // 1 + polygon_cut(p, n, o, c, cut_grad1); + n1 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n0; j++) { + if (!(j % 2)) { + p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j]; + } else { + p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j]; + } + } + } + + // 2 + polygon_cut(p, n, c, d, cut_grad2); + n2 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n1; j++) { + if (!(j % 2)) { + p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j]; + } else { + p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j]; + } + } + } + // 3 + polygon_cut(p, n, d, o, cut_grad3); + n3 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n2; j++) { + if (!(j % 2)) { + p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j]; + } else { + p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j]; + } + } + } + + // mul + // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1) + for (int i = 0; i < 2 * n3; i++) { + for (int j = 0; j < 2 * n1; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n2; m++) { + sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j]; + } + p3_p1_grad[i][j] = sum; + } + } + + // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0) + for (int i = 0; i < 2 * n3; i++) { + for (int j = 0; j < 2 * n0; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n1; m++) { + sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j]; + } + p3_p_grad[i][j] = sum; + } + } + + // calculate S_grad + int polygon_index_box_index[20]; + double grad_polygon[20]; + double S_grad[6]; + + for (int i = 0; i < n3; i++) { + polygon_index_box_index[i] = i; + polygon_index_box_index[i + n3] = i; + } + + double res = + polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon); + + if (s1 * s2 == -1) { + for (int j = 0; j < 2 * 3; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n3; m++) { + sum = sum - grad_polygon[m] * p3_p_grad[m][j]; + } + S_grad[j] = sum; + } + + if (order != convex_n - 1) { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[2 * order + 2] += S_grad[2]; + grad_AB[2 * order + 3] += S_grad[3]; + + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[2 * order + 2] += S_grad[4]; + grad_AB[2 * order + 3] += S_grad[5]; + } + } else { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[0] += S_grad[2]; + grad_AB[1] += S_grad[3]; + + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[0] += S_grad[4]; + grad_AB[1] += S_grad[5]; + } + } + res = -res; + } else { + for (int j = 0; j < 2 * 3; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n3; m++) { + sum = sum + grad_polygon[m] * p3_p_grad[m][j]; + } + S_grad[j] = sum; + } + + if (order != convex_n - 1) { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[2 * order + 2] += S_grad[2]; + grad_AB[2 * order + 3] += S_grad[3]; + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[2 * order + 2] += S_grad[4]; + grad_AB[2 * order + 3] += S_grad[5]; + } + } else { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[0] += S_grad[2]; + grad_AB[1] += S_grad[3]; + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + 
grad_AB[0] += S_grad[4]; + grad_AB[1] += S_grad[5]; + } + } + } + return res; +} + +__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2, + double* grad_AB) { + if (area(ps1, n1) < 0) reverse1(ps1, n1); + if (area(ps2, n2) < 0) reverse1(ps2, n2); + ps1[n1] = ps1[0]; + ps2[n2] = ps2[0]; + double res = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + res += + intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1); + } + } + return res; +} + +__device__ inline void Jarvis(Point* in_poly, int& n_poly) { + Point p_max, p_k; + int max_index, k_index; + int Stack[NMAX] = {}, top1, top2; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point* j = &(in_poly[0]); + Point* k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]]; + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]]; + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; +} + +__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2, + int n2, double* grad_C) { + Point polygon[MAXN]; + int n = n1 + n2, n_poly = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n - n1; j++) { + if (point_same(ps1[i], ps2[j])) { + for (int k = j; k < n - n1 - 1; k++) { + ps2[k] = ps2[k + 1]; + } + n2--; + break; + } + } + } + n_poly = n1 + n2; + for (int i = 0; i < n_poly; i++) { + if (i < n1) { + polygon[i] = ps1[i]; + } else { + polygon[i] = ps2[i - n1]; + } + } + + Jarvis(polygon, n_poly); + + int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1}; + int n_pred = 0; + for (int i = 0; i < n_poly; i++) { + for (int j = 0; j < n1; j++) { + if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) { + polygon_to_pred_index[n_pred] = i; + polygon_to_pred_index[n_pred + n1] = j; + n_pred += 1; + break; + } + } + } + if (n_pred == 0) { + double polygon_area = fabs(area(polygon, n_poly)); + for (int i = 0; i < 18; i++) { + grad_C[i] = 0.0; + } + return polygon_area; + } else { + double polygon_area = + polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C); + if (polygon_area < 0) { + for (int i = 0; i < 18; i++) { 
+ grad_C[i] = -grad_C[i]; + } + } + return fabs(polygon_area); + } +} + +// convex_find and get the polygon_index_box_index +__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly, + int* points_to_convex_ind) { + int n_input = n_poly; + Point input_poly[20]; + for (int i = 0; i < n_input; i++) { + input_poly[i].x = in_poly[i].x; + input_poly[i].y = in_poly[i].y; + } + Point p_max, p_k; + int max_index, k_index; + int Stack[20], top1, top2; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point* j = &(in_poly[0]); + Point* k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + for (int i = 0; i <= top1; i++) { + right_point[i] = in_poly[Stack[i]]; + } + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + + for (int i = top2 - 1; i >= 0; i--) { + left_point[i] = in_poly[Stack[i]]; + } + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; + for (int i = 0; i < n_poly; i++) { + for (int j = 0; j < n_input; j++) { + if (point_same(in_poly[i], input_poly[j])) { + points_to_convex_ind[i] = j; + break; + } + } + } +} + +template +__device__ inline float devrIoU(T const* const p, T const* const q, + T* point_grad, const int idx) { + Point ps1[MAXN], ps2[MAXN]; + + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = (double)p[i * 2]; + convex[i].y = (double)p[i * 2 + 1]; + } + int n_convex = 9; + int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; + Jarvis_and_index(convex, n_convex, points_to_convex_ind); + + int n1 = n_convex; + int n2 = 4; + + for (int i = 0; i < n1; i++) { + ps1[i].x = (double)convex[i].x; + ps1[i].y = (double)convex[i].y; + } + + for (int i = 0; i < n2; i++) { + ps2[i].x = (double)q[i * 2]; + ps2[i].y = (double)q[i * 2 + 1]; + } + + int polygon_index_box_index[18]; + for (int i = 0; i < n1; i++) { + polygon_index_box_index[i] = i; + polygon_index_box_index[i + n1] = i; + } + + double grad_A[18] = {}; + double grad_AB[18] = {}; + double grad_C[18] = {}; + + double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB); + double S_pred = + polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A); + if (S_pred < 0) { + for (int i = 0; i < n_convex * 2; i++) { + grad_A[i] = -grad_A[i]; + } + } + double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; + + 
double iou = inter_area / union_area; + double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C); + + // printf("%d:live\n", idx); + double rot_giou = iou - (polygon_area - union_area) / polygon_area; + + float grad_point_temp[18] = {}; + + for (int i = 0; i < n_convex; i++) { + int grad_point = points_to_convex_ind[i]; + grad_point_temp[2 * grad_point] = + (float)((union_area + inter_area) / (union_area * union_area) * + grad_AB[2 * i] - + iou / union_area * grad_A[2 * i] - + 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) - + (union_area) / polygon_area / polygon_area * grad_C[2 * i]); + grad_point_temp[2 * grad_point + 1] = + (float)((union_area + inter_area) / (union_area * union_area) * + grad_AB[2 * i + 1] - + iou / union_area * grad_A[2 * i + 1] - + 1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) - + (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]); + } + + for (int i = 0; i < 9; i++) { + point_grad[2 * i] = grad_point_temp[2 * i]; + point_grad[2 * i + 1] = grad_point_temp[2 * i + 1]; + } + return (float)rot_giou; +} + +template +__global__ void convex_giou_cuda_kernel(const int ex_n_boxes, + const int gt_n_boxes, const T* ex_boxes, + const T* gt_boxes, T* point_grad) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T* cur_box = ex_boxes + index * 18; + const T* cur_gt_box = gt_boxes + index * 8; + T* cur_grad = point_grad + index * 19; + T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x); + cur_grad[18] = giou; + } +} + +__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) { + double s1, s2; + s1 = cross(a, b, c); + s2 = cross(a, b, d); + if (sig(s1) == 0 && sig(s2) == 0) return 2; + if (sig(s2 - s1) == 0) return 0; + p.x = (c.x * s2 - d.x * s1) / (s2 - s1); + p.y = (c.y * s2 - d.y * s1) / (s2 - s1); + return 1; +} + +__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) { + Point pp[MAXN]; + int m = 0; + p[n] = p[0]; + for (int i = 0; i < n; i++) { + if (sig(cross(a, b, p[i])) > 0) { + pp[m] = p[i]; + m++; + } + if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { + lineCross(a, b, p[i], p[i + 1], pp[m]); + m++; + } + } + n = 0; + for (int i = 0; i < m; i++) { + if (!i || !(point_same(pp[i], pp[i - 1]))) { + p[n] = pp[i]; + n++; + } + } + + while (n > 1 && point_same(p[n - 1], p[0])) n--; +} + +__device__ inline double intersectArea(Point a, Point b, Point c, Point d) { + Point o(0, 0); + int s1 = sig(cross(o, a, b)); + int s2 = sig(cross(o, c, d)); + if (s1 == 0 || s2 == 0) return 0.0; + if (s1 == -1) { + Point* i = &a; + Point* j = &b; + swap1(i, j); + } + if (s2 == -1) { + Point* i = &c; + Point* j = &d; + swap1(i, j); + } + Point p[10] = {o, a, b}; + int n = 3; + + polygon_cut(p, n, o, c); + polygon_cut(p, n, c, d); + polygon_cut(p, n, d, o); + double res = area(p, n); + if (s1 * s2 == -1) res = -res; + return res; +} +__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, + int n2) { + if (area(ps1, n1) < 0) reverse1(ps1, n1); + if (area(ps2, n2) < 0) reverse1(ps2, n2); + ps1[n1] = ps1[0]; + ps2[n2] = ps2[0]; + double res = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]); + } + } + return res; +} + +template +__device__ inline float devrIoU(T const* const p, T const* const q) { + Point ps1[MAXN], ps2[MAXN]; + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = (double)p[i * 2]; + convex[i].y = (double)p[i * 2 + 1]; + } + int n_convex = 9; + int 
points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; + Jarvis_and_index(convex, n_convex, points_to_convex_ind); + int n1 = n_convex; + for (int i = 0; i < n1; i++) { + ps1[i].x = (double)convex[i].x; + ps1[i].y = (double)convex[i].y; + } + int n2 = 4; + for (int i = 0; i < n2; i++) { + ps2[i].x = (double)q[i * 2]; + ps2[i].y = (double)q[i * 2 + 1]; + } + double inter_area = intersectAreaO(ps1, n1, ps2, n2); + double S_pred = area(ps1, n1); + double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; + double iou = inter_area / union_area; + return (float)iou; +} + +template +__global__ void convex_iou_cuda_kernel(const int ex_n_boxes, + const int gt_n_boxes, const T* ex_boxes, + const T* gt_boxes, T* iou) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T* cur_box = ex_boxes + index * 18; + for (int i = 0; i < gt_n_boxes; i++) { + iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8); + } + } +} +#endif // CONVEX_IOU_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh index 75ea4add72f597c88c8cdf511a7d2fd04727735b..2f7f112989127da235cb35476e15b206d4c2e3d4 100644 --- a/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh +++ b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh @@ -29,8 +29,8 @@ using namespace torch; #define TensorAcc5R PackedTensorAccessor32 #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W) -#define THREADS_FORWARD 32 -#define THREADS_BACKWARD 16 +#define WARP_SIZE 32 +#define FULL_MASK 0xffffffff template __global__ void correlation_forward_cuda_kernel( @@ -42,8 +42,8 @@ __global__ void correlation_forward_cuda_kernel( const int C = rInput1.size(3); const int n = blockIdx.x; - const int h = blockIdx.y; - const int w = blockIdx.z; + const int h = blockIdx.y * blockDim.y + threadIdx.y; + const int w = blockIdx.z * blockDim.z + threadIdx.z; const int thread = threadIdx.x; const int start_i = -padH + h * dH; @@ -52,13 +52,11 @@ __global__ void correlation_forward_cuda_kernel( const int patchRadH = dilation_patchH * (patchH - 1) / 2; const int patchRadW = dilation_patchW * (patchW - 1) / 2; - __shared__ scalar_t prod_sum[THREADS_FORWARD]; - for (int ph = 0; ph < patchH; ++ph) { int ph_dilated = ph * dilation_patchH - patchRadH; for (int pw = 0; pw < patchW; ++pw) { int pw_dilated = pw * dilation_patchW - patchRadW; - prod_sum[thread] = 0; + scalar_t prod_sum = 0.0f; for (int i = 0; i < kH; ++i) { int i1 = start_i + i * dilationH; int i2 = i1 + ph_dilated; @@ -69,23 +67,20 @@ __global__ void correlation_forward_cuda_kernel( int j2 = j1 + pw_dilated; if WITHIN_BOUNDS(j1, j2, iW, iW) { - for (int c = thread; c < C; c += THREADS_FORWARD) { + for (int c = thread; c < C; c += WARP_SIZE) { scalar_t v1 = rInput1[n][i1][j1][c]; scalar_t v2 = rInput2[n][i2][j2][c]; - prod_sum[thread] += v1 * v2; + prod_sum += v1 * v2; } } } } } // accumulate - __syncthreads(); + for (int offset = 16; offset > 0; offset /= 2) + prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset); if (thread == 0) { - scalar_t reduce_sum = 0; - for (int index = 0; index < THREADS_FORWARD; ++index) { - reduce_sum += prod_sum[index]; - } - output[n][ph][pw][h][w] = reduce_sum; + output[n][ph][pw][h][w] = prod_sum; } } } @@ -97,9 +92,10 @@ __global__ void correlation_backward_cuda_kernel_input1( TensorAcc4R grad_input1, const int kH, const int kW, const int patchH, const int patchW, const int padH, const int padW, const int dilationH, const int dilationW, const int dilation_patchH, const int 
dilation_patchW, - const int dH, const int dW, const int batch) { - const int iH = input2.size(2); - const int iW = input2.size(3); + const int dH, const int dW) { + const int iH = input2.size(1); + const int iW = input2.size(2); + const int C = input2.size(3); const int H = grad_output.size(3); const int W = grad_output.size(4); @@ -107,54 +103,53 @@ __global__ void correlation_backward_cuda_kernel_input1( const int patchRadH = (patchH - 1) / 2; const int patchRadW = (patchW - 1) / 2; - const int n = batch; - const int c = blockIdx.x; + const int n = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; - const int ph_off = threadIdx.x; - const int pw_off = threadIdx.y; const int h_2 = h + padH; const int w_2 = w + padW; const int min_h = h_2 - kH * dilationH; const int min_w = w_2 - kW * dilationW; - __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; - prod_sum[ph_off][pw_off] = 0; - - for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { + extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; + scalar_t *grad_cache = reinterpret_cast(grad_cache_char); + for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { + const int ph = i / patchW; + const int pw = i % patchW; int i1 = h + dilation_patchH * (ph - patchRadH); - for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) { - int j1 = w + dilation_patchW * (pw - patchRadW); - if (WITHIN_BOUNDS(i1, j1, iH, iW)) { - scalar_t val = input2[n][c][i1][j1]; - for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { - int i2 = (h_3) / dH; - if (i2 * dH != h_3) continue; - for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { - int j2 = (w_3) / dW; - if (j2 * dW != w_3) continue; - if - WITHIN_BOUNDS(i2, j2, H, W) { - prod_sum[ph_off][pw_off] += - grad_output[n][ph][pw][i2][j2] * val; - } + int j1 = w + dilation_patchW * (pw - patchRadW); + + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t grad_val = 0.0f; + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if (WITHIN_BOUNDS(i2, j2, H, W)) { + grad_val += grad_output[n][ph][pw][i2][j2]; } } } + grad_cache[i] = grad_val; } } - __syncthreads(); - if (ph_off == 0 && pw_off == 0) { - scalar_t reduce_sum = 0; - for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { - for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { - reduce_sum += prod_sum[ph][pw]; + for (int c = threadIdx.x; c < C; c += blockDim.x) { + scalar_t grad_input_val = 0.0f; + for (int ph = 0; ph < patchH; ++ph) { + int i1 = h + dilation_patchH * (ph - patchRadH); + for (int pw = 0; pw < patchW; ++pw) { + int j1 = w + dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw]; + } } } - grad_input1[n][c][h][w] = reduce_sum; + grad_input1[n][c][h][w] = grad_input_val; } } @@ -163,9 +158,10 @@ __global__ void correlation_backward_cuda_kernel_input2( const TensorAcc5R grad_output, const TensorAcc4R input1, TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, int dilationW, int dilation_patchH, - int dilation_patchW, int dH, int dW, int batch) { - const int iH = input1.size(2); - const int iW = input1.size(3); + int dilation_patchW, int dH, int dW) { + const int iH = input1.size(1); + const int iW = input1.size(2); + const int C = input1.size(3); const int patchRadH = (patchH - 1) / 2; const int 
patchRadW = (patchW - 1) / 2; @@ -176,56 +172,54 @@ __global__ void correlation_backward_cuda_kernel_input2( const int dilatedKH = kH * dilationH; const int dilatedKW = kW * dilationW; - const int n = batch; - const int c = blockIdx.x; + const int n = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; - const int ph_off = threadIdx.x; - const int pw_off = threadIdx.y; - - __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; - prod_sum[ph_off][pw_off] = 0; - for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { + extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; + scalar_t *grad_cache = reinterpret_cast(grad_cache_char); + for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { + const int ph = i / patchW; + const int pw = i % patchW; int i1 = h - dilation_patchH * (ph - patchRadH); - for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) { - int j1 = w - dilation_patchW * (pw - patchRadW); - if - WITHIN_BOUNDS(i1, j1, iH, iW) { - scalar_t val = input1[n][c][i1][j1]; - - const int h_2 = i1 + padH; - const int w_2 = j1 + padW; - const int min_h = h_2 - dilatedKH; - const int min_w = w_2 - dilatedKW; - - for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { - int i2 = (h_3) / dH; - if (i2 * dH != h_3) continue; - for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { - int j2 = (w_3) / dW; - if (j2 * dW != w_3) continue; - if - WITHIN_BOUNDS(i2, j2, H, W) { - prod_sum[ph_off][pw_off] += - grad_output[n][ph][pw][i2][j2] * val; - } - } + int j1 = w - dilation_patchW * (pw - patchRadW); + + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t grad_val = 0.0f; + + const int h_2 = i1 + padH; + const int w_2 = j1 + padW; + const int min_h = h_2 - dilatedKH; + const int min_w = w_2 - dilatedKW; + + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if (WITHIN_BOUNDS(i2, j2, H, W)) { + grad_val += grad_output[n][ph][pw][i2][j2]; } } + } + grad_cache[i] = grad_val; } } - __syncthreads(); - if (ph_off == 0 && pw_off == 0) { - scalar_t reduce_sum = 0; - for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { - for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { - reduce_sum += prod_sum[ph][pw]; + for (int c = threadIdx.x; c < C; c += blockDim.x) { + scalar_t grad_input_val = 0.0f; + for (int ph = 0; ph < patchH; ++ph) { + int i1 = h - dilation_patchH * (ph - patchRadH); + for (int pw = 0; pw < patchW; ++pw) { + int j1 = w - dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw]; + } } } - grad_input2[n][c][h][w] = reduce_sum; + grad_input2[n][c][h][w] = grad_input_val; } } #endif diff --git a/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3ee1814e12d185a08640f9768d6c87b5eb3428e5 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh @@ -0,0 +1,136 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Adapted from +// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAX_NUM_VERT_IDX 9 +#define INTERSECTION_OFFSET 8 +#define EPSILON 1e-8 + +inline int opt_n_thread(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + return max(min(1 << pow_2, THREADS_PER_BLOCK), 1); +} + +/* +compare normalized vertices (vertices around (0,0)) +if vertex1 < vertex2 return true. +order: minimum at x-aixs, become larger in anti-clockwise direction +*/ +__device__ bool compare_vertices(float x1, float y1, float x2, float y2) { + if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON) + return false; // if equal, return false + + if (y1 > 0 && y2 < 0) return true; + if (y1 < 0 && y2 > 0) return false; + + float n1 = x1 * x1 + y1 * y1 + EPSILON; + float n2 = x2 * x2 + y2 * y2 + EPSILON; + float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2; + + if (y1 > 0 && y2 > 0) { + if (diff > EPSILON) + return true; + else + return false; + } + if (y1 < 0 && y2 < 0) { + if (diff < EPSILON) + return true; + else + return false; + } +} + +__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ vertices, + const bool *__restrict__ mask, const int *__restrict__ num_valid, + int *__restrict__ idx) { + int batch_idx = blockIdx.x; + vertices += batch_idx * n * m * 2; + mask += batch_idx * n * m; + num_valid += batch_idx * n; + idx += batch_idx * n * MAX_NUM_VERT_IDX; + + int index = threadIdx.x; // index of polygon + int stride = blockDim.x; + for (int i = index; i < n; i += stride) { + int pad; // index of arbitrary invalid intersection point (not box corner!) + for (int j = INTERSECTION_OFFSET; j < m; ++j) { + if (!mask[i * m + j]) { + pad = j; + break; + } + } + if (num_valid[i] < 3) { + // not enough vertices, take an invalid intersection point + // (zero padding) + for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } else { + // sort the valid vertices + // note the number of valid vertices is known + // note: check that num_valid[i] < MAX_NUM_VERT_IDX + for (int j = 0; j < num_valid[i]; ++j) { + // initialize with a "big" value + float x_min = 1; + float y_min = -EPSILON; + int i_take = 0; + int i2; + float x2, y2; + if (j != 0) { + i2 = idx[i * MAX_NUM_VERT_IDX + j - 1]; + x2 = vertices[i * m * 2 + i2 * 2 + 0]; + y2 = vertices[i * m * 2 + i2 * 2 + 1]; + } + for (int k = 0; k < m; ++k) { + float x = vertices[i * m * 2 + k * 2 + 0]; + float y = vertices[i * m * 2 + k * 2 + 1]; + if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) { + if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) { + x_min = x; + y_min = y; + i_take = k; + } + } + } + idx[i * MAX_NUM_VERT_IDX + j] = i_take; + } + // duplicate the first idx + idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0]; + + // pad zeros + for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + + // for corner case: the two boxes are exactly the same. 
+ // in this case, idx would have duplicate elements, which makes the + // shoelace formula broken because of the definition, the duplicate + // elements only appear in the first 8 positions (they are "corners in + // box", not "intersection of edges") + if (num_valid[i] == 8) { + int counter = 0; + for (int j = 0; j < 4; ++j) { + int check = idx[i * MAX_NUM_VERT_IDX + j]; + for (int k = 4; k < INTERSECTION_OFFSET; ++k) { + if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++; + } + } + if (counter == 4) { + idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0]; + for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } + } + + // TODO: still might need to cover some other corner cases :( + } + } +} diff --git a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh index c8fc61546acbce55c59abe8371590bba2e610442..6d932434cba245833e661b8c7e140601940bc35b 100644 --- a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh @@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m, int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; - - out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - points += bs_idx * c * n + c_idx * n; - out[0] = points[idx[0]]; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b || c_idx >= c) return; + + out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + points += bs_idx * c * n + c_idx * n; + out[0] = points[idx[0]]; + } } template @@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m, int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b || c_idx >= c) return; - grad_out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - grad_points += bs_idx * c * n + c_idx * n; + grad_out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + grad_points += bs_idx * c * n + c_idx * n; - atomicAdd(grad_points + idx[0], grad_out[0]); + atomicAdd(grad_points + idx[0], grad_out[0]); + } } #endif // GATHER_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh index 9cfc2dc865152769d55d4062b7f6bad25e9c70e8..dfad66fc16d8759f614d7f36fa961673976b1d95 100644 --- a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh @@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n, // out: (B, C, npoints, nsample) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int pt_idx = index / nsample; - if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { + if (bs_idx >= b || c_idx >= c) return; - int sample_idx = index % nsample; + int pt_idx = index / nsample; + int sample_idx = index % nsample; - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - int in_idx = bs_idx * c * n + c_idx * n + idx[0]; - int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + - pt_idx * nsample + sample_idx; 
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int in_idx = bs_idx * c * n + c_idx * n + idx[0]; + int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; - out[out_idx] = points[in_idx]; + out[out_idx] = points[in_idx]; + } } template @@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n, // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int pt_idx = index / nsample; - if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c) return; - int sample_idx = index % nsample; - grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + - pt_idx * nsample + sample_idx; - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int sample_idx = index % nsample; + grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); + atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); + } } #endif // GROUP_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh index 4e261cbd0cf1d69973eab34f32ab2a334d6a13a6..9ebdcad15eee05a9f412ef34eb12d3553874a4dc 100644 --- a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh @@ -50,21 +50,17 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2, } __device__ inline int check_in_box2d(const float *box, const Point &p) { - // params: box (5) [x1, y1, x2, y2, angle] - const float MARGIN = 1e-5; - - float center_x = (box[0] + box[2]) / 2; - float center_y = (box[1] + box[3]) / 2; - float angle_cos = cos(-box[4]), - angle_sin = - sin(-box[4]); // rotate the point in the opposite direction of box - float rot_x = - (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x; - float rot_y = - (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; - - return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && - rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN); + // params: box (7) [x, y, z, dx, dy, dz, heading] + const float MARGIN = 1e-2; + + float center_x = box[0], center_y = box[1]; + // rotate the point in the opposite direction of box + float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; + + return (fabs(rot_x) < box[3] / 2 + MARGIN && + fabs(rot_y) < box[4] / 2 + MARGIN); } __device__ inline int intersection(const Point &p1, const Point &p0, @@ -116,16 +112,19 @@ __device__ inline int point_cmp(const Point &a, const Point &b, } __device__ inline float box_overlap(const float *box_a, const float *box_b) { - // params: box_a (5) [x1, y1, x2, y2, angle] - // params: box_b (5) [x1, y1, x2, y2, angle] + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] - float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], - a_angle = box_a[4]; - float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], - b_angle = box_b[4]; + float 
a_angle = box_a[6], b_angle = box_b[6]; + float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, + a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; + float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; + float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; + float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; + float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; - Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); - Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); + Point center_a(box_a[0], box_a[1]); + Point center_b(box_b[0], box_b[1]); Point box_a_corners[5]; box_a_corners[0].set(a_x1, a_y1); @@ -209,10 +208,10 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) { } __device__ inline float iou_bev(const float *box_a, const float *box_b) { - // params: box_a (5) [x1, y1, x2, y2, angle] - // params: box_b (5) [x1, y1, x2, y2, angle] - float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]); - float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + float sa = box_a[3] * box_a[4]; + float sb = box_b[3] * box_b[4]; float s_overlap = box_overlap(box_a, box_b); return s_overlap / fmaxf(sa + sb - s_overlap, EPS); } @@ -220,149 +219,148 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) { __global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap) { - const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; - const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - - if (a_idx >= num_a || b_idx >= num_b) { - return; - } - const float *cur_box_a = boxes_a + a_idx * 5; - const float *cur_box_b = boxes_b + b_idx * 5; - float s_overlap = box_overlap(cur_box_a, cur_box_b); - ans_overlap[a_idx * num_b + b_idx] = s_overlap; -} - -__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a, - const float *boxes_a, - const int num_b, - const float *boxes_b, - float *ans_iou) { - const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; - const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) { + if (a_idx >= num_a || b_idx >= num_b) { + return; + } - if (a_idx >= num_a || b_idx >= num_b) { - return; + const float *cur_box_a = boxes_a + a_idx * 7; + const float *cur_box_b = boxes_b + b_idx * 7; + float cur_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = cur_overlap; } - - const float *cur_box_a = boxes_a + a_idx * 5; - const float *cur_box_b = boxes_b + b_idx * 5; - float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); - ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; } -__global__ void nms_forward_cuda_kernel(const int boxes_num, - const float nms_overlap_thresh, - const float *boxes, - unsigned long long *mask) { - // params: boxes (N, 5) [x1, y1, x2, y2, ry] +__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, 
blocks) { + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - - __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; - - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; - block_boxes[threadIdx.x * 5 + 1] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; - block_boxes[threadIdx.x * 5 + 2] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; - block_boxes[threadIdx.x * 5 + 3] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; - block_boxes[threadIdx.x * 5 + 4] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; - const float *cur_box = boxes + cur_box_idx * 5; + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { - t |= 1ULL << i; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; } - const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); - mask[cur_box_idx * col_blocks + col_start] = t; } } __device__ inline float iou_normal(float const *const a, float const *const b) { - float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); - float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + // params: a: [x, y, z, dx, dy, dz, heading] + // params: b: [x, y, z, dx, dy, dz, heading] + + float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), + right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 
2); + float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), + bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); float interS = width * height; - float Sa = (a[2] - a[0]) * (a[3] - a[1]); - float Sb = (b[2] - b[0]) * (b[3] - b[1]); + float Sa = a[3] * a[4]; + float Sb = b[3] * b[4]; return interS / fmaxf(Sa + Sb - interS, EPS); } -__global__ void nms_normal_forward_cuda_kernel(const int boxes_num, - const float nms_overlap_thresh, - const float *boxes, - unsigned long long *mask) { - // params: boxes (N, 5) [x1, y1, x2, y2, ry] +__global__ void iou3d_nms3d_normal_forward_cuda_kernel( + const int boxes_num, const float nms_overlap_thresh, const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - - __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; - - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; - block_boxes[threadIdx.x * 5 + 1] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; - block_boxes[threadIdx.x * 5 + 2] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; - block_boxes[threadIdx.x * 5 + 3] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; - block_boxes[threadIdx.x * 5 + 4] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; - } - __syncthreads(); + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); - if (threadIdx.x < row_size) { - const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; - const float *cur_box = boxes + cur_box_idx * 5; + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < 
col_size; i++) { - if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { - t |= 1ULL << i; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; } - const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); - mask[cur_box_idx * col_blocks + col_start] = t; } } diff --git a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh index 3181aa65cddf129e9e97dde97ceb97923b75c135..3cf52bb90eb27d02b28c52069c760c8a38f83f08 100644 --- a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh @@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, const T *xyz, const T *new_xyz, int *__restrict__ idx, T *dist2) { int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= m) return; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; - new_xyz += bs_idx * m * 3 + pt_idx * 3; - xyz += bs_idx * n * 3; - idx += bs_idx * m * nsample + pt_idx * nsample; - dist2 += bs_idx * m * nsample + pt_idx * nsample; + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + dist2 += bs_idx * m * nsample + pt_idx * nsample; - T new_x = new_xyz[0]; - T new_y = new_xyz[1]; - T new_z = new_xyz[2]; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; - float best_dist[100]; - int best_idx[100]; - for (int i = 0; i < nsample; i++) { - best_dist[i] = 1e10; - best_idx[i] = 0; - } - for (int i = 0; i < n; i++) { - T x = xyz[i * 3 + 0]; - T y = xyz[i * 3 + 1]; - T z = xyz[i * 3 + 2]; - T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + - (new_z - z) * (new_z - z); - if (d2 < best_dist[0]) { - best_dist[0] = d2; - best_idx[0] = i; - reheap(best_dist, best_idx, nsample); + float best_dist[100]; + int best_idx[100]; + for (int i = 0; i < nsample; i++) { + best_dist[i] = 1e10; + best_idx[i] = 0; + } + for (int i = 0; i < n; i++) { + T x = xyz[i * 3 + 0]; + T y = xyz[i * 3 + 1]; + T z = xyz[i * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < best_dist[0]) { + best_dist[0] = d2; + best_idx[0] = i; + reheap(best_dist, best_idx, nsample); + } + } + heap_sort(best_dist, best_idx, nsample); + for (int i = 0; i < nsample; i++) { + idx[i] = best_idx[i]; + dist2[i] = best_dist[i]; } - } - heap_sort(best_dist, best_idx, nsample); - for (int i = 0; i < nsample; i++) { - idx[i] = best_idx[i]; - dist2[i] = best_dist[i]; } } diff --git a/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh b/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..df56e743669c3426f6abb113e4209d0cc60f2baf --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh @@ -0,0 +1,300 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH +#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAXN 20 +__device__ const float PI = 3.1415926; + +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(float x, float y) : x(x), y(y) {} +}; + +__device__ inline void swap1(Point *a, Point *b) { + Point temp; + temp.x = a->x; + temp.y = a->y; + + a->x = b->x; + a->y = b->y; + + b->x = temp.x; + b->y = temp.y; +} +__device__ inline float cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); +} + +__device__ inline float dis(Point a, Point b) { + return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); +} +__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) { + float convex_points[2][MAXN]; + for (int j = 0; j < n_points; j++) { + convex_points[0][j] = ps[j].x; + } + for (int j = 0; j < n_points; j++) { + convex_points[1][j] = ps[j].y; + } + + Point edges[MAXN]; + float edges_angles[MAXN]; + float unique_angles[MAXN]; + int n_edges = n_points - 1; + int n_unique = 0; + int unique_flag = 0; + + for (int i = 0; i < n_edges; i++) { + edges[i].x = ps[i + 1].x - ps[i].x; + edges[i].y = ps[i + 1].y - ps[i].y; + } + for (int i = 0; i < n_edges; i++) { + edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x); + if (edges_angles[i] >= 0) { + edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2); + } else { + edges_angles[i] = + edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2); + } + } + unique_angles[0] = edges_angles[0]; + n_unique += 1; + for (int i = 1; i < n_edges; i++) { + for (int j = 0; j < n_unique; j++) { + if (edges_angles[i] == unique_angles[j]) { + unique_flag += 1; + } + } + if (unique_flag == 0) { + unique_angles[n_unique] = edges_angles[i]; + n_unique += 1; + unique_flag = 0; + } else { + unique_flag = 0; + } + } + + float minarea = 1e12; + for (int i = 0; i < n_unique; i++) { + float R[2][2]; + float rot_points[2][MAXN]; + R[0][0] = cos(unique_angles[i]); + R[0][1] = sin(unique_angles[i]); + R[1][0] = -sin(unique_angles[i]); + R[1][1] = cos(unique_angles[i]); + // R x Points + for (int m = 0; m < 2; m++) { + for (int n = 0; n < n_points; n++) { + float sum = 0.0; + for (int k = 0; k < 2; k++) { + sum = sum + R[m][k] * convex_points[k][n]; + } + rot_points[m][n] = sum; + } + } + + // xmin; + float xmin, ymin, xmax, ymax; + xmin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] < xmin) { + xmin = rot_points[0][j]; + } + } + } + // ymin + ymin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] < ymin) { + ymin = rot_points[1][j]; + } + } + } + // xmax + xmax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] > xmax) { + xmax = rot_points[0][j]; + } + } + } + // ymax + ymax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] > ymax) { + ymax = rot_points[1][j]; + } + } + } + float area = (xmax - xmin) * (ymax - ymin); + if (area < minarea) { + minarea = area; + minbox[0] = unique_angles[i]; + minbox[1] = xmin; + minbox[2] = 
ymin; + minbox[3] = xmax; + minbox[4] = ymax; + } + } +} + +// convex_find +__device__ inline void Jarvis(Point *in_poly, int &n_poly) { + int n_input = n_poly; + Point input_poly[20]; + for (int i = 0; i < n_input; i++) { + input_poly[i].x = in_poly[i].x; + input_poly[i].y = in_poly[i].y; + } + Point p_max, p_k; + int max_index, k_index; + int Stack[20], top1, top2; + // float sign; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point *j = &(in_poly[0]); + Point *k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + + for (int i = 0; i <= top1; i++) { + right_point[i] = in_poly[Stack[i]]; + } + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + + for (int i = top2 - 1; i >= 0; i--) { + left_point[i] = in_poly[Stack[i]]; + } + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; +} + +template +__device__ inline void Findminbox(T const *const p, T *minpoints) { + Point ps1[MAXN]; + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = p[i * 2]; + convex[i].y = p[i * 2 + 1]; + } + int n_convex = 9; + Jarvis(convex, n_convex); + int n1 = n_convex; + for (int i = 0; i < n1; i++) { + ps1[i].x = convex[i].x; + ps1[i].y = convex[i].y; + } + ps1[n1].x = convex[0].x; + ps1[n1].y = convex[0].y; + + float minbbox[5] = {0}; + minBoundingRect(ps1, n1 + 1, minbbox); + float angle = minbbox[0]; + float xmin = minbbox[1]; + float ymin = minbbox[2]; + float xmax = minbbox[3]; + float ymax = minbbox[4]; + float R[2][2]; + + R[0][0] = cos(angle); + R[0][1] = sin(angle); + R[1][0] = -sin(angle); + R[1][1] = cos(angle); + + minpoints[0] = xmax * R[0][0] + ymin * R[1][0]; + minpoints[1] = xmax * R[0][1] + ymin * R[1][1]; + minpoints[2] = xmin * R[0][0] + ymin * R[1][0]; + minpoints[3] = xmin * R[0][1] + ymin * R[1][1]; + minpoints[4] = xmin * R[0][0] + ymax * R[1][0]; + minpoints[5] = xmin * R[0][1] + ymax * R[1][1]; + minpoints[6] = xmax * R[0][0] + ymax * R[1][0]; + minpoints[7] = xmax * R[0][1] + ymax * R[1][1]; +} + +template +__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes, + const T *ex_boxes, T *minbox) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T *cur_box = ex_boxes + index * 18; + T *cur_min_box = minbox + index * 8; + Findminbox(cur_box, cur_min_box); + } +} + +#endif // 
MIN_AREA_POLYGONS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh index aff1ea26fafb6574060797d24131b8540594716d..12225ffdb3b1691ad9edabcd1663109f67ef1a6f 100644 --- a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh @@ -14,11 +14,6 @@ #include "common_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp" -const int CUDA_NUM_THREADS = 1024; -inline int GET_BLOCKS(const int N, const int num_threads) { - return (N + num_threads - 1) / num_threads; -} - template __device__ scalar_t ms_deform_attn_im2col_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, @@ -267,10 +262,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + const int qid_stride = num_heads * channels; CUDA_1D_KERNEL_LOOP(index, n) { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -285,11 +281,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { @@ -326,23 +322,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; - for (unsigned int tid = 1; tid < blockSize; ++tid) { + for (unsigned int _tid = 1; _tid < blockSize; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; + _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 1) = _grad_h; + *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -357,10 +353,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = 
threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -375,8 +371,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -425,16 +422,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( } if (tid == 0) { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -449,11 +446,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -468,8 +465,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -509,23 +507,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; - for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; + _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 
1) = _grad_h; + *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -540,11 +538,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -559,8 +557,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -618,16 +617,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( } if (tid == 0) { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -642,11 +641,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -661,8 +660,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = 
grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -720,16 +720,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( } if (tid == 0) { - atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); - atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); - atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -759,8 +759,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -787,12 +788,12 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm( ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, - grad_sampling_loc, grad_attn_weight); + grad_sampling_loc_out, grad_attn_weight_out); } data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } diff --git a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh index 40a2f462202cb06e7230ad3f1e17474e93ddc4cb..0a5c2505f5c7716ba025a5884debed73c46db9d5 100644 --- a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh @@ -30,45 +30,88 @@ __device__ inline bool devIoU(float const *const a, float const *const b, __global__ void nms_cuda(const int n_boxes, const float iou_threshold, const int offset, const float *dev_boxes, unsigned long long *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - const int tid = threadIdx.x; + int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + const int tid = threadIdx.x; + + if (row_start > col_start) return; + + const int row_size = + fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 4]; + if (tid < col_size) { + block_boxes[tid * 4 + 0] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; + block_boxes[tid * 4 + 1] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; + block_boxes[tid * 4 + 2] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; + block_boxes[tid * 4 + 3] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + } + __syncthreads(); + + if (tid < row_size) { + const int cur_box_idx = threadsPerBlock * 
row_start + tid; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + unsigned long long int t = 0; + int start = 0; + if (row_start == col_start) { + start = tid + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { + t |= 1ULL << i; + } + } + dev_mask[cur_box_idx * gridDim.y + col_start] = t; + } + } +} - if (row_start > col_start) return; +__global__ void gather_keep_from_mask(bool *keep, + const unsigned long long *dev_mask, + const int n_boxes) { + const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + const int tid = threadIdx.x; - const int row_size = - fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + // mark the bboxes which have been removed. + extern __shared__ unsigned long long removed[]; - __shared__ float block_boxes[threadsPerBlock * 4]; - if (tid < col_size) { - block_boxes[tid * 4 + 0] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; - block_boxes[tid * 4 + 1] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; - block_boxes[tid * 4 + 2] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; - block_boxes[tid * 4 + 3] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + // initialize removed. + for (int i = tid; i < col_blocks; i += blockDim.x) { + removed[i] = 0; } __syncthreads(); - if (tid < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + tid; - const float *cur_box = dev_boxes + cur_box_idx * 4; - int i = 0; - unsigned long long int t = 0; - int start = 0; - if (row_start == col_start) { - start = tid + 1; - } - for (i = start; i < col_size; i++) { - if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { - t |= 1ULL << i; + for (int nblock = 0; nblock < col_blocks; ++nblock) { + auto removed_val = removed[nblock]; + __syncthreads(); + const int i_offset = nblock * threadsPerBlock; +#pragma unroll + for (int inblock = 0; inblock < threadsPerBlock; ++inblock) { + const int i = i_offset + inblock; + if (i >= n_boxes) break; + // select a candidate, check if it should kept. + if (!(removed_val & (1ULL << inblock))) { + if (tid == 0) { + // mark the output. + keep[i] = true; + } + auto p = dev_mask + i * col_blocks; + // remove all bboxes which overlap the candidate. + for (int j = tid; j < col_blocks; j += blockDim.x) { + if (j >= nblock) removed[j] |= p[j]; + } + __syncthreads(); + removed_val = removed[nblock]; } } - dev_mask[cur_box_idx * gridDim.y + col_start] = t; } } + #endif // NMS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh index 80bed9681f748390999a2963bd3448570b0dbf6a..747327afb83900177dd4721f1b0ba99153f658d7 100644 --- a/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh +++ b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh @@ -43,18 +43,16 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes, // (x_center, y_center, width, height, angle_degrees) here. 
__shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 6 + 0] = + block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; - block_boxes[threadIdx.x * 6 + 1] = + block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; - block_boxes[threadIdx.x * 6 + 2] = + block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; - block_boxes[threadIdx.x * 6 + 3] = + block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; - block_boxes[threadIdx.x * 6 + 4] = + block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; - block_boxes[threadIdx.x * 6 + 5] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5]; } __syncthreads(); @@ -71,7 +69,7 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes, // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from // box_iou_rotated_utils.h - if (single_box_iou_rotated(cur_box, block_boxes + i * 6, 0) > + if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > iou_threshold) { t |= 1ULL << i; } diff --git a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh index 12182cc3704eaacd1da838ce357c2677ad029eaa..342362079a5ce3dde6d19532b3014872f4373330 100644 --- a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh @@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel( // (B, npoints), default -1 int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= batch_size || pt_idx >= pts_num) return; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (bs_idx >= batch_size) return; - boxes += bs_idx * boxes_num * 7; - pts += bs_idx * pts_num * 3 + pt_idx * 3; - box_idx_of_points += bs_idx * pts_num + pt_idx; + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num + pt_idx; - T local_x = 0, local_y = 0; - int cur_in_flag = 0; - for (int k = 0; k < boxes_num; k++) { - cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); - if (cur_in_flag) { - box_idx_of_points[0] = k; - break; + T local_x = 0, local_y = 0; + int cur_in_flag = 0; + for (int k = 0; k < boxes_num; k++) { + cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[0] = k; + break; + } } } } @@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel( // (B, npoints), default -1 int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= batch_size || pt_idx >= pts_num) return; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (bs_idx >= batch_size) return; - boxes += bs_idx * boxes_num * 7; - pts += bs_idx * pts_num * 3 + pt_idx * 3; - box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; - T local_x = 0, local_y = 0; - for (int k = 0; k < boxes_num; k++) { - const int cur_in_flag = - check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); - if (cur_in_flag) { - box_idx_of_points[k] = 1; + T local_x = 0, local_y = 0; + for (int k = 0; k < boxes_num; k++) { 
+ const int cur_in_flag = + check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[k] = 1; + } } } } diff --git a/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a0769d75a29ce8d7eac00931d6f51caa292b2693 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh @@ -0,0 +1,79 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH +#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +struct point { + float x, y; +}; + +template +__global__ void points_in_polygons_forward_cuda_kernel( + const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2, + const int rows, const int cols, scalar_t *inside_flag) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int row = index / cols; + int col = index % cols; + + const scalar_t *offset_vertex1 = vertex1 + row * 2; + const scalar_t *offset_vertex2 = vertex2 + col * 8; + + point point_[1]; + point polygon[4]; + + point_[0].x = offset_vertex1[0]; + point_[0].y = offset_vertex1[1]; + + polygon[0].x = offset_vertex2[0]; + polygon[0].y = offset_vertex2[1]; + polygon[1].x = offset_vertex2[2]; + polygon[1].y = offset_vertex2[3]; + polygon[2].x = offset_vertex2[4]; + polygon[2].y = offset_vertex2[5]; + polygon[3].x = offset_vertex2[6]; + polygon[3].y = offset_vertex2[7]; + + int nCross = 0; + int i, j; + float sx, sy, tx, ty, px, py, x; + for (i = 0, j = 3; i < 4; j = i, i++) { + sx = polygon[i].x; + sy = polygon[i].y; + tx = polygon[j].x; + ty = polygon[j].y; + + px = point_[0].x; + py = point_[0].y; + + if (py < min(sy, ty)) continue; + if (py > max(sy, ty)) continue; + + if ((sx == px && sy == py) || (tx == px && ty == py)) { + break; + } else { + if ((sy < py && ty >= py) || (sy >= py && ty < py)) { + x = sx + (py - sy) * (tx - sx) / (ty - sy); + if (x == px) { + break; + } + if (x > px) { + nCross++; + } + } + } + } + if (nCross % 2 == 1) { + inside_flag[index] = 1.0; + } else { + inside_flag[index] = 0.0; + } + return; + } +} + +#endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ea8c37e22afdd5b3c48c5ea6fc29004d74340fb5 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh @@ -0,0 +1,381 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu +// Distributed under terms of the MIT license. +#ifndef PRROI_POOL_CUDA_KERNEL_CUH +#define PRROI_POOL_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data, + const int h, + const int w, + const int height, + const int width) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + T retVal = overflow ? 
0.0f : data[h * width + w]; + return retVal; +} + +template +__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) { + return (1.0f - abs(dh)) * (1.0f - abs(dw)); +} + +template +__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t, + T c1, T c2) { + return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1; +} + +template +__device__ static T PrRoIPoolingInterpolation(const T *data, const T h, + const T w, const int height, + const int width) { + T retVal = 0.0f; + int h1 = floorf(h); + int w1 = floorf(w); + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h) + 1; + w1 = floorf(w); + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h); + w1 = floorf(w) + 1; + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h) + 1; + w1 = floorf(w) + 1; + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + return retVal; +} + +template +__device__ static T PrRoIPoolingMatCalculation(const T *this_data, + const int s_h, const int s_w, + const int e_h, const int e_w, + const T y0, const T x0, + const T y1, const T x1, + const int h0, const int w0) { + T alpha, beta, lim_alpha, lim_beta, tmp; + T sum_out = 0; + + alpha = x0 - T(s_w); + beta = y0 - T(s_h); + lim_alpha = x1 - T(s_w); + lim_beta = y1 - T(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; + + alpha = x0 - T(s_w); + beta = T(e_h) - y1; + lim_alpha = x1 - T(s_w); + lim_beta = T(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; + + return sum_out; +} + +template +__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff, + const int h, const int w, + const int height, + const int width, + const T coeff) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff); +} + +template +__device__ static void PrRoIPoolingMatDistributeDiff( + T *diff, const T top_diff, const int s_h, const int s_w, const int e_h, + const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, + const int w0) { + T alpha, beta, lim_alpha, lim_beta, tmp; + + alpha = x0 - T(s_w); + beta = y0 - T(s_h); + lim_alpha = x1 - T(s_w); + lim_beta = y1 - T(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, s_h, 
s_w, h0, w0, tmp); + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); + + alpha = x0 - T(s_w); + beta = T(e_h) - y1; + lim_alpha = x1 - T(s_w); + lim_beta = T(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); +} + +template +__global__ void prroi_pool_forward_cuda_kernel( + const int nthreads, const T *input, const T *rois, T *output, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T *offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + T roi_x1 = offset_rois[1] * spatial_scale; + T roi_y1 = offset_rois[2] * spatial_scale; + T roi_x2 = offset_rois[3] * spatial_scale; + T roi_y2 = offset_rois[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, ((T)0.0)); + T roi_height = max(roi_y2 - roi_y1, ((T)0.0)); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T *this_data = + input + (roi_batch_ind * channels + c) * height * width; + T *this_out = output + index; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + if (bin_size == 0) { + *this_out = 0; + continue; + } + + T sum_out = 0; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) + for (int bin_y = start_y; bin_y < end_y; ++bin_y) + sum_out += PrRoIPoolingMatCalculation( + this_data, bin_y, bin_x, bin_y + 1, bin_x + 1, + max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), + min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, + width); + *this_out = sum_out / bin_size; + } +} + +template +__global__ void prroi_pool_backward_cuda_kernel( + const int nthreads, const T *grad_output, const T *rois, T *grad_input, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + rois += n * 5; + + int roi_batch_ind = rois[0]; + T roi_x1 = rois[1] * spatial_scale; + T roi_y1 = rois[2] * spatial_scale; + T roi_x2 = 
rois[3] * spatial_scale; + T roi_y2 = rois[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, (T)0); + T roi_height = max(roi_y2 - roi_y1, (T)0); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T *this_out_grad = grad_output + index; + T *this_data_grad = + grad_input + (roi_batch_ind * channels + c) * height * width; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + + T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) + for (int bin_y = start_y; bin_y < end_y; ++bin_y) + PrRoIPoolingMatDistributeDiff( + this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1, + max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), + min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, + width); + } +} + +template +__global__ void prroi_pool_coor_backward_cuda_kernel( + const int nthreads, const T *output, const T *grad_output, const T *input, + const T *rois, T *grad_rois, const int pooled_height, + const int pooled_width, const T spatial_scale, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + rois += n * 5; + + int roi_batch_ind = rois[0]; + T roi_x1 = rois[1] * spatial_scale; + T roi_y1 = rois[2] * spatial_scale; + T roi_x2 = rois[3] * spatial_scale; + T roi_y2 = rois[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, (T)0); + T roi_height = max(roi_y2 - roi_y1, (T)0); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T output_grad_val = grad_output[index]; + const T *this_input_data = + input + (roi_batch_ind * channels + c) * height * width; + const T output_val = output[index]; + T *this_rois_grad = grad_rois + n * 5; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + + T sum_out = bin_size == T(0) ? 
T(0) : output_grad_val / bin_size; + + // WARNING: to be discussed + if (sum_out == 0) return; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0; + for (int bin_y = start_y; bin_y < end_y; ++bin_y) { + grad_x1_y += PrRoIPoolingSingleCoorIntegral( + max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, + PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1, + height, width), + PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1, + height, width)); + + grad_x2_y += PrRoIPoolingSingleCoorIntegral( + max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, + PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2, + height, width), + PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2, + height, width)); + } + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) { + grad_x_y1 += PrRoIPoolingSingleCoorIntegral( + max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, + PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x), + height, width), + PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1), + height, width)); + + grad_x_y2 += PrRoIPoolingSingleCoorIntegral( + max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, + PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x), + height, width), + PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1), + height, width)); + } + + T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val; + T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val; + T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val; + T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val; + + partial_x1 = partial_x1 / bin_size * spatial_scale; + partial_x2 = partial_x2 / bin_size * spatial_scale; + partial_y1 = partial_y1 / bin_size * spatial_scale; + partial_y2 = partial_y2 / bin_size * spatial_scale; + + // (index, x1, y1, x2, y2) + this_rois_grad[0] = 0; + atomicAdd(this_rois_grad + 1, + (partial_x1 * (1.0f - T(pw) / pooled_width) + + partial_x2 * (1.0f - T(pw + 1) / pooled_width)) * + output_grad_val); + atomicAdd(this_rois_grad + 2, + (partial_y1 * (1.0f - T(ph) / pooled_height) + + partial_y2 * (1.0f - T(ph + 1) / pooled_height)) * + output_grad_val); + atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width + + partial_x1 * T(pw) / pooled_width) * + output_grad_val); + atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height + + partial_y1 * T(ph) / pooled_height) * + output_grad_val); + } +} + +#endif // ROI_POOL_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4383d9e82cce97362f53cf799b8dfa30c7b4cd02 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh @@ -0,0 +1,242 @@ +// Modified from +// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu +#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH +#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS + +/*** Forward ***/ +template +__global__ void riroi_align_rotated_forward_cuda_kernel( + const int 
nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int num_samples, const bool clockwise, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int num_orientations, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int o = (index / pooled_width / pooled_height) % num_orientations; + int c = + (index / pooled_width / pooled_height / num_orientations) % channels; + int n = index / pooled_width / pooled_height / num_orientations / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + + const scalar_t *offset_bottom_data_plus = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + scalar_t val_plus = bilinear_interpolate( + offset_bottom_data_plus, height, width, y, x, index); + output_val += r_var * val + l_var * val_plus; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void riroi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int num_samples, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int num_orientations, + scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int o = (index / pooled_width / pooled_height) % num_orientations; + int c = + (index / pooled_width / pooled_height / num_orientations) % channels; + int n = index / pooled_width / pooled_height / num_orientations / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + scalar_t *offset_bottom_diff_plus = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + int top_offset = + (n * channels * num_orientations + c * num_orientations + o) * + pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use 
roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var); + + atomicAdd(offset_bottom_diff_plus + y_low * width + x_low, + g1 * l_var); + atomicAdd(offset_bottom_diff_plus + y_low * width + x_high, + g2 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_low, + g3 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_high, + g4 * l_var); + + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RiRoIAlignBackward + +#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh index 33571f29674f53674415afe1bb4cc3ea0d8a9865..8274dc50c709630c4ee456efd543aa1265049b41 100644 --- a/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh @@ -20,7 +20,7 @@ template __global__ void roi_align_rotated_forward_cuda_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, - const int sample_num, const bool aligned, const bool clockwise, + const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { @@ -58,11 +58,11 @@ __global__ void roi_align_rotated_forward_cuda_kernel( bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = 
(sample_num > 0) - ? sample_num + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = - (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. @@ -104,7 +104,7 @@ __global__ void roi_align_rotated_forward_cuda_kernel( template __global__ void roi_align_rotated_backward_cuda_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, - const scalar_t spatial_scale, const int sample_num, const bool aligned, + const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { @@ -146,11 +146,11 @@ __global__ void roi_align_rotated_backward_cuda_kernel( const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sample_num > 0) - ? sample_num + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = - (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. diff --git a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh index 3b95dc79080323a0b7d1d6bba06a3a46b04a3f05..fc0aacf1435f8715fae92de535bf01bac07ac39a 100644 --- a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh @@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, // y_idxs, z_idxs) by binary bit - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; - if (pt_idx >= pts_num || box_idx >= boxes_num) return; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (box_idx >= boxes_num) return; - pts += pt_idx * 3; - rois += box_idx * 7; - pts_mask += box_idx * pts_num + pt_idx; + pts += pt_idx * 3; + rois += box_idx * 7; + pts_mask += box_idx * pts_num + pt_idx; - T local_x = 0, local_y = 0; - int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); - pts_mask[0] = -1; - if (cur_in_flag > 0) { - T local_z = pts[2] - rois[2]; - T x_size = rois[3], y_size = rois[4], z_size = rois[5]; + pts_mask[0] = -1; + if (cur_in_flag > 0) { + T local_z = pts[2] - rois[2]; + T x_size = rois[3], y_size = rois[4], z_size = rois[5]; - T x_res = x_size / out_x; - T y_res = y_size / out_y; - T z_res = z_size / out_z; + T x_res = x_size / out_x; + T y_res = y_size / out_y; + T z_res = z_size / out_z; - unsigned int x_idx = int((local_x + x_size / 2) / x_res); - unsigned int y_idx = int((local_y + y_size / 2) / y_res); - unsigned int z_idx = int(local_z / z_res); + unsigned int x_idx = int((local_x + x_size / 2) / x_res); + unsigned int 
y_idx = int((local_y + y_size / 2) / y_res); + unsigned int z_idx = int(local_z / z_res); - x_idx = min(max(x_idx, 0), out_x - 1); - y_idx = min(max(y_idx, 0), out_y - 1); - z_idx = min(max(z_idx, 0), out_z - 1); + x_idx = min(max(x_idx, 0), out_x - 1); + y_idx = min(max(y_idx, 0), out_y - 1); + z_idx = min(max(z_idx, 0), out_z - 1); - unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; + unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; - pts_mask[0] = idx_encoding; + pts_mask[0] = idx_encoding; + } } } @@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, T *pts_idx_of_voxels) { // params pts_mask: (N, npoints) 0 or 1 // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) - - int box_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (box_idx >= boxes_num) return; - - int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; - - for (int k = 0; k < pts_num; k++) { - if (pts_mask[box_idx * pts_num + k] != -1) { - unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; - unsigned int x_idx = (idx_encoding >> 16) & 0xFF; - unsigned int y_idx = (idx_encoding >> 8) & 0xFF; - unsigned int z_idx = idx_encoding & 0xFF; - unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + - y_idx * out_z * max_pts_each_voxel + - z_idx * max_pts_each_voxel; - unsigned int cnt = pts_idx_of_voxels[base_offset]; - if (cnt < max_num_pts) { - pts_idx_of_voxels[base_offset + cnt + 1] = k; - pts_idx_of_voxels[base_offset]++; + CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) { + int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; + + for (int k = 0; k < pts_num; k++) { + if (pts_mask[box_idx * pts_num + k] != -1) { + unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; + unsigned int x_idx = (idx_encoding >> 16) & 0xFF; + unsigned int y_idx = (idx_encoding >> 8) & 0xFF; + unsigned int z_idx = idx_encoding & 0xFF; + unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + + y_idx * out_z * max_pts_each_voxel + + z_idx * max_pts_each_voxel; + unsigned int cnt = pts_idx_of_voxels[base_offset]; + if (cnt < max_num_pts) { + pts_idx_of_voxels[base_offset + cnt + 1] = k; + pts_idx_of_voxels[base_offset]++; + } } } } @@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - pooled_features += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - argmax += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - int argmax_idx = -1; - float max_val = -1e50; - - int total_pts = pts_idx_of_voxels[0]; - - for (int k = 1; k <= total_pts; k++) { - if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { - max_val = 
pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; - argmax_idx = pts_idx_of_voxels[k]; + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int argmax_idx = -1; + float max_val = -1e50; + + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > + max_val) { + max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + argmax_idx = pts_idx_of_voxels[k]; + } } - } - if (argmax_idx != -1) { - pooled_features[0] = max_val; + if (argmax_idx != -1) { + pooled_features[0] = max_val; + } + argmax[0] = argmax_idx; } - argmax[0] = argmax_idx; } template @@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - pooled_features += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - float sum_val = 0; - int total_pts = pts_idx_of_voxels[0]; - - for (int k = 1; k <= total_pts; k++) { - sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; - } + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + float sum_val = 0; + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + } - if (total_pts > 0) { - pooled_features[0] = sum_val / total_pts; + if (total_pts > 0) { + pooled_features[0] = sum_val / total_pts; + } } } @@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= 
channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - argmax += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - grad_out += box_idx * out_x * out_y * out_z * channels + + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; - if (argmax[0] == -1) return; + if (argmax[0] == -1) return; - atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); + atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); + } } template @@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - grad_out += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - int total_pts = pts_idx_of_voxels[0]; - float cur_grad = 1 / fmaxf(float(total_pts), 1.0); - for (int k = 1; k <= total_pts; k++) { - atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, - grad_out[0] * cur_grad); + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int total_pts = pts_idx_of_voxels[0]; + float cur_grad = 1 / fmaxf(float(total_pts), 1.0); + for (int k = 1; k <= total_pts; k++) { + atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, + grad_out[0] * cur_grad); + } } } diff --git a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh index 7597719e69098ca4942c803e9853556daaa3b375..545f6ffa09d4a6cae49f1f1e68c191c1fd54de68 100644 --- a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh @@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, // params boxes3d: (B, M, 7) // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means // background points - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; int bs_idx = 
blockIdx.z; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (box_idx >= boxes_num || bs_idx >= batch_size) return; - if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) { - return; - } - int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; - pts_assign[assign_idx] = 0; + int assign_idx = + bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; + pts_assign[assign_idx] = 0; - int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; - int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; + int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; + int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; - T local_x = 0, local_y = 0; - int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, - local_x, local_y); - pts_assign[assign_idx] = cur_in_flag; + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, + local_x, local_y); + pts_assign[assign_idx] = cur_in_flag; + } } __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, @@ -69,35 +69,32 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, // params pts_assign: (B, N) // params pts_idx: (B, M, 512) // params pooled_empty_flag: (B, M) - - int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (boxes_idx >= boxes_num) { - return; - } - - int bs_idx = blockIdx.y; - - int cnt = 0; - for (int k = 0; k < pts_num; k++) { - if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) { - if (cnt < sampled_pts_num) { - pts_idx[bs_idx * boxes_num * sampled_pts_num + - boxes_idx * sampled_pts_num + cnt] = k; - cnt++; - } else - break; + CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) { + int bs_idx = blockIdx.y; + + int cnt = 0; + for (int k = 0; k < pts_num; k++) { + if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + + boxes_idx]) { + if (cnt < sampled_pts_num) { + pts_idx[bs_idx * boxes_num * sampled_pts_num + + boxes_idx * sampled_pts_num + cnt] = k; + cnt++; + } else + break; + } } - } - if (cnt == 0) { - pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; - } else if (cnt < sampled_pts_num) { - // duplicate same points for sampling - for (int k = cnt; k < sampled_pts_num; k++) { - int duplicate_idx = k % cnt; - int base_offset = - bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; - pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + if (cnt == 0) { + pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; + } else if (cnt < sampled_pts_num) { + // duplicate same points for sampling + for (int k = cnt; k < sampled_pts_num; k++) { + int duplicate_idx = k % cnt; + int base_offset = + bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; + pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + } } } } @@ -112,33 +109,26 @@ __global__ void roipoint_pool3d_forward( // params pts_feature: (B, N, C) // params pooled_features: (B, M, 512, 3+C) // params pooled_empty_flag: (B, M) - - int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; int bs_idx = blockIdx.z; - - if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || - bs_idx >= batch_size) { - return; - } - - if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) { - return; + CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) { + if (box_idx >= boxes_num || bs_idx >= batch_size) return; + if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return; + + int temp_idx = bs_idx * boxes_num * sampled_pts_num + + box_idx * sampled_pts_num 
+ sample_pt_idx; + int src_pt_idx = pts_idx[temp_idx]; + int dst_feature_offset = temp_idx * (3 + feature_in_len); + + for (int j = 0; j < 3; j++) + pooled_features[dst_feature_offset + j] = + xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; + + int src_feature_offset = + bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; + memcpy(pooled_features + dst_feature_offset + 3, + pts_feature + src_feature_offset, feature_in_len * sizeof(T)); } - - int temp_idx = bs_idx * boxes_num * sampled_pts_num + - box_idx * sampled_pts_num + sample_pt_idx; - int src_pt_idx = pts_idx[temp_idx]; - int dst_feature_offset = temp_idx * (3 + feature_in_len); - - for (int j = 0; j < 3; j++) - pooled_features[dst_feature_offset + j] = - xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; - - int src_feature_offset = - bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; - memcpy(pooled_features + dst_feature_offset + 3, - pts_feature + src_feature_offset, feature_in_len * sizeof(T)); } #endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ffcc658ccb1f5e3059c0428159bc2e80fbeee3d4 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh @@ -0,0 +1,129 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu +#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH +#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void rotated_feature_align_forward_kernel( + const int nthreads, const int points, const scalar_t* bottom_data, + const scalar_t* best_bboxes, const scalar_t spatial_scale, + const int channels, const int height, const int width, scalar_t* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + const scalar_t* bbox_offset = + best_bboxes + ((n * height + h) * width + w) * 5; + scalar_t roi_y = bbox_offset[0] * spatial_scale; + scalar_t roi_x = bbox_offset[1] * spatial_scale; + + scalar_t px[5] = {roi_x, 0, 0, 0, 0}; + scalar_t py[5] = {roi_y, 0, 0, 0, 0}; + + if (points > 1) { + scalar_t roi_w = bbox_offset[2] * spatial_scale; + scalar_t roi_h = bbox_offset[3] * spatial_scale; + scalar_t roi_a = bbox_offset[4]; + + scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; + scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); + scalar_t wx = cosa * w_2, wy = sina * w_2; + scalar_t hx = -sina * h_2, hy = cosa * h_2; + + px[1] = roi_x + wx + hx; + py[1] = roi_y + wy + hy; + px[2] = roi_x - wx + hx; + py[2] = roi_y - wy + hy; + px[3] = roi_x - wx - hx; + py[3] = roi_y - wy - hy; + px[4] = roi_x + wx - hx; + py[4] = roi_y + wy - hy; + } + + const scalar_t* offset_bottom_data = + bottom_data + (n * channels + c) * height * width; + + scalar_t output_val = bottom_data[index]; + for (int i = 0; i < points; i++) { + output_val += bilinear_interpolate(offset_bottom_data, height, + width, py[i], px[i], i); + } + top_data[index] = output_val; + } +} + +template +__global__ void rotated_feature_align_backward_kernel( + const int nthreads, const int points, const scalar_t* top_diff, + const 
scalar_t* best_bboxes, const scalar_t spatial_scale, + const int channels, const int height, const int width, + scalar_t* bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + const scalar_t* bbox_offset = + best_bboxes + ((n * height + h) * width + w) * 5; + scalar_t roi_y = bbox_offset[0] * spatial_scale; + scalar_t roi_x = bbox_offset[1] * spatial_scale; + + scalar_t px[5] = {roi_x, 0, 0, 0, 0}; + scalar_t py[5] = {roi_y, 0, 0, 0, 0}; + + if (points > 1) { + scalar_t roi_w = bbox_offset[2] * spatial_scale; + scalar_t roi_h = bbox_offset[3] * spatial_scale; + scalar_t roi_a = bbox_offset[4]; + + scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; + scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); + scalar_t wx = cosa * w_2, wy = sina * w_2; + scalar_t hx = -sina * h_2, hy = cosa * h_2; + + px[1] = roi_x + wx + hx; + py[1] = roi_y + wy + hy; + px[2] = roi_x - wx + hx; + py[2] = roi_y - wy + hy; + px[3] = roi_x - wx - hx; + py[3] = roi_y - wy - hy; + px[4] = roi_x + wx - hx; + py[4] = roi_y + wy - hy; + } + + scalar_t* offset_bottom_diff = + bottom_diff + (n * channels + c) * height * width; + scalar_t value_top_diff = top_diff[index]; + + atomicAdd(bottom_diff + index, value_top_diff); + for (int i = 0; i < points; i++) { + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, py[i], px[i], w1, + w2, w3, w4, x_low, x_high, y_low, + y_high, i); + scalar_t g1 = value_top_diff * w1; + scalar_t g2 = value_top_diff * w2; + scalar_t g3 = value_top_diff * w3; + scalar_t g4 = value_top_diff * w4; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } + } + } +} +#endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/spconv/indice.cuh b/mmcv/ops/csrc/common/cuda/spconv/indice.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5ef0009a10f8effeb447e398cff5103b400056de --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/spconv/indice.cuh @@ -0,0 +1,236 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef INDICE_CU_H_ +#define INDICE_CU_H_ +#include +#include + +#include + +template +__global__ void prepareIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void prepareDeConvIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void assignGridAndIndiceOutKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numAct, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, int batchSize) { + Index index; + auto indicesOutPtr = indicesOut.data(); + for (int ix : tv::KernelLoopX(numAct)) { + index = indicePairUnique[ix]; + gridsOut[index] = ix; + index = 
tv::rowArrayIdxInv( + index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); + indicesOut[ix * (NDim + 1)] = index % batchSize; + } +} + +template +__global__ void assignIndicePairsKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numActIn, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape) { + Index index; + int kernelVolume = indicePairs.dim(0); + for (int ix : tv::KernelLoopX(numActIn)) { + for (int i = 0; i < kernelVolume; ++i) { + index = indicePairs(i, 1, ix); + if (index > -1) { + indicePairs(i, 1, ix) = gridsOut[index]; + } + } + } +} + +template +__global__ void prepareSubMGridKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, + outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + gridsOut[index] = ix; + } +} + +template +__global__ void getSubMIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (int i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + if (gridsOut[index] > -1) { + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 1, oldNum) = gridsOut[index]; + indicePairs(offset, 0, oldNum) = ix; + } + } + } +} + +template +__global__ void resetGridKernel(const Index *indicePairUnique, + tv::TensorView gridsOut, + int numAct) { + for (int ix : tv::KernelLoopX(numAct)) { + gridsOut[indicePairUnique[ix]] = -1; + } +} + +template +__global__ void resetGridSubMKernel( + const Index *indices, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape, int numAct) { + int outSpatialShapeReg[NDim]; + for (int i = 0; i < NDim; ++i) { + outSpatialShapeReg[i] = outSpatialShape[i]; + } + Index spatialVolume = 1; + auto indsPtr = indices; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index; + for (int ix : tv::KernelLoopX(numAct)) { + indsPtr = indices + ix * (NDim + 1); + index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); + gridsOut[index + spatialVolume * indsPtr[0]] = -1; + } +} + +#endif diff --git a/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh b/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e3ec68b937b0507e3a119d63a49ad79e8f48eec7 --- /dev/null +++ 
b/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh @@ -0,0 +1,160 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef REORDERING_CU_H_ +#define REORDERING_CU_H_ +#include + +template +__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, + int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) + buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = + features[inds[ilp] + iy]; + } + } + } +} + +template +__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) + reinterpret_cast( + buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = + reinterpret_cast(features)[inds[ilp] + iy]; + } + } + } +} + +template +__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, + int numPlanes) { + int ILPStrideY[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; + features += blockIdx.x * NumTLP; + buffer += blockIdx.x * NumTLP; + + for (int iy : tv::KernelLoopY(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + reinterpret_cast( + buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] = + reinterpret_cast( + features)[indices[iy + ILPStrideY[ilp]] * numPlanes + + threadIdx.x]; + } + } +} + +template +__global__ void scatterAddGenericKernel(scalar_t *outFeatures, + const scalar_t *buffer, + const Index *indices, int size, + int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) { + outFeatures[inds[ilp] + iy] += + buffer[(ix 
+ ILPStrideX[ilp]) * numPlanes + iy]; + } + } + } + } +} + +template +__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures, + const scalar_t *buffer, + const Index *indices, int size, + int numPlanes) { + int ILPStrideY[NumILP]; + constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t); +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; + outFeatures += blockIdx.x * NumTLP; + buffer += blockIdx.x * NumTLP; + scalar_t buf[vecloadFactor]; + scalar_t buf2[vecloadFactor]; + Index idx; + for (int iy : tv::KernelLoopY(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x; + reinterpret_cast(buf)[0] = + reinterpret_cast(outFeatures)[idx]; + reinterpret_cast(buf2)[0] = reinterpret_cast( + buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x]; +#pragma unroll + for (int i = 0; i < vecloadFactor; i++) { + buf[i] += buf2[i]; + } + reinterpret_cast(outFeatures)[idx] = + reinterpret_cast(buf)[0]; + } + } +} + +#endif diff --git a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh index 43aecb3a0d3585491584c54a6881645573baafbf..971b496e589d2210131351305cbaf0ed1a027cb1 100644 --- a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh @@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel( int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b || c_idx >= c) return; - if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; + weight += bs_idx * n * 3 + pt_idx * 3; + points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + out += bs_idx * c * n + c_idx * n; - weight += bs_idx * n * 3 + pt_idx * 3; - points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - out += bs_idx * c * n + c_idx * n; - - out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + - weight[2] * points[idx[2]]; + out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + + weight[2] * points[idx[2]]; + } } template @@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel( int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; - - grad_out += bs_idx * c * n + c_idx * n + pt_idx; - weight += bs_idx * n * 3 + pt_idx * 3; - grad_points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - - atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); - atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); - atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b || c_idx >= c) return; + + grad_out += bs_idx * c * n + c_idx * n + pt_idx; + weight += bs_idx * n * 3 + pt_idx * 3; + grad_points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + + atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); + atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); + atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); + } } #endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh index 
824da4c5c02fbaf3b87730df910e0763269cd832..15434121b94033afb2fcb9945a83db15b92262d4 100644 --- a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh @@ -19,48 +19,49 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m, // idx: (B, N, 3) int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= n) return; + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b) return; - unknown += bs_idx * n * 3 + pt_idx * 3; - known += bs_idx * m * 3; - dist2 += bs_idx * n * 3 + pt_idx * 3; - idx += bs_idx * n * 3 + pt_idx * 3; + unknown += bs_idx * n * 3 + pt_idx * 3; + known += bs_idx * m * 3; + dist2 += bs_idx * n * 3 + pt_idx * 3; + idx += bs_idx * n * 3 + pt_idx * 3; - T ux = unknown[0]; - T uy = unknown[1]; - T uz = unknown[2]; + T ux = unknown[0]; + T uy = unknown[1]; + T uz = unknown[2]; - double best1 = 1e40, best2 = 1e40, best3 = 1e40; - int besti1 = 0, besti2 = 0, besti3 = 0; - for (int k = 0; k < m; ++k) { - T x = known[k * 3 + 0]; - T y = known[k * 3 + 1]; - T z = known[k * 3 + 2]; - T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); - if (d < best1) { - best3 = best2; - besti3 = besti2; - best2 = best1; - besti2 = besti1; - best1 = d; - besti1 = k; - } else if (d < best2) { - best3 = best2; - besti3 = besti2; - best2 = d; - besti2 = k; - } else if (d < best3) { - best3 = d; - besti3 = k; + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + T x = known[k * 3 + 0]; + T y = known[k * 3 + 1]; + T z = known[k * 3 + 2]; + T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } } + dist2[0] = best1; + dist2[1] = best2; + dist2[2] = best3; + idx[0] = besti1; + idx[1] = besti2; + idx[2] = besti3; } - dist2[0] = best1; - dist2[1] = best2; - dist2[2] = best3; - idx[0] = besti1; - idx[1] = besti2; - idx[2] = besti3; } #endif // THREE_NN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh index 62e118b35294b864b5374394c8ae84070b8c5afb..021b488d8d716c9e8132173bf04491d42b7b6fa2 100644 --- a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh @@ -23,20 +23,20 @@ __global__ void dynamic_voxelize_kernel( // To save some computation auto points_offset = points + index * num_features; auto coors_offset = coors + index * NDim; - int c_x = floor((points_offset[0] - coors_x_min) / voxel_x); + int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x); if (c_x < 0 || c_x >= grid_x) { coors_offset[0] = -1; continue; } - int c_y = floor((points_offset[1] - coors_y_min) / voxel_y); + int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y); if (c_y < 0 || c_y >= grid_y) { coors_offset[0] = -1; coors_offset[1] = -1; continue; } - int c_z = floor((points_offset[2] - coors_z_min) / voxel_z); + int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z); if (c_z < 0 || c_z >= grid_z) { coors_offset[0] = -1; coors_offset[1] = -1; @@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor, CUDA_1D_KERNEL_LOOP(index, num_points) { auto coor_offset = coor + index * 
NDim; // skip invalid points - if ((index >= num_points) || (coor_offset[0] == -1)) return; + if (coor_offset[0] == -1) continue; int num = 0; int coor_x = coor_offset[0]; @@ -122,7 +122,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor, point_to_pointidx[index] = i; } else if (num >= max_points) { // out of boundary - return; + break; } } } @@ -166,4 +166,51 @@ __global__ void determin_voxel_num( } } +__global__ void nondeterministic_get_assign_pos( + const int nthreads, const int32_t* coors_map, int32_t* pts_id, + int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + int coors_idx = coors_map[thread_idx]; + if (coors_idx > -1) { + int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1); + pts_id[thread_idx] = coors_pts_pos; + if (coors_pts_pos == 0) { + coors_order[coors_idx] = atomicAdd(coors_count, 1); + } + } + } +} + +template +__global__ void nondeterministic_assign_point_voxel( + const int nthreads, const T* points, const int32_t* coors_map, + const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count, + const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count, + const int max_voxels, const int max_points, const int num_features, + const int NDim) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + int coors_idx = coors_map[thread_idx]; + int coors_pts_pos = pts_id[thread_idx]; + if (coors_idx > -1 && coors_pts_pos < max_points) { + int coors_pos = coors_order[coors_idx]; + if (coors_pos < max_voxels) { + auto voxels_offset = + voxels + (coors_pos * max_points + coors_pts_pos) * num_features; + auto points_offset = points + thread_idx * num_features; + for (int k = 0; k < num_features; k++) { + voxels_offset[k] = points_offset[k]; + } + if (coors_pts_pos == 0) { + pts_count[coors_pos] = min(reduce_count[coors_idx], max_points); + auto coors_offset = coors + coors_pos * NDim; + auto coors_in_offset = coors_in + coors_idx * NDim; + for (int k = 0; k < NDim; k++) { + coors_offset[k] = coors_in_offset[k]; + } + } + } + } + } +} + #endif // VOXELIZATION_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..58e695a0153e59ca9d0c66040962c2e12d6226b6 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu @@ -0,0 +1,322 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include + +#include "common_mlu_helper.hpp" + +#define COORD_NUM 4 + +__nram__ char nmem_buf[MAX_NRAM_SIZE]; + +template +__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1, + void *nram_addition, const int32_t deal_num) { + __bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num); + __bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num); +} + +template <> +__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, + void *nram_src1, void *nram_addition, + const int32_t deal_num) { + __bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num); + __bang_active_reciphp((float *)nram_addition, (float *)nram_addition, + deal_num); + __bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num); + __bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num); +} + +template +__mlu_func__ void bboxOverlapsWorkflow( + T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1, + T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right, + T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious, + const int32_t offset, const int32_t mode, const int32_t batches_stride, + const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) { + int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim; + int32_t batch_start = taskId * task_batch_stride; + int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1 + ? task_batch_stride + : num_bbox1 - batch_start; + batch_per_task = batch_per_task > 0 ? batch_per_task : (0); + + if (aligned) { + int32_t num_loop_cpy = batch_per_task / batches_stride; + int32_t num_rem_cpy_batches = batch_per_task % batches_stride; + num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; + for (int32_t i = 0; i < num_loop_cpy; i++) { + int32_t index = batch_start + i * batches_stride; + int32_t handle_batches = index + batches_stride > num_bbox1 + ? 
num_rem_cpy_batches + : batches_stride; + int32_t b1 = index; + int32_t b2 = index; + + int32_t base1 = b1 * COORD_NUM; + __memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + + int32_t base2 = b2 * COORD_NUM; + __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + // get the width and height + __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); + __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); + __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); + __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); + + // right - left + offset ---> left + __bang_sub(vec_left, vec_right, vec_left, batches_stride); + __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + + // bottom - top + offset ---> right + __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); + __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + + // zero vector ---> bottom + __nramset(vec_bottom, batches_stride, 0.f); + + // width --> vec_left + __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); + T *width = vec_left; + // height --> vec_right + __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); + T *height = vec_right; + + // get the b1_area + // (b1_x2 - b1_x1 + offset) ---> vec_top + __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); + __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + + // (b1_y2 - b1_y1 + offset) ---> vec_bottom + __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); + __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + + // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) + // ---> vec_top; + __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); + T *b1_area = vec_top; + + // get the b2_area + // (b2_x2 - b2_x1 + offset) ---> b2_x1 + __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); + __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + + // (b2_y2 - b2_y1 + offset) ---> b2_y1 + __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); + __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + + // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) + // ---> b2_x1; + __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); + T *b2_area = vec_b2_x1; + + // inter_s = width * height + __bang_mul(height, width, height, batches_stride); + T *inter_s = height; + + // offset vector ---> vec_b2_y1 + __nramset(vec_b2_y1, batches_stride, T(offset)); + T *vec_offset = vec_b2_y1; + + if (mode == 0) { + __bang_add(b1_area, b1_area, b2_area, batches_stride); + __bang_sub(b1_area, b1_area, inter_s, batches_stride); + __bang_maxequal(b1_area, 
vec_offset, b1_area, batches_stride); + } else { + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } + T *base_s = b1_area; + + // ious = inter_s / base_s + computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); + __memcpy((T *)ious + index, width, handle_batches * sizeof(T), + NRAM2GDRAM); + } + } else { + int32_t num_loop_cpy = num_bbox2 / batches_stride; + int32_t num_rem_cpy_batches = num_bbox2 % batches_stride; + num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; + for (int32_t i = 0; i < batch_per_task; i++) { + int32_t index1 = batch_start + i; + int32_t b1 = index1; + int32_t base1 = b1 * COORD_NUM; + + // set bbox1 and bbox2 to nram + __nramset(vec_b1_x1, batches_stride, bbox1[base1]); + __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]); + __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]); + __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]); + + for (int32_t j = 0; j < num_loop_cpy; j++) { + int32_t index2 = j * batches_stride; + int32_t handle_batches = index2 + batches_stride > num_bbox2 + ? num_rem_cpy_batches + : batches_stride; + int32_t b2 = index2; + int32_t base2 = b2 * COORD_NUM; + + // copy bbox2 to nram + __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + + // get the width and height + __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); + __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); + __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); + __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); + + // right - left + offset ---> left + __bang_sub(vec_left, vec_right, vec_left, batches_stride); + __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + // bottom - top + offset ---> right + __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); + __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + + // zero vector ---> bottom + __nramset(vec_bottom, batches_stride, (T)0); + + // width --> vec_left + __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); + T *width = vec_left; + // height --> vec_right + __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); + T *height = vec_right; + + // get the b1_area + // (b1_x2 - b1_x1 + offset) ---> vec_top + __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); + __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + // (b1_y2 - b1_y1 + offset) ---> vec_bottom + __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); + __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) + // ---> vec_top; + __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); + T *b1_area = vec_top; + + // get the b2_area + // (b2_x2 - b2_x1 + offset) ---> b2_x1 + __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); + __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + // (b2_y2 - b2_y1 + offset) ---> b2_y1 + __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); + __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + // 
b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) + // ---> b2_x1; + __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); + T *b2_area = vec_b2_x1; + + // inter_s = width * height + __bang_mul(height, width, height, batches_stride); + T *inter_s = height; + + // offset vector ---> vec_b2_y1 + __nramset(vec_b2_y1, batches_stride, T(offset)); + T *vec_offset = vec_b2_y1; + + if (mode == 0) { + __bang_add(b1_area, b1_area, b2_area, batches_stride); + __bang_sub(b1_area, b1_area, inter_s, batches_stride); + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } else { + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } + T *base_s = b1_area; + + // ious = inter_s / base_s + computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); + int32_t gdram_offset = index1 * num_bbox2 + index2; + __memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T), + NRAM2GDRAM); + } + } + } +} + +template +__mlu_global__ void MLUUnion1KernelBBoxOverlaps( + const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1, + const int32_t num_bbox2, const int32_t mode, const bool aligned, + const int32_t offset) { + /* + * NRAM partition + * |-------------------------------------------------------------| + * | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 | + * |-------------------------------------------------------------| + * | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 | + * |-------------------------------------------------------------| + * | vec_left | vec_right | vec_top | vec_bottom | + * |-------------------------------------------------------------| + * + */ + const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE); + const int32_t split_nram_num = 12; + const int32_t nram_stride = + align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE; + + void *vec_b1_x1 = nmem_buf; + void *vec_b1_y1 = nmem_buf + nram_stride; + void *vec_b1_x2 = nmem_buf + 2 * nram_stride; + void *vec_b1_y2 = nmem_buf + 3 * nram_stride; + + void *vec_b2_x1 = nmem_buf + 4 * nram_stride; + void *vec_b2_y1 = nmem_buf + 5 * nram_stride; + void *vec_b2_x2 = nmem_buf + 6 * nram_stride; + void *vec_b2_y2 = nmem_buf + 7 * nram_stride; + + void *vec_left = nmem_buf + 8 * nram_stride; + void *vec_right = nmem_buf + 9 * nram_stride; + void *vec_top = nmem_buf + 10 * nram_stride; + void *vec_bottom = nmem_buf + 11 * nram_stride; + + const int32_t vec_length = nram_stride / sizeof(T); + bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2, + (T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1, + (T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left, + (T *)vec_right, (T *)vec_top, (T *)vec_bottom, + (T *)bbox1, (T *)bbox2, (T *)ious, offset, mode, + vec_length, num_bbox1, num_bbox2, aligned); +} + +void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *bbox1, const void *bbox2, void *ious, + const int32_t num_bbox1, const int32_t num_bbox2, + const int32_t mode, const bool aligned, + const int32_t offset) { + if (d_type == CNRT_FLOAT16) { + MLUUnion1KernelBBoxOverlaps<<>>( + bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); + } else { + MLUUnion1KernelBBoxOverlaps<<>>( + bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); + } +} diff --git a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..669a9d78e0c48b6761e05ca933cb4689bbcbc272 
--- /dev/null +++ b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp @@ -0,0 +1,190 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef COMMON_MLU_HELPER_HPP_ +#define COMMON_MLU_HELPER_HPP_ + +#define NFU_ALIGN_SIZE 128 // Byte +#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc + +#ifdef __BANG_ARCH__ +#define MAX_NRAM_SIZE \ + (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#define MAX_SRAM_SIZE \ + (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#else +#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value +#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value +#endif + +#ifndef PAD_UP +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) +#endif + +#ifndef PAD_DOWN +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) +#endif + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +/*! + * @brief Converts int32 to float32 data type. + * + * @param[out] dst + * Pointer to NRAM that stores int32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
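+ * @note On MLU architectures below 300 there is no dedicated int-to-float
+ *       instruction, so the conversion below is emulated with bitwise
+ *       operations: the low 23 bits of the magnitude are packed into the
+ *       mantissa of 2^23 (0x4b000000) and 8388608.0 is subtracted afterwards,
+ *       e.g. (5 | 0x4b000000) reads as 8388613.0f, and 8388613.0f - 8388608.0f
+ *       gives 5.0f; the sign bit is extracted first and restored at the end.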
+ */ +__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_int2float((float *)dst, (int32_t *)src, src_count, 0); +#else + // get sign bit + const float move_23bit = 8388608.0; + // 0x80000000 = 1,000000000,0000000000000000000000000000 + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + // get 1 or 0 from sign bit + // judg is Odd + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * sizeof(float), + NFU_ALIGN_SIZE); + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000001); + __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // minus xor, positive num invariant + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); + // convert int32 to float32 + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff); + __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x4b000000); + __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __bang_sub_const(dst, dst, move_23bit, src_count); + // add one + __bang_add(dst, dst, dst_addition, src_count); + // set sign for float32 + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * 4, 128); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); +#endif // __BANG_ARCH__ >= 300 +} + +/*! + * @brief Converts float32 to int32 data type with to_zero round mode. + * + * @param[out] dst + * Pointer to NRAM that stores float32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores int32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
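+ * @note On MLU architectures below 300 this conversion is likewise emulated:
+ *       the absolute value is added to 8388608.0 with truncation so that the
+ *       integer part lands in the low 23 mantissa bits (e.g. fabs(5.75f) +
+ *       8388608.0f truncates to 8388613.0f, whose low 23 bits encode 5),
+ *       those bits are masked out with 0x007fffff, and negative inputs are
+ *       converted back to two's complement form at the end.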
+ */ +__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0); +#else + // sign ===> src_addition + // dst=-1.0 : when src[i] is a negative number + // dst=+1.0 : when src[i] is a positive number + const int floatDchar = sizeof(float) / sizeof(char); + __bang_active_sign((float *)dst, src, src_count); + // dst_addition = abs(src) + __bang_mul(dst_addition, src, (float *)dst, src_count); + // if dst_addition < 1.0 , then src_addition + 1, to fix add error. + __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f); + __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xbf800000); + // set negative flag -1.0 = 0xbf80000 + __bang_cycle_eq( + (float *)dst, (float *)dst, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] + __bang_active_abs(dst_addition, src, src_count); + __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f); + // mask shift move 23 + __bang_cycle_add_tz( + dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit + // two`s complement for negatibe + // dst=1.0 , when src <-1.0 + // dst=0.0 , when src >=-1.0 + __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); + // to fix max value + // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, + // means max value. + __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count); + __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, + src_count * floatDchar); + // get low 23bit + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + (unsigned)0x007fffff); + // mask low 23bit is 1 + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * floatDchar, + NFU_ALIGN_SIZE / sizeof(char)); + // set 9 high bit ===> dst + // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 + // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 + __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); + __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // src or dst_addition + __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, + src_count * floatDchar); + __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, + src_count * floatDchar); +#endif // __BANG_ARCH__ >= 300 +} + +#endif // COMMON_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7624379b68d6df41aae0253df26b9add61c7a76e --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu @@ -0,0 +1,888 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include + +#include "common_mlu_helper.hpp" + +#define PING 0 +#define PONG 1 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size, + const int32_t dst_stride = 0, + const int32_t src_stride = 0, + const int32_t count = 1) { + if (dst_stride == src_stride) { + __memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM); + } else { + __memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride, + src_stride, count - 1); + } +} + +template +__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t, + const int32_t c, const int32_t has_weight, + const int32_t partition_nc) { + if (has_weight && partition_nc && t >= 0 && t < c) { + __memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM); + } +} + +template +__mlu_func__ void storeOutput(T *dram_output, char *nram_output, + const int32_t size, const int32_t dst_stride = 0, + const int32_t src_stride = 0, + const int32_t count = 1) { + if (dst_stride == src_stride) { + __memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM); + } else { + __memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride, + src_stride, count - 1); + } +} + +template +__mlu_func__ void compute(T *input, const int32_t *target, const T *weight, + const int32_t has_weight, const int32_t partition_nc, + const int32_t deal_num, const int32_t n_seg, + const int32_t c, const int32_t c_seg, + const int32_t c_start_index, const float alpha, + const float gamma, T *compute_a, T *compute_b, + T *output) { + // set params + const int32_t c_num = + has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg; + const int32_t c_end_index = c_start_index + c_seg; + const int32_t half_epsilon = 0x0400; + const T epsilon_f = + sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon); + + // 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i + // = (1 - alpha) * p ^ gamma if t != c_i + __nramset((T *)output, deal_num, (T)(1 - alpha)); + __bang_active_sigmoid((T *)compute_b, (T *)input, deal_num); + for (int32_t i = 0; i < n_seg; ++i) { + const int32_t t = *((uint32_t *)target + i); + if (t >= c_start_index && t < c_end_index) { + const uint32_t index = i * c_num + t - c_start_index; + *((T *)input + index) = -1.0 * (*((T *)input + index)); + *((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f; + *((T *)output + index) = alpha; + } + } + if (sizeof(T) == sizeof(half)) { + __bang_half2float((float *)compute_a, (half *)compute_b, deal_num); + __bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num); + __bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma, + deal_num); + __bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num); + __bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num); + } else { + __bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num); + __bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num); + __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); + } + __bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num); + + // 1. 
max = max(0, -x) if t == c_i + // = max(0, x) if t != c_i + __nramset((T *)compute_b, deal_num, (T)0); + __bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num); + + // 2. -log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i + // = ln(e^(-max)+ e^(-max+x) + max if t != c_i + __bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num); + __bang_add((T *)input, (T *)compute_a, (T *)input, deal_num); + + __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); + __bang_active_exphp((T *)input, (T *)input, deal_num); + __bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num); + __bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num); + __bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num); + + // 3. output = alpha_t * p_t^r * [-log(p_t)] + __bang_mul((T *)output, (T *)output, (T *)input, deal_num); + + // 4. with weight + if (has_weight) { + for (int32_t i = 0; i < n_seg; ++i) { + int32_t t = *((int32_t *)target + i); + if (t >= 0 && t < c) { + t = partition_nc ? 0 : t; + __bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num, + *((T *)weight + t), c_num); + } + } + } +} + +template +__mlu_func__ void startPipeline( + const T *input, const int32_t *target, const T *weight, + char *nram_compute_a, char *nram_compute_b, char *nram_input, + char *nram_target, char *nram_weight, char *nram_output, + const int32_t has_weight, const int32_t partition_nc, + const int32_t pingpong_offset, const int32_t pingpong_weight_offset, + const int32_t c_offset_num, const int32_t n, const int32_t n_seg, + const int32_t c, const int32_t c_seg, const float alpha, const float gamma, + T *output) { + // with offset + input = (T *)((char *)input + c_offset_num * sizeof(T)); + output = (T *)((char *)output + c_offset_num * sizeof(T)); + + const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t c_num = has_weight ? c_seg_align_num : c_seg; + const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t load_size = c_seg * sizeof(T); + const int32_t dram_stride = c * sizeof(T); + const int32_t nram_stride = c_num * sizeof(T); + + if (has_weight && !partition_nc) { + loadInput(nram_weight, (T *)weight, load_size, nram_stride, dram_stride, + 1); + __asm__ volatile("sync;\n\t"); + } + const int32_t repeat = n / n_seg; + const int32_t remain = n % n_seg; + + /* + * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. + * The allocated memory space of NRAM is divided into two parts: + * PING and Pong. In a single time slice, PING is used to process + * IO stream and PONG is used for computation. Both of them are + * processed synchronously until finished. 
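+ *
+ * Ignoring the tail handling for `remain`, the control flow below realizes
+ * the following schedule (slice j always lives in the (j % 2) half of the
+ * double buffer):
+ *
+ *   load(0);                                 sync;
+ *   compute(0); load(1);                     sync;
+ *   for (i = 0; i < repeat - 2; ++i) {
+ *     store(i); load(i + 2); compute(i + 1); sync;
+ *   }
+ *   store(repeat - 2); compute(repeat - 1);  sync;
+ *   store(repeat - 1);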
+ * + * diagram of PINGPONG: + * |------|-----------------------------------------------------------------| + * | | space | + * |------|-----------------------------------------------------------------| + * | time | Ping | Pong | Ping | Pong | Ping | Pong | + * |------|-----------------------------------------------------------------| + * | 0 | L0 | | | | | | + * | 1 | C0 | L1 | | | | | + * | 2 | S0 | C1 | L2 | | | | + * | 3 | | S1 | C2 | L3 | | | + * | 4 | | | S2 | C3 | L4 | | + * | 5 | | | | S3 | C4 | L5 | + * | 6 | | | | | S4 | C5 | + * | 7 | | | | | | S5 | + * |------|-----------------------------------------------------------------| + */ + + // diagram of PINGPONG: L0 + if (repeat > 0) { + loadInput(nram_input, (T *)input, load_size, nram_stride, dram_stride, + n_seg); + loadInput(nram_target, (int32_t *)target, n_seg * sizeof(int32_t)); + loadWeight(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight, + partition_nc); + __asm__ volatile("sync;\n\t"); + } + + // diagram of PINGPONG: C0 and L1 + if (repeat > 1) { + compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight, + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)nram_output); + loadInput((char *)nram_input + pingpong_offset, (T *)input + c * n_seg, + load_size, nram_stride, dram_stride, n_seg); + loadInput((char *)nram_target + pingpong_offset, + (int32_t *)target + n_seg, n_seg * sizeof(int32_t)); + loadWeight((char *)nram_weight + pingpong_weight_offset, (T *)weight, + *((int32_t *)target + n_seg), c, has_weight, partition_nc); + __asm__ volatile("sync;\n\t"); + } + + for (int32_t i = 0; i < repeat - 2; ++i) { + storeOutput((T *)output + i * c * n_seg, + nram_output + (i % 2) * pingpong_offset, load_size, + dram_stride, nram_stride, n_seg); + loadInput((char *)nram_input + (i % 2) * pingpong_offset, + (T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride, + dram_stride, n_seg); + loadInput((char *)nram_target + (i % 2) * pingpong_offset, + (int32_t *)target + (i + 2) * n_seg, + n_seg * sizeof(int32_t)); + loadWeight((char *)nram_weight + (i % 2) * pingpong_weight_offset, + (T *)weight, *((int32_t *)target + (i + 2) * n_seg), c, + has_weight, partition_nc); + compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset), + (int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * ((i + 1) % 2) * pingpong_weight_offset), + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + ((i + 1) % 2) * pingpong_offset)); + __asm__ volatile("sync;\n\t"); + } + + if (repeat > 1) { + storeOutput((T *)output + (repeat - 2) * c * n_seg, + (char *)nram_output + (repeat % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, n_seg); + } + + if (remain > 0) { + loadInput((char *)nram_input + (repeat % 2) * pingpong_offset, + (T *)input + repeat * c * n_seg, load_size, nram_stride, + dram_stride, remain); + loadInput((char *)nram_target + (repeat % 2) * pingpong_offset, + (int32_t *)target + repeat * n_seg, + remain * sizeof(int32_t)); + loadWeight((char *)nram_weight + (repeat % 2) * pingpong_weight_offset, + (T *)weight, *((int32_t *)target + repeat * n_seg), c, + has_weight, partition_nc); + } + + if (repeat > 0) { + compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset), + (int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * ((repeat - 1) % 
2) * pingpong_weight_offset), + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset)); + } + __asm__ volatile("sync;\n\t"); + + if (repeat > 0) { + storeOutput((T *)output + (repeat - 1) * c * n_seg, + (char *)nram_output + ((repeat - 1) % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, n_seg); + } + + if (remain > 0) { + int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T)); + compute((T *)(nram_input + (repeat % 2) * pingpong_offset), + (int32_t *)(nram_target + (repeat % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * (repeat % 2) * pingpong_weight_offset), + has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + (repeat % 2) * pingpong_offset)); + __asm__ volatile("sync;\n\t"); + + storeOutput((T *)output + repeat * c * n_seg, + (char *)nram_output + (repeat % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, remain); + } + __asm__ volatile("sync;\n\t"); +} + +template +__mlu_func__ void focalLossSigmoidForwardBlock( + const T *input, const int32_t *target, const T *weight, const int32_t n, + const int32_t c, const float alpha, const float gamma, T *output) { + /* + * NRAM partition + * |-----------------------------------------------------------------------| + * | weight | + * |------------------------------- COMPUTE -------------------------------| + * | | | + * | computeA | computeB | + * | | | + * |------------- PING ------------------------------- PONG ---------------| + * | | | + * | input | input | + * | | | + * |-----------------------------------|-----------------------------------| + * | | | + * | output | output | + * | | | + * |-----------------------------------|-----------------------------------| + * | target | target | + * |-----------------------------------|-----------------------------------| + * + * split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output), + * PONG(input,output). + * split_target_num is 2: PING(target), PONG(target). + * weight is not NULL: + * The nram-size of weight is equal to c_align_size when partition input-N. + * The nram-size of weight is equal to NFU_ALIGN_SIZE when partition + * input-NC. 
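+ * threshold_c computed below is the largest channel count whose pipeline
+ * buffers still fit in NRAM: when c <= threshold_c the kernel only
+ * partitions the input over N, otherwise it additionally splits the channel
+ * dimension (the partition input-NC branch).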
+ */ + + // calculate threshold of c + const int32_t split_pipeline_num = 6; + const int32_t split_target_num = 2; + const int32_t has_weight = weight != NULL; + const int32_t threshold_c = + PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) / + (split_pipeline_num + has_weight), + NFU_ALIGN_SIZE) / + sizeof(T); + const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t c_align_size = c_align * sizeof(T); + + if (c <= threshold_c) { + // partition inputN + int32_t c_num = c; + int32_t reservered_align_size = + (split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE; + int32_t weight_size = 0; + if (has_weight) { + c_num = c_align; + reservered_align_size = split_target_num * NFU_ALIGN_SIZE; + weight_size = c_align_size; + } + + const int32_t remain_size = + MAX_NRAM_SIZE - weight_size - reservered_align_size; + const int32_t n_seg = + remain_size / (split_pipeline_num * c_num * sizeof(T) + + split_target_num * sizeof(int32_t)); + const int32_t split_pipeline_size = + PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE); + const int32_t compute_size = 2 * split_pipeline_size; + const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; + + char *nram_weight = (char *)nram_buffer; + char *nram_compute_a = nram_weight + has_weight * c_align_size; + char *nram_compute_b = nram_compute_a + split_pipeline_size; + char *nram_input = nram_compute_b + split_pipeline_size; + char *nram_output = nram_input + split_pipeline_size; + char *nram_target = nram_output + split_pipeline_size; + + startPipeline(input, target, weight, nram_compute_a, nram_compute_b, + nram_input, nram_target, nram_weight, nram_output, + has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c, + alpha, gamma, output); + } else { + // partition inputNC + const int32_t weight_size = has_weight * NFU_ALIGN_SIZE; + const int32_t remain_size = MAX_NRAM_SIZE - weight_size; + const int32_t split_pipeline_size = PAD_DOWN( + (remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num, + NFU_ALIGN_SIZE); + const int32_t c_seg = split_pipeline_size / sizeof(T); + const int32_t n_seg = 1; + const int32_t compute_size = 2 * split_pipeline_size; + const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; + const int32_t pingpong_weight_offset = weight_size / 2; + + char *nram_weight = (char *)nram_buffer; + char *nram_compute_a = nram_weight + weight_size; + char *nram_compute_b = nram_compute_a + split_pipeline_size; + char *nram_input = nram_compute_b + split_pipeline_size; + char *nram_output = nram_input + split_pipeline_size; + char *nram_target = nram_output + split_pipeline_size; + + const int32_t loop_num = (c + c_seg - 1) / c_seg; + const int32_t partition_nc = 1; + for (int32_t i = 0; i < loop_num; ++i) { + const int32_t c_index = i * c_seg; + const int32_t c_seg_curr = i == (loop_num - 1) ? 
c - c_index : c_seg; + startPipeline(input, target, weight, nram_compute_a, nram_compute_b, + nram_input, nram_target, nram_weight, nram_output, + has_weight, partition_nc, pingpong_offset, + pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr, + alpha, gamma, output); + } + } +} + +template +__mlu_global__ void MLUUnion1KernelFocalLossSigmoidForward( + const void *input, const void *target, const void *weight, const int32_t N, + const int32_t C, const float alpha, const float gamma, void *output) { + const int32_t n_seg = N / taskDim + (taskId == taskDim - 1) * (N % taskDim); + const T *input_offset = (T *)input + N / taskDim * taskId * C; + const int32_t *target_offset = (int32_t *)target + N / taskDim * taskId; + T *output_offset = (T *)output + N / taskDim * taskId * C; + + focalLossSigmoidForwardBlock((T *)input_offset, (int32_t *)target_offset, + (T *)weight, n_seg, C, alpha, gamma, + (T *)output_offset); +} +} // namespace forward + +namespace backward { +template +__mlu_func__ void loadInput(char *nram_input, char *nram_target, + const T *gdram_input, const int32_t *gdram_target, + const int32_t deal_n, const int32_t total_c, + const bool pingping_flag, const bool has_weight, + const int32_t nram_offset, + const int32_t gdram_offset) { + if (pingping_flag == PONG) { + nram_input += nram_offset; + nram_target += nram_offset; + } + + __memcpy_async(nram_target, gdram_target + gdram_offset / total_c, + deal_n * sizeof(int32_t), GDRAM2NRAM); + + char *nram_input_load = nram_input; + int32_t compute_align_size = 2 * NFU_ALIGN_SIZE; + if (has_weight) { + if (sizeof(T) == sizeof(half)) { + int32_t compute_align_num = compute_align_size / sizeof(float); + int32_t align_c = PAD_UP(total_c, compute_align_num); + int32_t compute_size = deal_n * align_c * sizeof(float); + nram_input_load += compute_size / 2; + } + int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); + int32_t total_c_size = total_c * sizeof(T); + int32_t align_c_size = align_c * sizeof(T); + __memcpy_async(nram_input_load, gdram_input + gdram_offset, total_c_size, + GDRAM2NRAM, align_c_size, total_c_size, deal_n - 1); + } else { + if (sizeof(T) == sizeof(half)) { + int32_t compute_size = + PAD_UP(deal_n * total_c * sizeof(float), compute_align_size); + nram_input_load += compute_size / 2; + } + int32_t load_size = deal_n * total_c * sizeof(T); + __memcpy_async(nram_input_load, gdram_input + gdram_offset, load_size, + GDRAM2NRAM); + } +} + +template +__mlu_func__ void sigmoid(T *dst_data, const T *src_data, + const int32_t elem_count) { + __bang_mul_const(dst_data, (T *)src_data, T(-1), elem_count); + __bang_active_exphp(dst_data, dst_data, elem_count); + __bang_add_const(dst_data, dst_data, T(1), elem_count); + __bang_active_reciphp(dst_data, dst_data, elem_count); +} + +template +__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight, + const float *nram_flt_min, char *nram_pt, + char *nram_alpha_t, char *nram_temp, + char *nram_target, const float *nram_gamma, + char *nram_output, const float alpha, + const int32_t compute_num, const int32_t deal_n, + const int32_t total_c, const bool pingpong_flag, + const int32_t nram_offset, + const bool has_weight) { + if (pingpong_flag == PONG) { + nram_input += nram_offset; + nram_pt += nram_offset; + nram_alpha_t += nram_offset; + nram_temp += nram_offset; + nram_output += nram_offset; + nram_target += nram_offset; + } + + if (sizeof(T) == sizeof(half)) { + const int32_t compute_size = compute_num * sizeof(float); + char *nram_input_load = nram_input + 
compute_size / 2; + __bang_half2float((float *)nram_input, (half *)nram_input_load, + compute_num); + } + + // 0. alpha_t = alpha - 1 + __nramset((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0)); + + // 1. pt = 1 - sigmoid(x) + sigmoid((float *)nram_pt, (float *)nram_input, compute_num); + __bang_mul_const((float *)nram_pt, (float *)nram_pt, (float)(-1), + compute_num); + __bang_add_const((float *)nram_pt, (float *)nram_pt, (float)1, compute_num); + + // 2. pt = target[n] == c ? sigmoid(x) : 1 - sigmoid(x) + // alpha_t = target[n] == c ? alpha : alpha - 1 + const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(float); + for (int n = 0; n < deal_n; n++) { + const int32_t target_value = ((int32_t *)nram_target)[n]; + if (target_value >= total_c || target_value < 0) continue; + int32_t c_offset = 0; + if (has_weight) { + int32_t c_align_num = nfu_align_num; + if (sizeof(T) == sizeof(half)) { + c_align_num += nfu_align_num; + } + c_offset = PAD_UP(total_c, c_align_num); + } else { + c_offset = total_c; + } + int32_t idx = n * c_offset + target_value; + *((float *)nram_pt + idx) = 1.0 - *((float *)nram_pt + idx); + *((float *)nram_alpha_t + idx) = alpha; + } + + // 3. temp = -alpha_t * e^(gamma * log(max(1 - pt, FLT_MIN)) + __bang_mul_const((float *)nram_temp, (float *)nram_pt, (float)(-1), + compute_num); + __bang_add_const((float *)nram_temp, (float *)nram_temp, (float)(1), + compute_num); + __bang_cycle_maxequal((float *)nram_temp, (float *)nram_temp, + (float *)nram_flt_min, compute_num, nfu_align_num); + __bang_active_loghp((float *)nram_temp, (float *)nram_temp, compute_num); + __bang_cycle_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_gamma, + compute_num, nfu_align_num); + __bang_active_exphp((float *)nram_temp, (float *)nram_temp, compute_num); + __bang_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_alpha_t, + compute_num); + __bang_mul_const((float *)nram_temp, (float *)nram_temp, (float)(-1), + compute_num); + + // 4. output = 1 - pt - gamma * pt * log(max(pt, FLT_MIN)) + __bang_cycle_maxequal((float *)nram_output, (float *)nram_pt, + (float *)nram_flt_min, compute_num, nfu_align_num); + __bang_active_loghp((float *)nram_output, (float *)nram_output, compute_num); + __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_pt, + compute_num); + __bang_cycle_mul((float *)nram_output, (float *)nram_output, + (float *)nram_gamma, compute_num, nfu_align_num); + __bang_add((float *)nram_output, (float *)nram_output, (float *)nram_pt, + compute_num); + __bang_mul_const((float *)nram_output, (float *)nram_output, (float)(-1), + compute_num); + __bang_add_const((float *)nram_output, (float *)nram_output, (float)(1), + compute_num); + + // 5. 
output = output * temp + __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_temp, + compute_num); + + if (sizeof(T) == sizeof(half)) { + __bang_float2half_rd((half *)nram_output, (float *)nram_output, + compute_num); + } + + if (has_weight) { + // with weight + for (int n = 0; n < deal_n; n++) { + int32_t c_align_num = nfu_align_num; + if (sizeof(T) == sizeof(half)) { + c_align_num += nfu_align_num; + } + int32_t align_c = PAD_UP(total_c, c_align_num); + int32_t target_value = ((int32_t *)nram_target)[n]; + T weight_value = nram_weight[target_value]; + __bang_mul_const((T *)nram_output + n * align_c, + (T *)nram_output + n * align_c, weight_value, align_c); + } + } +} + +template +__mlu_func__ void storeOutput(T *gdram_output, const char *nram_output, + const int32_t deal_n, const int32_t total_c, + const bool pingpong_flag, const bool has_weight, + const int32_t nram_offset, + const int32_t gdram_offset) { + if (pingpong_flag == PONG) { + nram_output += nram_offset; + } + const int32_t store_size = deal_n * total_c * sizeof(T); + if (has_weight) { + int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); + int32_t total_c_size = total_c * sizeof(T); + int32_t align_c_size = align_c * sizeof(T); + __memcpy_async(gdram_output + gdram_offset, nram_output, total_c_size, + NRAM2GDRAM, total_c_size, align_c_size, deal_n - 1); + } else { + __memcpy_async(gdram_output + gdram_offset, nram_output, store_size, + NRAM2GDRAM); + } +} + +template +__mlu_func__ void focalLossSigmoidBackwardBlock( + const T *input, const int32_t *target, const T *weight, const float gamma, + const float alpha, const int32_t total_n, const int32_t deal_n, + const int32_t total_c, T *output) { + // params per time slice + int32_t deal_num = deal_n * total_c; + int32_t deal_size = deal_num * sizeof(float); + int32_t compute_num = 0; + int32_t compute_size = 0; + int32_t compute_align_size = NFU_ALIGN_SIZE; + const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(T); + if (sizeof(T) == sizeof(half)) { + compute_align_size += NFU_ALIGN_SIZE; + } + const int32_t compute_align_num = compute_align_size / sizeof(float); + bool has_weight = false; + if (weight != NULL) { + has_weight = true; + int32_t align_c = PAD_UP(total_c, compute_align_num); + compute_num = deal_n * align_c; + compute_size = compute_num * sizeof(float); + } else { + compute_size = PAD_UP(deal_size, compute_align_size); + compute_num = compute_size / sizeof(float); + } + + // params per core + int32_t total_num = total_n * total_c; + int32_t num_per_core = PAD_DOWN(total_num / taskDim, deal_num); + int32_t loop_per_core = num_per_core / deal_num; + + /* NRAM partition: + * + * |-----------------ping pong--------------------| + * |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight| + * + * split_pipeline_num is 5: input, pt, alpha_t, temp, output. + * nram_reserved_line_num is 2: flt_min, gamma. 
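+ * When weight is not NULL, an extra region of PAD_UP(total_c * sizeof(T),
+ * NFU_ALIGN_SIZE) bytes is reserved at the end of NRAM for the weight vector,
+ * in addition to the two reserved lines above.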
+ */ + const int32_t split_pipeline_num = 5; + const int32_t nram_reserved_line_num = 2; + int32_t target_deal_size = deal_n * sizeof(int32_t); + int32_t target_deal_size_align = PAD_UP(target_deal_size, NFU_ALIGN_SIZE); + // nram PING/PONG offset + int32_t ping_pong_offset = + compute_size * split_pipeline_num + target_deal_size_align; + + // gdram addr + int32_t *base_addr_target = + (int32_t *)target + taskId * loop_per_core * deal_n; + T *base_addr_input = (T *)input + taskId * num_per_core; + T *base_addr_output = output + taskId * num_per_core; + + // nram addr + char *nram_input = (char *)nram_buffer; + char *nram_pt = nram_input + compute_size; + char *nram_alpha_t = nram_pt + compute_size; + char *nram_temp = nram_alpha_t + compute_size; + char *nram_output = nram_temp + compute_size; + char *nram_target = nram_output + compute_size; + float *nram_flt_min = NULL; + float *nram_gamma = NULL; + T *nram_weight = NULL; + + if (!has_weight) { + nram_flt_min = (float *)(nram_buffer + MAX_NRAM_SIZE - + nram_reserved_line_num * NFU_ALIGN_SIZE); + nram_gamma = nram_flt_min + nfu_align_num; + } else { + int32_t weight_space = PAD_UP(total_c * sizeof(T), NFU_ALIGN_SIZE); + nram_flt_min = + (float *)(nram_buffer + MAX_NRAM_SIZE - + nram_reserved_line_num * NFU_ALIGN_SIZE - weight_space); + nram_gamma = nram_flt_min + nfu_align_num; + nram_weight = (T *)(nram_gamma + nfu_align_num); + __memcpy_async(nram_weight, weight, total_c * sizeof(T), GDRAM2NRAM); + } + + // nram set gamma and FLT_MIN + __nramset(nram_gamma, nfu_align_num, gamma); + __nramset(nram_flt_min, nfu_align_num, FLT_MIN); + + /* + * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. + * The allocated memory space of NRAM is divided into two parts: + * PING and Pong. In a single time slice, PING is used to process + * IO stream and PONG is used for computation. Both of them are + * processed synchronously until finished. 
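+ * In the code below one iteration handles one block of deal_n rows: L, C and
+ * S in the diagram correspond to loadInput, coreCompute and storeOutput, and
+ * PONG simply addresses the same NRAM buffers shifted by ping_pong_offset.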
+ * + * diagram of PINGPONG: + * |------|-----------------------------------------------------------------| + * | | space | + * |------|-----------------------------------------------------------------| + * | time | Ping | Pong | Ping | Pong | Ping | Pong | + * |------|-----------------------------------------------------------------| + * | 0 | L0 | | | | | | + * | 1 | C0 | L1 | | | | | + * | 2 | S0 | C1 | L2 | | | | + * | 3 | | S1 | C2 | L3 | | | + * | 4 | | | S2 | C3 | L4 | | + * | 5 | | | | S3 | C4 | L5 | + * | 6 | | | | | S4 | C5 | + * | 7 | | | | | | S5 | + * |------|-----------------------------------------------------------------| + */ + + // diagram of PINGPONG: L0 + if (loop_per_core > 0) { + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PING, has_weight, ping_pong_offset, 0); + __asm__ volatile("sync;"); + } + + // diagram of PINGPONG: C0 and L1 + if (loop_per_core > 1) { + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PONG, has_weight, ping_pong_offset, deal_num); + __asm__ volatile("sync;"); + } + + for (int i = 0; i < loop_per_core - 2; ++i) { + if (i % 2 == PING) { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, + has_weight, ping_pong_offset, i * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PONG, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PING, has_weight, ping_pong_offset, + (i + 2) * deal_num); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, i * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PONG, has_weight, ping_pong_offset, + (i + 2) * deal_num); + } + __asm__ volatile("sync;"); + } + + if (loop_per_core > 1) { + if ((loop_per_core - 2) % 2 == PING) { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, + has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PONG, ping_pong_offset, + has_weight); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + } + __asm__ volatile("sync;"); + } + + if (loop_per_core > 0) { + if (loop_per_core == 1) { + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + __asm__ volatile("sync;"); + } + if ((loop_per_core - 1) % 2 == PING) { + storeOutput(base_addr_output, nram_output, 
deal_n, total_c, PING, + has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); + } + } + + // process the remaining data which N remainder per core is less than deal_n + int32_t rem_for_all = total_num - num_per_core * taskDim; + if (rem_for_all == 0) return; + int32_t rem_n_for_all = rem_for_all / total_c; + int32_t rem_n_per_core = (rem_n_for_all + taskDim - 1) / taskDim; + int32_t rem_num_per_core = rem_n_per_core * total_c; + int32_t rem_num_per_core_align = 0; + int32_t rem_core_num = rem_for_all / rem_num_per_core; + + int32_t rem_n_for_last = rem_n_for_all % rem_n_per_core; + int32_t rem_num_for_last = rem_n_for_last * total_c; + int32_t rem_num_for_last_align = 0; + + if (has_weight) { + int32_t align_c = PAD_UP(total_c, compute_align_num); + rem_num_per_core_align = rem_n_per_core * align_c; + rem_num_for_last_align = rem_n_for_last * align_c; + } else { + rem_num_per_core_align = PAD_UP(rem_num_per_core, compute_align_num); + rem_num_for_last_align = PAD_UP(rem_num_for_last, compute_align_num); + } + + int32_t rem_addr_base = num_per_core * taskDim; + int32_t rem_target_addr_base = loop_per_core * deal_n * taskDim; + base_addr_target = (int32_t *)target + rem_target_addr_base; + base_addr_input = (T *)input + rem_addr_base; + base_addr_output = output + rem_addr_base; + + if (taskId < rem_core_num) { + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + rem_n_per_core, total_c, PING, has_weight, ping_pong_offset, + taskId * rem_num_per_core); + __asm__ volatile("sync;"); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + rem_num_per_core_align, rem_n_per_core, total_c, PING, + ping_pong_offset, has_weight); + __asm__ volatile("sync;"); + storeOutput(base_addr_output, nram_output, rem_n_per_core, total_c, PING, + has_weight, ping_pong_offset, taskId * rem_num_per_core); + } else if (taskId == rem_core_num) { + if (rem_num_for_last == 0) return; + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + rem_n_for_last, total_c, PING, has_weight, ping_pong_offset, + taskId * rem_num_per_core); + __asm__ volatile("sync;"); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + rem_num_for_last_align, rem_n_for_last, total_c, PING, + ping_pong_offset, has_weight); + __asm__ volatile("sync;"); + storeOutput(base_addr_output, nram_output, rem_n_for_last, total_c, PING, + has_weight, ping_pong_offset, taskId * rem_num_per_core); + } else { + return; + } +} + +template +__mlu_global__ void MLUUnion1KernelFocalLossSigmoidBackward( + const void *input, const void *target, const void *weight, + const float gamma, const float alpha, const int32_t total_n, + const int32_t deal_n, const int32_t total_c, void *output) { + focalLossSigmoidBackwardBlock((T *)input, (int32_t *)target, (T *)weight, + gamma, alpha, total_n, deal_n, total_c, + (T *)output); +} +} // namespace backward + +void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const int32_t N, + const int32_t C, const float alpha, + const float gamma, void *output) { + if (d_type == CNRT_FLOAT16) { + forward::MLUUnion1KernelFocalLossSigmoidForward< + 
half><<>>(input, target, weight, N, C, alpha, + gamma, output); + } else { + forward::MLUUnion1KernelFocalLossSigmoidForward< + float><<>>(input, target, weight, N, C, alpha, + gamma, output); + } +} + +void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const float gamma, + const float alpha, const int32_t dim_n, + const int32_t deal_n, const int32_t dim_c, + void *output) { + if (d_type == CNRT_FLOAT16) { + backward::MLUUnion1KernelFocalLossSigmoidBackward< + half><<>>(input, target, weight, gamma, alpha, + dim_n, deal_n, dim_c, output); + } else { + backward::MLUUnion1KernelFocalLossSigmoidBackward< + float><<>>(input, target, weight, gamma, alpha, + dim_n, deal_n, dim_c, output); + } +} diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7cb16bb100355d49f3d1ad004a5e82998f258994 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu @@ -0,0 +1,1161 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define NMS_SIZE (64) +#define COORD_DIM (4) +#define MEMORY_CORE (0x80) +#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score +#define REDUCE_NUM \ + (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) + +#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024) +#define SIZE_SRAM_BUF (MAX_SRAM_SIZE) + +__nram__ int8_t nram_buffer[SIZE_NRAM_BUF]; +__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; + +__mlu_func__ void pvLock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_lock(0, 0); + } +#endif +} + +__mlu_func__ void pvUnlock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_unlock(0, 0); + } +#endif +} + +enum Addr { SRAM, GDRAM }; + +template +__mlu_func__ void nms_detection( + uint32_t *output_box_num, const int output_mode, const int input_layout, + OUT_DT *output_data, const Addr dst, IN_DT *input_data_score, + const IN_DT *input_data_box, const Addr src, IN_DT *buffer, + const int buffer_size, IN_DT *sram, const int core_limit, + const int input_box_num, const int input_stride, const int output_stride, + const int keepNum, const float thresh_iou, const float thresh_score, + const float offset, const int algo) { + // global value, it is stored in sram with a offset from the begin. + const int flag_offset_size = 28; + int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size); + loop_end_flag[0] = 0; + // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 + const int nms_buffer_count1 = 9; + // temp nram buffer to store selected target. 
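+ // nram_save_limit_count below is its capacity, i.e. how many selected
+ // results can be staged in NRAM at a time.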
+ const int nram_save_limit_count = 256; + float div_thresh_iou = 1.0 / thresh_iou; + + // input data ptr + IN_DT *input_score_ptr; + const IN_DT *input_x1_ptr; + const IN_DT *input_y1_ptr; + const IN_DT *input_x2_ptr; + const IN_DT *input_y2_ptr; + input_score_ptr = input_data_score; + input_x1_ptr = input_data_box; + if (input_layout == 0) { + // [boxes_num, 4] + input_y1_ptr = input_x1_ptr + 1; + input_x2_ptr = input_x1_ptr + 2; + input_y2_ptr = input_x1_ptr + 3; + } else if (input_layout == 1) { + // [4, boxes_num] + input_y1_ptr = input_x1_ptr + input_stride; + input_x2_ptr = input_y1_ptr + input_stride; + input_y2_ptr = input_x2_ptr + input_stride; + } + + // nram data ptr + IN_DT *x1; + IN_DT *y1; + IN_DT *x2; + IN_DT *y2; + IN_DT *score; + IN_DT *inter_x1; + IN_DT *inter_y1; + IN_DT *inter_x2; + IN_DT *inter_y2; + IN_DT *max_box; // the max score, x1, y1, x2, y2 + IN_DT *x1_mask; + IN_DT *y1_mask; + IN_DT *x2_mask; + IN_DT *y2_mask; + OUT_DT *nram_save; + + int limit = 0; // find limit when GDRAM or SRAM + int len_core = 0; // the length deal by every core + int max_seg_pad = 0; // the max length every repeat + int repeat = 0; + int remain = 0; + int remain_pad = 0; + int input_offset = 0; // offset of input_data for current core + int nram_save_count = 0; + // mask for collect x1, y1, x2, y2. each mask has 128 elements + const int mask_size = 128; + const int total_mask_size = 512; + + if (output_mode == 0) { + limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT) - + total_mask_size * sizeof(IN_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } else { + limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) - + total_mask_size * sizeof(IN_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } + + if (core_limit == 1) { + len_core = input_box_num; + input_offset = 0; + } else { + int avg_core = input_box_num / core_limit; + int rem = input_box_num % core_limit; + len_core = avg_core + (taskId < rem ? 1 : 0); + input_offset = avg_core * taskId + (taskId <= rem ? 
taskId : rem); + } + max_seg_pad = PAD_DOWN(limit, NMS_SIZE); + repeat = len_core / max_seg_pad; + remain = len_core % max_seg_pad; + remain_pad = PAD_UP(remain, NMS_SIZE); + + // if datatype is half, we should convert it to float when compute the IoU + int max_seg_iou_compute = + PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); + int repeat_iou_compute = len_core / max_seg_iou_compute; + int remain_iou_compute = len_core % max_seg_iou_compute; + int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + // initial the address point + score = buffer; + x1 = score + max_seg_pad; + y1 = x1 + max_seg_pad; + x2 = y1 + max_seg_pad; + y2 = x2 + max_seg_pad; + inter_x1 = y2 + max_seg_pad; + inter_y1 = inter_x1 + max_seg_pad; + inter_x2 = inter_y1 + max_seg_pad; + inter_y2 = inter_x2 + max_seg_pad; + x1_mask = inter_y2 + max_seg_pad; + y1_mask = x1_mask + mask_size; + x2_mask = y1_mask + mask_size; + y2_mask = x2_mask + mask_size; + max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2 + // offset two line from max_box + nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE); + + // set mask for __bang_collect instruction + if (input_layout == 0) { + __nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0); + for (int idx = 0; idx < mask_size; idx++) { + int index = (idx % COORD_DIM) * mask_size + idx; + x1_mask[index] = (IN_DT)1.0; + } + } + + for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0 + if (core_limit != 1) { + __sync_cluster(); // sync before current loop + } + + /******find max start******/ + int max_index = 0; // the max score index + int global_max_index = 0; // for U1 + float max_area = 0; // the max score area + max_box[0] = 0; // init 0 + + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; + // check seg_len exceeds the limit of fp16 or not. 65536 is the largest + // num that half data type could express. + if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { + // seg length exceeds the max num for fp16 datatype! + return; + } + i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; + /******nms load start******/ + mluMemcpyDirection_t load_dir = SRAM2NRAM; + if (src == SRAM) { + load_dir = SRAM2NRAM; + } else { + load_dir = GDRAM2NRAM; + } + __nramset(score, seg_len, (IN_DT)0); + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + + /******nms load end******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + + if (sizeof(IN_DT) == sizeof(half)) { + max_index = ((uint16_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = ((uint32_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + + int stride = 1; + if (input_layout == 0) { + stride = input_stride; + } else if (input_layout == 1) { + stride = 1; + } + + if (core_limit == 1) { + max_box[1] = input_x1_ptr[max_index * stride]; + max_box[2] = input_y1_ptr[max_index * stride]; + max_box[3] = input_x2_ptr[max_index * stride]; + max_box[4] = input_y2_ptr[max_index * stride]; + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + input_score_ptr[max_index] = 0; + global_max_index = max_index; + ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; + } else if (core_limit == 4) { + // find the max with sram + // the max box's x1, y1, x2, y2 on every core + if (coreId != MEMORY_CORE) { + max_box[1] = input_x1_ptr[max_index * stride]; + max_box[2] = input_y1_ptr[max_index * stride]; + max_box[3] = input_x2_ptr[max_index * stride]; + max_box[4] = input_y2_ptr[max_index * stride]; + } + ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; + // copy every core's box info to sram, form: score---x1---y1---x2---y2--- + for (int i = 0; i < INFO_NUM; i++) { + __memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT), + NRAM2SRAM); + } + // copy every core's max_index to sram, use 2 half to store max_index + __memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM, + sizeof(uint32_t), + NRAM2SRAM); // int32_t datatype + __sync_cluster(); + + // copy score from sram to nram and find the max + __nramset(inter_x1, NMS_SIZE, (IN_DT)0); + __memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_core = 0; + if (sizeof(IN_DT) == sizeof(half)) { + max_core = ((uint16_t *)max_box)[1]; + } else if (sizeof(IN_DT) == sizeof(float)) { + max_core = ((uint32_t *)max_box)[1]; + } + + // copy the max box from SRAM to NRAM + __memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // x1 + __memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // y1 + __memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // x2 + __memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // y2 + __memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core, + sizeof(uint32_t), SRAM2NRAM); + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + 
offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0]; + input_score_ptr[global_max_index] = 0; + } + // by now, we get: max_score|max_index|max_box|max_area + /******find max end******/ + + /******nms store start******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + __memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM), + 1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t), + 1 * sizeof(uint32_t), 0); + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, + INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), + NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), + 4); + } + } + nram_save_count++; + (*output_box_num)++; + } + + // store to sram/gdram + if (*output_box_num != 0) { + mluMemcpyDirection_t store_dir = NRAM2GDRAM; + if (dst == SRAM) { + store_dir = NRAM2SRAM; + } else { // dst == GDRAM + store_dir = NRAM2GDRAM; + } + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == keepNum - 1) { + if (nram_save_count != 0) { + if (coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_data, nram_save, + nram_save_count * sizeof(uint32_t), store_dir); + pvUnlock(); + output_data += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_data, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir); + pvUnlock(); + output_data += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT), + store_dir, output_stride * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_data += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst + + // if the max score <= 0, end + if (core_limit == 1) { + if (float(max_box[0]) <= thresh_score) { + break; + } + } else { + if (float(max_box[0]) <= thresh_score) { + if (coreId == 0) { + loop_end_flag[0] = 1; + } + } + __sync_cluster(); + if (loop_end_flag[0] == 1) { + break; + } + } + /******nms store end******/ + + // To solve half data accuracy, we convert half to float to calculate IoU. + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat_iou_compute ? seg_len = remain_pad_iou_compute + : seg_len = max_seg_iou_compute; + i == repeat_iou_compute ? 
cpy_len = remain_iou_compute + : cpy_len = max_seg_iou_compute; + + /******nms load start******/ + mluMemcpyDirection_t load_dir = SRAM2NRAM; + if (src == SRAM) { + load_dir = SRAM2NRAM; + } else { + load_dir = GDRAM2NRAM; + } + + __nramset((float *)score, seg_len, 0.0f); + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __nramset(x1, seg_len, half(0)); + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } + + if (input_layout == 0) { + // the following number 4 means x1, y1, x2, y2 + __memcpy( + inter_x1, + input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM, + cpy_len * COORD_DIM * sizeof(IN_DT), load_dir, + cpy_len * COORD_DIM * sizeof(IN_DT), + cpy_len * COORD_DIM * sizeof(IN_DT), 0); + // here use collect instruction to transpose the [n, 4] shape into [4, + // n] shape to avoid + // discrete memory accessing. + for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) { + // the following number 32 means 32 elements will be selected out by + // once operation + __bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + x1_mask, mask_size); + __bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + y1_mask, mask_size); + __bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + x2_mask, mask_size); + __bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + y2_mask, mask_size); + } + } else if (input_layout == 1) { + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(y1 + dt_offset, + input_y1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(x2 + dt_offset, + input_x2_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(y2 + dt_offset, + input_y2_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + } + /******nms load end******/ + + /******nms compute start******/ + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, + seg_len); + } + // 1、 compute IOU + // get the area_I + __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, 
seg_len); + } + __bang_active_relu((float *)inter_x1, (float *)inter_x1, + seg_len); // inter_w + __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 + __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + __bang_active_relu((float *)inter_y1, (float *)inter_y1, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thres, set the score to zero, abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); + /******nms compute end******/ + + // update the score + mluMemcpyDirection_t update_dir = NRAM2SRAM; + if (dst == SRAM) { + update_dir = NRAM2SRAM; + } else { + update_dir = NRAM2GDRAM; + } + if (sizeof(IN_DT) == sizeof(half)) { + __bang_float2half_rd((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } // for repeat + } // for keepNum +} + +__mlu_global__ void MLUUnion1KernelNMS( + const void *input_boxes, const void *input_confidence, + const int input_num_boxes, const int input_stride, + const int max_output_size, const float iou_threshold, + const float confidence_threshold, const int mode, const int input_layout, + void *workspace, void *result_num, void *output, + const cnrtDataType_t data_type_input, const float offset, const int algo) { + if (data_type_input == CNRT_FLOAT16) { + __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half), + GDRAM2GDRAM); + } else if (data_type_input == CNRT_FLOAT32) { + __memcpy(workspace, input_confidence, input_num_boxes * sizeof(float), + GDRAM2GDRAM); + } else { + } + + int output_stride = max_output_size; + uint32_t result_box_num = 0; + if (mode == 0) { + uint32_t *out_data = (uint32_t *)output; + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *boxes_data = (half *)input_boxes; + half *confi_data = (half *)workspace; + half *buffer = (half *)nram_buffer; + half *sram 
= (half *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + case CNRT_FLOAT32: { + float *boxes_data = (float *)input_boxes; + float *confi_data = (float *)workspace; + float *buffer = (float *)nram_buffer; + float *sram = (float *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + } + } else { + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *boxes_data = (half *)input_boxes; + half *confi_data = (half *)workspace; + half *out_data = (half *)output; + half *buffer = (half *)nram_buffer; + half *sram = (half *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + case CNRT_FLOAT32: { + float *boxes_data = (float *)input_boxes; + float *confi_data = (float *)workspace; + float *out_data = (float *)output; + float *buffer = (float *)nram_buffer; + float *sram = (float *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + } + } +} + +template +__mlu_func__ void nms_detection_ux( + int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram, + IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, + const int input_layout, const int input_num_boxes, const int input_stride, + const int max_output_size, const float thresh_iou, const float thresh_score, + const float offset, const int output_mode, const int algo) { + loop_end_flag[0] = 0; + IN_DT *sram = (IN_DT *)sram_buffer; + + // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 + int nms_buffer_count1 = 9; + // temp nram buffer to store selected target. 
+ int nram_save_limit_count = 256; + float div_thresh_iou = 1.0 / thresh_iou; + + // input data ptr + IN_DT *input_score_ptr; + const IN_DT *input_x1_ptr; + const IN_DT *input_y1_ptr; + const IN_DT *input_x2_ptr; + const IN_DT *input_y2_ptr; + input_score_ptr = score_data; + input_x1_ptr = boxes_data; + input_y1_ptr = input_x1_ptr + input_stride; + input_x2_ptr = input_y1_ptr + input_stride; + input_y2_ptr = input_x2_ptr + input_stride; + + int limit = 0; // find limit when GDRAM or SRAM + int max_seg_pad = 0; // the max length every repeat + int repeat = 0; + int remain = 0; + int remain_pad = 0; + int nram_save_count = 0; + + if (output_mode == 0) { + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } else { + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } + + // data split + int avg_cluster = input_num_boxes / clusterDim; + int rem_cluster = input_num_boxes % clusterDim; + int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0); + int cluster_offset = avg_cluster * clusterId + + (clusterId <= rem_cluster ? clusterId : rem_cluster); + + int avg_core = len_cluster / coreDim; + int rem_core = len_cluster % coreDim; + int len_core = avg_core + (coreId < rem_core ? 1 : 0); + int core_offset = + avg_core * coreId + (coreId <= rem_core ? coreId : rem_core); + int input_offset = cluster_offset + core_offset; + + max_seg_pad = PAD_DOWN(limit, NMS_SIZE); + + // core 0 of each cluster calculate the max score index + int max_index_avg_core = input_num_boxes / clusterDim; + int max_index_rem_core = input_num_boxes % clusterDim; + int max_index_len_core = + max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0); + int max_index_input_offset = + max_index_avg_core * clusterId + + (clusterId <= max_index_rem_core ? clusterId : max_index_rem_core); + repeat = max_index_len_core / max_seg_pad; + remain = max_index_len_core % max_seg_pad; + remain_pad = PAD_UP(remain, NMS_SIZE); + + // if datatype is fp16, we should cvt to fp32 when compute iou + int max_seg_iou_compute = + PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); + int repeat_iou_compute = len_core / max_seg_iou_compute; + int remain_iou_compute = len_core % max_seg_iou_compute; + int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + + // init the nram ptr + IN_DT *score = (IN_DT *)nram_buffer; + IN_DT *x1 = score + max_seg_pad; + IN_DT *y1 = x1 + max_seg_pad; + IN_DT *x2 = y1 + max_seg_pad; + IN_DT *y2 = x2 + max_seg_pad; + IN_DT *inter_x1 = y2 + max_seg_pad; + IN_DT *inter_y1 = inter_x1 + max_seg_pad; + IN_DT *inter_x2 = inter_y1 + max_seg_pad; + IN_DT *inter_y2 = inter_x2 + max_seg_pad; + IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2 + OUT_DT *nram_save = + (OUT_DT *)((char *)max_box + + NFU_ALIGN_SIZE); // offset two line from max_box + + mluMemcpyDirection_t input_load_dir = SRAM2NRAM; + mluMemcpyDirection_t input_store_dir = NRAM2SRAM; + input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; + input_store_dir = (input_ram == SRAM) ? 
NRAM2SRAM : NRAM2GDRAM; + + for (int keep = 0; keep < max_output_size; + keep++) { // loop until the max_score <= 0 + __sync_all(); + + /******FIND MAX START******/ + int max_index = 0; + int global_max_index = 0; // for Ux + float max_area = 0; // the max socre area + max_box[0] = 0; // init 0 + + if (coreId == 0) { + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + + int seg_len = (i == repeat) + ? remain_pad + : max_seg_pad; // the length every nms compute + // check seg_len exceeds the limit of fp16 or not. 65536 is the largest + // num + // that fp16 could express. + if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { + return; + } + int cpy_len = (i == repeat) + ? remain + : max_seg_pad; // the length every nms memcpy + + /******NMS LOAD START******/ + __bang_write_zero(score, seg_len); + __memcpy(score, + input_score_ptr + max_index_input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + + /******NMS LOAD END******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + if (sizeof(IN_DT) == sizeof(half)) { + max_index = + ((uint16_t *)inter_x1)[1] + max_index_input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = + ((uint32_t *)inter_x1)[1] + max_index_input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + + // the max box's x1, y1, x2, y2 on every cluster + max_box[1] = input_x1_ptr[max_index]; + max_box[2] = input_y1_ptr[max_index]; + max_box[3] = input_x2_ptr[max_index]; + max_box[4] = input_y2_ptr[max_index]; + ((uint32_t *)(max_box + 5))[0] = max_index; + // copy max box info to sram + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_all(); + // copy all partial max to the sram of cluster 0 + if (clusterId != 0) { + __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), + SRAM2SRAM, 0); + } + __sync_all(); + + // reduce between clusters to get the global max box + if (clusterId == 0) { + if (coreId == 0) { + __bang_write_zero(inter_x1, NMS_SIZE); + __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_cluster = (sizeof(IN_DT) == sizeof(half)) + ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + __memcpy(max_box, sram + max_cluster * REDUCE_NUM, + REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_cluster(); + if (coreId == 0x80 && clusterDim > 1) { + // broadcast global max box to each cluster's sram + for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { + __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, + cluster_idx); + } + } + __sync_cluster(); + } + __sync_all(); + + // copy the global max box to max_box + __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + global_max_index = ((uint32_t *)(max_box + 5))[0]; + if (coreId != 0x80) { + input_score_ptr[global_max_index] = 0; + } + // by now, we get: max_score|max_index|max_box|max_area + /******FIND MAX END******/ + + /******NMS STORE START******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, + INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), + NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), + 4); + } + } + nram_save_count++; + output_box_num++; + } + + // store to sram/gdram + if (output_box_num != 0) { + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { + if (nram_save_count != 0) { + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * sizeof(uint32_t), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), + NRAM2GDRAM, max_output_size * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_dram += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst + + if (float(max_box[0]) <= thresh_score) { + if (clusterId == 0 && coreId == 0) { + loop_end_flag[0] = 1; // dram + } + } + __sync_all(); + if (loop_end_flag[0] == 1) { + break; + } + /******NMS STORE END******/ + + // To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU. + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = (i == repeat_iou_compute) ? 
remain_pad_iou_compute + : max_seg_iou_compute; + int cpy_len = + (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; + + /******NMS LOAD START******/ + __nramset((float *)score, seg_len, 0.0f); + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __nramset(x1, seg_len, half(0)); + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } + + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), input_load_dir, + max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3); + /******NMS LOAD END******/ + + /******NMS COMPUTE START******/ + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, + seg_len); + } + // 1、 compute IOU + // get the area_I + __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len); + } + __bang_active_relu((float *)inter_x1, (float *)inter_x1, + seg_len); // inter_w + __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 + __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + __bang_active_relu((float *)inter_y1, (float *)inter_y1, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thres, set the score to zero, 
abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); + /******NMS COMPUTE END******/ + + if (sizeof(IN_DT) == 2) { + __bang_float2half_rd((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), input_store_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } // for repeat + } // for max_output_size +} + +__mlu_global__ void MLUUionXKernelNMS( + const void *input_boxes, const void *input_confidence, + const int input_num_boxes, const int input_layout, const int input_stride, + const int max_output_size, const float iou_threshold, + const float confidence_threshold, const float offset, + const cnrtDataType_t data_type_input, const int output_mode, const int algo, + void *workspace, void *result_num, void *output) { + int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; + int32_t *loop_end_flag = + (int32_t *)((char *)workspace + + INFO_NUM * input_num_boxes * input_dwidth); + int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; + int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; + + int cluster_score_size = input_num_boxes * input_dwidth; + int cluster_boxes_size = input_num_boxes * 4 * input_dwidth; + char *sram_score = (char *)sram_buffer + reduce_sram_size; + char *sram_boxes = + (char *)sram_buffer + reduce_sram_size + cluster_score_size; + Addr input_ram = GDRAM; + if ((cluster_score_size + cluster_boxes_size) < availbale_sram_size) { + input_ram = SRAM; + __memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM); + __memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM); + } else { + __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM); + } + __sync_cluster(); + uint32_t output_box_num = 0; + if (output_mode == 0) { + uint32_t *output_dram = (uint32_t *)output; + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *score_data; + half *boxes_data; + score_data = + (input_ram == SRAM) ? (half *)sram_score : (half *)workspace; + boxes_data = + (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + case CNRT_FLOAT32: { + float *score_data; + float *boxes_data; + score_data = + (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = + (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + } + } else { + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *output_dram = (half *)output; + half *score_data; + half *boxes_data; + score_data = + (input_ram == SRAM) ? 
(half *)sram_score : (half *)workspace; + boxes_data = + (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + case CNRT_FLOAT32: { + float *output_dram = (float *)output; + float *score_data; + float *boxes_data; + score_data = + (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = + (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + } + } +} + +void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t data_type_input, const void *boxes_ptr, + const void *scores_ptr, const int input_num_boxes, + const int input_stride, const int max_output_boxes, + const float iou_threshold, const float offset, + void *workspace_ptr, void *output_size_ptr, void *output_ptr) { + switch (k_type) { + default: { return; } + case CNRT_FUNC_TYPE_BLOCK: + case CNRT_FUNC_TYPE_UNION1: { + MLUUnion1KernelNMS<<>>( + boxes_ptr, scores_ptr, input_num_boxes, input_stride, + max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, + /*output_mode=*/0, + /*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr, + data_type_input, offset, /*algo=*/1); + }; break; + case CNRT_FUNC_TYPE_UNION2: + case CNRT_FUNC_TYPE_UNION4: + case CNRT_FUNC_TYPE_UNION8: + case CNRT_FUNC_TYPE_UNION16: { + MLUUionXKernelNMS<<>>( + boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1, + input_stride, max_output_boxes, iou_threshold, + /*confidence_threshold=*/0.0, offset, data_type_input, + /*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr, + output_ptr); + }; break; + } +} diff --git a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..13b4af19f669aa0b63758e899a06395b39e455aa --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu @@ -0,0 +1,615 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" +#include "psamask_utils.hpp" + +#define COMPUTE_COUNT_ALIGN 64 + +__nram__ char buf[MAX_NRAM_SIZE]; + +template +__mlu_func__ void swap(T &a, T &b) { + T tmp = a; + a = b; + b = tmp; +} + +template +__mlu_func__ void storeDataFromNramToDram(T *dst, const T *src, + const PositionInCore &position, + const Shape &shape_full) { + int n_offset = shape_full.h * shape_full.w * shape_full.c; + int h_offset = shape_full.w * shape_full.c; + int w_offset = shape_full.c; + int n_seg = position.n_end - position.n_start; + int h_seg = position.h_end - position.h_start; + int w_seg = position.w_end - position.w_start; + int size = h_seg * w_seg * shape_full.c; + + __memcpy(dst + position.n_start * n_offset + position.h_start * h_offset + + position.w_start * w_offset, + src, size * sizeof(T), NRAM2GDRAM, n_offset * sizeof(T), + size * sizeof(T), n_seg - 1); +} + +template +__mlu_func__ void loadDataFromDramToNram(T *dst, const T *src, + const PositionInCore &position, + const Shape &shape_full) { + int n_offset = shape_full.h * shape_full.w * shape_full.c; + int h_offset = shape_full.w * shape_full.c; + int w_offset = shape_full.c; + int n_seg = position.n_end - position.n_start; + int h_seg = position.h_end - position.h_start; + int w_seg = position.w_end - position.w_start; + int size = h_seg * w_seg * shape_full.c; + + __memcpy(dst, + src + position.n_start * n_offset + position.h_start * h_offset + + position.w_start * w_offset, + size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T), + n_seg - 1); +} + +// transpose the data from A*B*C*(D*E) to A*D*E*(B*C) +template +__mlu_func__ void transposeData(T *dst, T *src, const Shape &shape_seg) { + int align_c = CEIL_ALIGN(shape_seg.c, COMPUTE_COUNT_ALIGN / sizeof(T)); + int align_hw = + CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); + for (int i = 0; i < shape_seg.n; ++i) { + __bang_transpose(dst, src, align_hw, align_c); + dst += align_hw * align_c; + src += align_hw * align_c; + } +} + +template +__mlu_func__ void psamaskCollectForward( + const T *x_dram, T *y_dram, const PositionInCore &position, + const Shape &x_full, const Shape &y_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + T *x_nram = (T *)buf; + T *y_nram = + x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c, + COMPUTE_COUNT_ALIGN / sizeof(T)); + loadDataFromDramToNram(x_nram, x_dram, position, x_full); + + // fill zeros to output + int elem_count = + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c, + NFU_ALIGN_SIZE / sizeof(T)); + __nramset(y_nram, elem_count, (T)0); + + int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c; + int y_h_offset = shape_seg.w * shape_seg.c; + int y_w_offset = shape_seg.c; + int x_n_offset = shape_seg.h * shape_seg.w * x_full.c; + int y_c_offset = 1; + int x_h_offset = shape_seg.w * x_full.c; + int x_w_offset = x_full.c; + int x_c_offset = 1; + int x_start = 0; + int y_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int y_offset = y_start; + int x_offset = x_start; + y_offset += hidx * y_h_offset + widx * y_w_offset; + x_offset += hidx * x_h_offset + widx * x_w_offset; + + const int hstart = half_h_mask - h_abs > 0 
? half_h_mask - h_abs : 0; + const int hend = x_full.h + half_h_mask - h_abs < h_mask + ? x_full.h + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = x_full.w + half_w_mask - w_abs < w_mask + ? x_full.w + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed + y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart + + w_abs - half_w_mask) * + y_c_offset; + x_offset += (hstart * w_mask + wstart) * x_c_offset; + int count = wend - wstart; + __memcpy(y_nram + y_offset, x_nram + x_offset, count * sizeof(T), + NRAM2NRAM, y_c_offset * x_full.w * sizeof(T), + x_c_offset * w_mask * sizeof(T), hend - hstart - 1); + } + } + y_start += y_n_offset; + x_start += x_n_offset; + } + storeDataFromNramToDram(y_dram, y_nram, position, y_full); +} + +template +__mlu_func__ void psamaskDistributeForward( + const T *x_dram, T *y_dram, const PositionInCore &position, + const Shape &x_full, const Shape &y_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + T *x_nram = (T *)buf; + T *y_nram_temp = + x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c, + COMPUTE_COUNT_ALIGN / sizeof(T)); + loadDataFromDramToNram(x_nram, x_dram, position, x_full); + + // fill zeros to output + int align_c = CEIL_ALIGN(y_full.c, COMPUTE_COUNT_ALIGN / sizeof(T)); + int align_hw = + CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); + int elem_count = + CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T)); + __nramset(y_nram_temp, elem_count, (T)0); + + int y_n_offset = align_hw * align_c; + int y_h_offset = shape_seg.w * align_c; + int y_w_offset = align_c; + int y_c_offset = 1; + int x_n_offset = shape_seg.h * shape_seg.w * x_full.c; + int x_h_offset = shape_seg.w * x_full.c; + int x_w_offset = x_full.c; + int x_c_offset = 1; + int h_feature = y_full.h; + int w_feature = y_full.w; + + int y_start = 0; + int x_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int y_offset = y_start; + int x_offset = x_start; + y_offset += hidx * y_h_offset + widx * y_w_offset; + x_offset += hidx * x_h_offset + widx * x_w_offset; + const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; + const int hend = h_feature + half_h_mask - h_abs < h_mask + ? h_feature + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = w_feature + half_w_mask - w_abs < w_mask + ? 
w_feature + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed + y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart + + w_abs - half_w_mask) * + y_c_offset; + x_offset += (hstart * w_mask + wstart) * x_c_offset; + int count = wend - wstart; + __memcpy(y_nram_temp + y_offset, x_nram + x_offset, count * sizeof(T), + NRAM2NRAM, y_c_offset * w_feature * sizeof(T), + x_c_offset * w_mask * sizeof(T), hend - hstart - 1); + } + } + y_start += y_n_offset; + x_start += x_n_offset; + } + // transpose y + T *y_nram = y_nram_temp + shape_seg.n * align_hw * align_c; + Shape y_seg{shape_seg.n, shape_seg.h, shape_seg.w, y_full.c}; + transposeData(y_nram, y_nram_temp, y_seg); + swap(align_c, align_hw); + // store y from nram to dram + int y_n_offset_full = y_full.h * y_full.w * y_full.c; + int y_w_offset_full = y_full.c; + int y_c_offset_full = 1; + + int y_dram_start = + position.n_start * y_n_offset_full + + (position.h_start * y_full.w + position.w_start) * y_c_offset_full; + int y_nram_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + int y_dram_offset = y_dram_start + nidx * y_n_offset_full; + int y_nram_offset = y_nram_start + nidx * align_hw * align_c; + __memcpy(y_dram + y_dram_offset, y_nram + y_nram_offset, + shape_seg.h * shape_seg.w * sizeof(T), NRAM2GDRAM, + y_w_offset_full * sizeof(T), align_c * sizeof(T), + h_feature * w_feature - 1); + } +} + +template +__mlu_func__ void psamaskCollectBackward( + const T *dy_dram, T *dx_dram, const PositionInCore &position, + const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + T *dy_nram = (T *)buf; + T *dx_nram = + dy_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * dy_full.c, + COMPUTE_COUNT_ALIGN / sizeof(T)); + loadDataFromDramToNram(dy_nram, dy_dram, position, dy_full); + + // fill zeros to output + int elem_count = + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c, + NFU_ALIGN_SIZE / sizeof(T)); + __nramset(dx_nram, elem_count, (T)0); + + int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c; + int dy_h_offset = shape_seg.w * dy_full.c; + int dy_w_offset = dy_full.c; + int dy_c_offset = 1; + int dx_n_offset = shape_seg.h * shape_seg.w * dx_full.c; + int dx_h_offset = shape_seg.w * dx_full.c; + int dx_w_offset = dx_full.c; + int dx_c_offset = 1; + int h_feature = dy_full.h; + int w_feature = dy_full.w; + + int dy_start = 0; + int dx_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int dy_offset = dy_start; + int dx_offset = dx_start; + dy_offset += hidx * dy_h_offset + widx * dy_w_offset; + dx_offset += hidx * dx_h_offset + widx * dx_w_offset; + + const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; + const int hend = h_feature + half_h_mask - h_abs < h_mask + ? h_feature + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = w_feature + half_w_mask - w_abs < w_mask + ? 
w_feature + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with + // feature-indexed + dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart + + w_abs - half_w_mask) * + dy_c_offset; + dx_offset += (hstart * w_mask + wstart) * dx_c_offset; + int count = wend - wstart; + __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T), + NRAM2NRAM, dx_c_offset * w_mask * sizeof(T), + dy_c_offset * w_feature * sizeof(T), hend - hstart - 1); + } + } + dy_start += dy_n_offset; + dx_start += dx_n_offset; + } + storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full); +} + +template +__mlu_func__ void psamaskDistributeBackward( + const T *dy_dram, T *dx_dram, const PositionInCore &position, + const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + // load dy from dram to nram + T *dy_nram_temp = (T *)buf; + int dy_n_offset_full = dy_full.h * dy_full.w * dy_full.c; + int dy_c_offset_full = 1; + int h_feature = dy_full.h; + int w_feature = dy_full.w; + int align_c = + CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); + int align_hw = + CEIL_ALIGN(h_feature * w_feature, COMPUTE_COUNT_ALIGN / sizeof(T)); + + int dy_dram_start = + position.n_start * dy_n_offset_full + + (position.h_start * w_feature + position.w_start) * dy_c_offset_full; + int dy_nram_start = 0; + for (int i = 0; i < shape_seg.n; ++i) { + int dy_nram_offset = dy_nram_start + i * (align_hw * align_c); + int dy_dram_offset = dy_dram_start + i * dy_n_offset_full; + __memcpy(dy_nram_temp + dy_nram_offset, dy_dram + dy_dram_offset, + shape_seg.h * shape_seg.w * sizeof(T), GDRAM2NRAM, + align_c * sizeof(T), dy_full.c * sizeof(T), + h_feature * w_feature - 1); + } + T *dy_nram = dy_nram_temp + shape_seg.n * align_hw * align_c; + Shape dy_seg{shape_seg.n, h_feature, w_feature, shape_seg.h * shape_seg.w}; + transposeData(dy_nram, dy_nram_temp, dy_seg); + swap(align_c, align_hw); + + // fill zeros to dx + T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c; + int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c; + __nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0); + + int dy_n_offset_seg = align_hw * align_c; + int dy_h_offset_seg = shape_seg.w * align_c; + int dy_w_offset_seg = align_c; + int dy_c_offset_seg = 1; + int dx_n_offset_seg = shape_seg.h * shape_seg.w * shape_seg.c; + int dx_h_offset_seg = shape_seg.w * shape_seg.c; + int dx_w_offset_seg = shape_seg.c; + int dx_c_offset_seg = 1; + + int dy_start = 0; + int dx_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int dy_offset = dy_start; + int dx_offset = dx_start; + dy_offset += hidx * dy_h_offset_seg + widx * dy_w_offset_seg; + dx_offset += hidx * dx_h_offset_seg + widx * dx_w_offset_seg; + const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; + const int hend = h_feature + half_h_mask - h_abs < h_mask + ? h_feature + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = w_feature + half_w_mask - w_abs < w_mask + ? 
w_feature + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with + // feature-indexed + dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart + + w_abs - half_w_mask) * + dy_c_offset_seg; + dx_offset += (hstart * w_mask + wstart) * dx_c_offset_seg; + int count = wend - wstart; + __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T), + NRAM2NRAM, w_mask * dx_c_offset_seg * sizeof(T), + w_feature * dy_c_offset_seg * sizeof(T), hend - hstart - 1); + } + } + dy_start += dy_n_offset_seg; + dx_start += dx_n_offset_seg; + } + storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full); +} + +template +__mlu_func__ void psamaskBase(const T *input_dram, T *output_dram, + const Shape &input_full, const Shape &output_full, + LimitParam &limit, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, + const bool is_forward, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask, const int n_per_core, + const int h_per_core, const int n_per_cluster, + const int h_per_cluster) { + PositionInCore position_full; + PositionInCore position_seg; + position_full.w_start = 0; + position_full.w_end = output_full.w; + int n_num_in_cluster = n_per_cluster; + int h_num_in_cluster = h_per_cluster; + + switch (cluster_partition) { + case PARTITION_N: { + position_full.h_start = 0; + position_full.h_end = input_full.h; + position_full.n_start = taskIdY * n_per_cluster; + int cluster_need = (input_full.n + n_per_cluster - 1) / n_per_cluster; + if (taskIdY >= cluster_need) return; + int n_remainder = input_full.n - (cluster_need - 1) * n_per_cluster; + n_num_in_cluster = + (taskIdY == cluster_need - 1) ? n_remainder : n_per_cluster; + position_full.n_end = position_full.n_start + n_num_in_cluster; + }; break; + case PARTITION_H: { + position_full.n_start = 0; + position_full.n_end = input_full.n; + position_full.h_start = taskIdY * h_per_cluster; + int cluster_need = (input_full.h + h_per_cluster - 1) / h_per_cluster; + if (taskIdY >= cluster_need) return; + int h_remainder = input_full.h - (cluster_need - 1) * h_per_cluster; + h_num_in_cluster = + (taskIdY == cluster_need - 1) ? h_remainder : h_per_cluster; + position_full.h_end = position_full.h_start + h_num_in_cluster; + }; break; + } + switch (core_partition) { + case PARTITION_N: { + position_full.n_start += taskIdX * n_per_core; + int core_need = (n_num_in_cluster + n_per_core - 1) / n_per_core; + if (taskIdX >= core_need) return; + int n_remainder = n_num_in_cluster - (core_need - 1) * n_per_core; + position_full.n_end = + position_full.n_start + + ((taskIdX == core_need - 1) ? n_remainder : n_per_core); + }; break; + case PARTITION_H: { + position_full.h_start += taskIdX * h_per_core; + int core_need = (h_num_in_cluster + h_per_core - 1) / h_per_core; + if (taskIdX >= core_need) return; + int h_remainder = h_num_in_cluster - (core_need - 1) * h_per_core; + position_full.h_end = + position_full.h_start + + ((taskIdX == core_need - 1) ? h_remainder : h_per_core); + }; break; + } + // the count of n ,h and w need to be processed in the current core + int shape_core_n = position_full.n_end - position_full.n_start; + int shape_core_h = position_full.h_end - position_full.h_start; + int shape_core_w = input_full.w; + + limit.n = limit.n < shape_core_n ? limit.n : shape_core_n; + limit.h = limit.h < shape_core_h ? limit.h : shape_core_h; + limit.w = limit.w < shape_core_w ? 
limit.w : shape_core_w; + + // load the data to nram according to the limit + for (int nidx = position_full.n_start; nidx < position_full.n_end; + nidx += limit.n) { + position_seg.n_start = nidx; + position_seg.n_end = + position_seg.n_start + (position_full.n_end - nidx < limit.n + ? position_full.n_end - nidx + : limit.n); + for (int hidx = position_full.h_start; hidx < position_full.h_end; + hidx += limit.h) { + position_seg.h_start = hidx; + position_seg.h_end = + position_seg.h_start + (position_full.h_end - hidx < limit.h + ? position_full.h_end - hidx + : limit.h); + for (int widx = position_full.w_start; widx < position_full.w_end; + widx += limit.w) { + position_seg.w_start = widx; + position_seg.w_end = + position_seg.w_start + (position_full.w_end - widx < limit.w + ? position_full.w_end - widx + : limit.w); + + // record the segment of output except the size of channel + // channel segments of output and input are the same + Shape shape_seg; + shape_seg.n = position_seg.n_end - position_seg.n_start; + shape_seg.h = position_seg.h_end - position_seg.h_start; + shape_seg.w = position_seg.w_end - position_seg.w_start; + shape_seg.c = output_full.c; + + switch (psa_type) { + case COLLECT: { + if (is_forward) { + psamaskCollectForward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, h_mask, + w_mask, half_h_mask, half_w_mask); + } else { + psamaskCollectBackward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, h_mask, + w_mask, half_h_mask, half_w_mask); + } + } break; + case DISTRIBUTE: { + if (is_forward) { + psamaskDistributeForward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, + h_mask, w_mask, half_h_mask, + half_w_mask); + } else { + psamaskDistributeBackward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, + h_mask, w_mask, half_h_mask, + half_w_mask); + } + } break; + } + } + } + } +} + +template +__mlu_global__ void MLUUnion1KernelPsamaskForward( + const T *x, T *y, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int x_c, const int y_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int limit_w_seg) { + if (coreId == 0x80) { + return; + } + Shape x_full, y_full; + x_full.n = batch; + x_full.h = h_feature; + x_full.w = w_feature; + x_full.c = x_c; + y_full.n = batch; + y_full.h = h_feature; + y_full.w = w_feature; + y_full.c = y_c; + + LimitParam limit; + limit.n = limit_n_seg; + limit.h = limit_h_seg; + limit.w = limit_w_seg; + + psamaskBase(x, y, x_full, y_full, limit, psa_type, core_partition, + cluster_partition, true, h_mask, w_mask, half_h_mask, half_w_mask, + n_per_core, h_per_core, n_per_cluster, h_per_cluster); +} + +template +__mlu_global__ void MLUUnion1KernelPsamaskBackward( + const T *dy, T *dx, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int dx_c, const int dy_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int 
limit_w_seg) { + if (coreId == 0x80) { + return; + } + Shape dy_full, dx_full; + dx_full.n = batch; + dx_full.h = h_feature; + dx_full.w = w_feature; + dx_full.c = dx_c; + dy_full.n = batch; + dy_full.h = h_feature; + dy_full.w = w_feature; + dy_full.c = dy_c; + + LimitParam limit; + limit.n = limit_n_seg; + limit.h = limit_h_seg; + limit.w = limit_w_seg; + + psamaskBase(dy, dx, dy_full, dx_full, limit, psa_type, core_partition, + cluster_partition, false, h_mask, w_mask, half_h_mask, + half_w_mask, n_per_core, h_per_core, n_per_cluster, + h_per_cluster); +} + +void KernelPsamaskForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *x, void *y, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int x_c, const int y_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int limit_w_seg) { + MLUUnion1KernelPsamaskForward<<>>( + static_cast(x), static_cast(y), psa_type, + core_partition, cluster_partition, batch, h_feature, w_feature, h_mask, + w_mask, x_c, y_c, half_h_mask, half_w_mask, n_per_core, h_per_core, + n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg); +} + +void KernelPsamaskBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *dy, void *dx, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int dx_c, const int dy_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int limit_w_seg) { + MLUUnion1KernelPsamaskBackward<<>>( + static_cast(dy), static_cast(dx), psa_type, + core_partition, cluster_partition, batch, h_feature, w_feature, h_mask, + w_mask, dx_c, dy_c, half_h_mask, half_w_mask, n_per_core, h_per_core, + n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg); +} diff --git a/mmcv/ops/csrc/common/mlu/psamask_utils.hpp b/mmcv/ops/csrc/common/mlu/psamask_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..30ec388494615842528b74da0661e169b08a545e --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/psamask_utils.hpp @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#ifndef PSAMASK_UTILS_HPP_ +#define PSAMASK_UTILS_HPP_ + +typedef enum { + COLLECT = 0, + DISTRIBUTE = 1, +} PsamaskType; + +typedef enum { + PARTITION_N = 0, + PARTITION_H = 1, +} DimPartitionType; + +struct PartitionSeg { + int h_per_cluster; + int n_per_cluster; + int h_per_core; + int n_per_core; + DimPartitionType cluster_partition; + DimPartitionType core_partition; +}; + +struct Shape { + int n; + int h; + int w; + int c; +}; + +struct LimitParam { + int n; + int h; + int w; +}; + +struct PositionInCore { + int n_start; + int n_end; + int h_start; + int h_end; + int w_start; + int w_end; +}; +#endif // PSAMASK_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..f62554d0effd9e67ba5068b1b57d7e7131c696ea --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu @@ -0,0 +1,493 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define ROI_OFFSET 5 + +__nram__ char buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void bilinearInterpolate(const int input_height, + const int input_width, T y, T x, T *w1, + T *w2, T *w3, T *w4, int *x_low, + int *x_high, int *y_low, int *y_high, + bool *empty) { + // deal with cases that inverse elements are of feature map boundary + if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { + *empty = true; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low_ = int(y); + int x_low_ = int(x); + + if (y_low_ >= input_height - 1) { + *y_high = y_low_ = input_height - 1; + y = (T)y_low_; + } else { + *y_high = y_low_ + 1; + } + + if (x_low_ >= input_width - 1) { + *x_high = x_low_ = input_width - 1; + x = T(x_low_); + } else { + *x_high = x_low_ + 1; + } + + *y_low = y_low_; + *x_low = x_low_; + + T ly = y - y_low_; + T lx = x - x_low_; + T hy = 1.0 - ly; + T hx = 1.0 - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + return; +} + +template +__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, + T *nram_out, const int roi_bin_grid_h, + const int roi_bin_grid_w, const T roi_start_h, + const T roi_start_w, const int ph, + const int pw, const T bin_size_h, + const T bin_size_w, const float count, + const int input_height, const int input_width, + const int channels, const int cyc_num, + const int max_elements) { + int cyc_channel = max_elements; + + for (int i = 0; i < cyc_num; i++) { + int real_channel = + (i == cyc_num - 1) ? 
channels - i * cyc_channel : cyc_channel; + int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T)); + __bang_write_zero(nram_out, align_channel); + uint32_t real_size = real_channel * sizeof(T); + + int iy, ix; + for (iy = 0; iy < roi_bin_grid_h; iy++) { + // 1. compute the coordinates of the y axis in the current roi_bin_grid_h + T y = roi_start_h + ph * bin_size_h + + (T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h); + for (ix = 0; ix < roi_bin_grid_w; ix++) { + // 2. compute the coordinates of the x axis in the current + // roi_bin_grid_w + T x = roi_start_w + pw * bin_size_w + + (T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w); + + // 3. compute the four weights (w1, w2, w3 and w4), the height (y_low + // and y_high) and weight (x_low and x_high) of input feature map in + // the current roi bin grid, and the flag (empty) which shows if x, y + // are out of input feature map ranges + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bool empty = false; + + bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4, + &x_low, &x_high, &y_low, &y_high, &empty); + + // 4. compute interpolation of the current roi bin grid + // tmp_cyc1, temp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values + // to compute the interpolation, and then reused to compute + // the argmax_x and argmax_y. + T *tmp_cyc1 = nram_in + cyc_channel; + T *tmp_cyc2 = nram_in + cyc_channel * 2; + T *tmp_cyc3 = nram_in + cyc_channel * 3; + T *tmp_cyc4 = nram_in + cyc_channel * 4; + + if (empty) { // exits abnormal values + __bang_write_zero(nram_in, align_channel); + } else { + __bang_write_zero(nram_in, align_channel); + uint32_t offset1 = (y_low * input_width + x_low) * channels; + uint32_t offset2 = (y_low * input_width + x_high) * channels; + uint32_t offset3 = (y_high * input_width + x_low) * channels; + uint32_t offset4 = (y_high * input_width + x_high) * channels; + T *input1 = (T *)input_core + offset1 + i * cyc_channel; + T *input2 = (T *)input_core + offset2 + i * cyc_channel; + T *input3 = (T *)input_core + offset3 + i * cyc_channel; + T *input4 = (T *)input_core + offset4 + i * cyc_channel; + + // load the four pixels (p1, p2, p3 and p4) of input feature map to + // compute interpolation + __memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM); + + // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4 + __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel); + __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel); + __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel); + __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel); + + __bang_add(nram_in, tmp_cyc1, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc2, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc3, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc4, nram_in, align_channel); + } + // 5. compute sum value and corresponding coordinates of x axis and y + // axis. Update the sum value. 
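+          // At this point nram_in holds w1*p1 + w2*p2 + w3*p3 + w4*p4 for the
+          // current sample; accumulating it into nram_out across all
+          // roi_bin_grid_h * roi_bin_grid_w samples and scaling by 1/count
+          // below gives the averaged value of this output bin.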
+ __bang_add(nram_out, nram_in, nram_out, align_channel); + } // loop_roi_grid_w + } // loop_roi_grid_h + T count_value = (T)(1.0 / count); + __bang_mul_const(nram_out, nram_out, count_value, align_channel); + __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM); + } // loop_cyc_num +} + +template +__mlu_func__ void roialignForwardAvg( + T *input, T *rois, T *output, const bool aligned, const int channels, + const int pooled_height, const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, const T spatial_scale, + const int num_rois) { + // find limit for channel, the nram space is divided to 6 parts that are + // input, 4 weights to compute the interpolation (w1, w2, w3, w4), output + + // max_elements : 300 : float datatype : 27296, half datatype : 54592 + // max_elements : 200 : float datatype : 16384, half datatype : 32768 + int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T); + int cyc_num = channels / max_elements + (int)(channels % max_elements != 0); + T offset = aligned ? (T)0.5 : (T)0.0; + int task_num = num_rois * pooled_height * pooled_width; + T *nram_out = (T *)buffer; + T *nram_in = nram_out + max_elements; + if (task_num < taskDim) { + if (taskId >= task_num) { + return; + } + } + + for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) { + if (bin_idx >= task_num) { + return; + } + + // (n,ph.pw) is a c in the pooled output + int pw = bin_idx % pooled_width; + int ph = (bin_idx / pooled_width) % pooled_height; + int n = bin_idx / pooled_width / pooled_height; + + T *roi_id_tmp = rois + n * ROI_OFFSET; + // 1. compute width and height of roi region. + int batch_idx = (int)roi_id_tmp[0]; + T roi_x1 = roi_id_tmp[1]; + T roi_y1 = roi_id_tmp[2]; + T roi_x2 = roi_id_tmp[3]; + T roi_y2 = roi_id_tmp[4]; + T roi_start_w = roi_x1 * spatial_scale - offset; + T roi_start_h = roi_y1 * spatial_scale - offset; + T roi_end_w = roi_x2 * spatial_scale - offset; + T roi_end_h = roi_y2 * spatial_scale - offset; + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + + if (!aligned) { + roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0); + roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0); + } + + // 2. compute float-type width and height of roi bin region. + T bin_size_w = (T)roi_width / (T)pooled_width; + T bin_size_h = (T)roi_height / (T)pooled_height; + + // 3. compute int-type width and height of roi bin region. + int roi_bin_grid_h, roi_bin_grid_w; + roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : int(ceilf(roi_height / pooled_height)); + roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : int(ceilf(roi_width / pooled_width)); + float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1 + ? roi_bin_grid_h * roi_bin_grid_w + : 1.0); + T *input_core = input + batch_idx * channels * input_width * input_height; + T *output_core = output + bin_idx * channels; + // 4. compute avg value and corresponding coordinates of x axis and y axis. 
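+    // computeChannel averages the roi_bin_grid_h * roi_bin_grid_w bilinear
+    // samples of this bin for every channel, splitting the channel dimension
+    // into cyc_num chunks of at most max_elements so each chunk fits in NRAM.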
+ computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h, + bin_size_w, count, input_height, input_width, channels, + cyc_num, max_elements); + } +} + +__mlu_global__ void MLUUnion1KernelRoiAlignAvg( + const void *input, const void *rois, const int channels, const bool aligned, + const int pooled_height, const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, const float spatial_scale, + const int num_rois, const cnrtDataType_t data_type, void *output) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + + switch (data_type) { + case CNRT_FLOAT16: { + roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned, + channels, pooled_height, pooled_width, input_height, + input_width, sampling_ratio, + (half)spatial_scale, num_rois); + }; break; + case CNRT_FLOAT32: { + roialignForwardAvg((float *)input, (float *)rois, (float *)output, + aligned, channels, pooled_height, pooled_width, + input_height, input_width, sampling_ratio, + (float)spatial_scale, num_rois); + }; break; + default: + break; + } + + return; +} +} // namespace forward + +namespace backward { +__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y, + float x, float *w1, float *w2, + float *w3, float *w4, int *x_low, + int *x_high, int *y_low, + int *y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + *w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0; + *x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1; + return; + } + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + *y_low = (int)y; + *x_low = (int)x; + if (*y_low >= height - 1) { + *y_high = height - 1, *y_low = height - 1; + y = (float)(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = width - 1, *x_low = width - 1; + x = (float)(*x_low); + } else { + *x_high = *x_low + 1; + } + float ly = y - *y_low, lx = x - *x_low; + float hy = 1.0 - ly, hx = 1.0 - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + return; +} + +template +__mlu_func__ void unionRoiAlignBp( + T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi, + const int wi, const int c, const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, const bool aligned) { + int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); + int deal_all = boxes_num * hi * wi; + int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim); + for (int i = 0; i < deal_this_core; ++i) { + int bhw_id = i * taskDim + taskId; + int box_id = bhw_id / (hi * wi); + int ih = (bhw_id / wi) % hi; + int iw = bhw_id % wi; + T *box = boxes + box_id * 5; + int image_id = (int)box[0]; + T *image_offset = grads_image + image_id * ho * wo * c; + T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c; + + float offset = aligned ? 0.5 : 0.0; + float x1 = box[1] * spatial_scale - offset; + float y1 = box[2] * spatial_scale - offset; + float x2 = box[3] * spatial_scale - offset; + float y2 = box[4] * spatial_scale - offset; + float roi_width = x2 - x1; + float roi_height = y2 - y1; + if (!aligned) { + roi_width = (roi_width > 1.0) ? roi_width : 1.0; + roi_height = (roi_height > 1.0) ? roi_height : 1.0; + } + float bin_size_h = roi_height / hi; + float bin_size_w = roi_width / wi; + + int roi_grid_h = + (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi); + int roi_grid_w = + (sampling_ratio > 0) ? 
sampling_ratio : std::ceil(roi_width / wi); + const T count = roi_grid_h * roi_grid_w; + if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) { + for (int iy = 0; iy < roi_grid_h; ++iy) { + const float y = + y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; + for (int ix = 0; ix < roi_grid_w; ++ix) { + const float x = + x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; + float w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, + &x_high, &y_low, &y_high); + if (x_low >= 0 && y_low >= 0) { + __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_low * wo * c + x_low * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_low * wo * c + x_high * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_high * wo * c + x_low * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_high * wo * c + x_high * c, + (T *)buffer + c_align, c); + } // x_low && y_low + } // ix + } // iy + } else { + for (int iy = 0; iy < roi_grid_h; ++iy) { + const float y = + y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; + for (int ix = 0; ix < roi_grid_w; ++ix) { + const float x = + x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; + float w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, + &x_high, &y_low, &y_high); + if (x_low >= 0 && y_low >= 0) { + int deal_once = + PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T); + int c_repeat = c / deal_once + (int)(c % deal_once != 0); + for (int i = 0; i < c_repeat; ++i) { + int deal_c = deal_once; + int align_c = deal_once; + if (i == c_repeat - 1) { + deal_c = c - i * deal_once; + align_c = c_align - i * deal_once; + } + __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T), + GDRAM2NRAM); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_low * wo * c + x_low * c + i * deal_once, + (T *)buffer + align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_low * wo * c + x_high * c + i * deal_once, + (T *)buffer + align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_high * wo * c + x_low * c + i * deal_once, + (T *)buffer + 
align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_high * wo * c + x_high * c + i * deal_once, + (T *)buffer + align_c, deal_c); + } // for c_repeat + } // x_low >= 0 && y_low >= 0 + } // ix + } // iy + } // if c + } // i +} + +__mlu_global__ void MLUUnion1KernelRoiAlignBackward( + const void *grads, const void *boxes, void *grads_image, + const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi, + const int c, const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, const bool aligned) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (dtype) { + case CNRT_FLOAT16: { + unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image, + boxes_num, hi, wi, c, no, ho, wo, spatial_scale, + sampling_ratio, aligned); + }; break; + case CNRT_FLOAT32: { + unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image, + boxes_num, hi, wi, c, no, ho, wo, spatial_scale, + sampling_ratio, aligned); + }; break; + default: { return; } + } +} +} // namespace backward + +void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *input, const void *rois, const int channels, + const bool aligned, const int pooled_height, + const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, + const float spatial_scale, const int num_rois, + void *output) { + forward::MLUUnion1KernelRoiAlignAvg<<>>( + input, rois, channels, aligned, pooled_height, pooled_width, input_height, + input_width, sampling_ratio, spatial_scale, num_rois, d_type, output); +} + +void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t dtype, + const void *grads, const void *boxes, + void *grads_image, const int boxes_num, + const int hi, const int wi, const int c, + const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, + const bool aligned) { + backward::MLUUnion1KernelRoiAlignBackward<<>>( + grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo, + spatial_scale, sampling_ratio, aligned); +} diff --git a/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7f05b525a0b278e7593db76faee8fa782df4bc38 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu @@ -0,0 +1,472 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * OR IMPLIED, INCLUDING BUvoid NOKType LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENvoid SHALL THE AUTHORS OR COPYRIGHKType HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORvoid OR OTHERWISE, ARISING FROM, OUKType OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" +#include "roi_align_rotated_utils.hpp" + +#define ROI_OFFSET 6 +#define SAMPLING_NUM 4 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +template +__mlu_func__ void swap(T &a, T &b) { + T tmp = a; + a = b; + b = tmp; +} + +template +__mlu_func__ void bilinearInterpolate(const int input_height, + const int input_width, T x, T y, + const T zero_sign, T *w1, T *w2, T *w3, + T *w4, int *x_low, int *x_high, + int *y_low, int *y_high, bool *empty) { + // deal with case that the point is out of feature map boundary + if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { + *empty = true; + return; + } + + if (y <= 0) y = (T)0; + if (x <= 0) x = (T)0; + + *y_low = int(y); + *x_low = int(x); + + if (*y_low >= input_height - 1) { + *y_high = *y_low = input_height - 1; + y = (T)(*y_low); + } else { + *y_high = *y_low + 1; + } + + if (*x_low >= input_width - 1) { + *x_high = *x_low = input_width - 1; + x = T(*x_low); + } else { + *x_high = *x_low + 1; + } + T ly = y - *y_low; + T lx = x - *x_low; + T hy = 1.0 - ly; + T hx = 1.0 - lx; + *w1 = hy * hx * zero_sign; + *w2 = hy * lx * zero_sign; + *w3 = ly * hx * zero_sign; + *w4 = ly * lx * zero_sign; +} + +template +__mlu_func__ void getRoiBinInfo(const T *rois_dram, const int bin_i, + const RoiAlignRotatedParams ¶ms, + int *batch_idx, int *roi_n, int *pw, int *ph, + T *roi_center_x, T *roi_center_y, T *roi_width, + T *roi_height, T *theta) { + T offset = params.aligned ? (T)0.5 : (T)0.0; + *pw = bin_i % params.pooled_width; + *ph = (bin_i / params.pooled_width) % params.pooled_height; + *roi_n = bin_i / params.pooled_width / params.pooled_height; + const T *roi_info = rois_dram + (*roi_n) * ROI_OFFSET; + *batch_idx = (int)roi_info[0]; + *roi_center_x = roi_info[1] * (T)params.spatial_scale - offset; + *roi_center_y = roi_info[2] * (T)params.spatial_scale - offset; + *roi_width = roi_info[3] * (T)params.spatial_scale; + *roi_height = roi_info[4] * (T)params.spatial_scale; + *theta = roi_info[5]; + if (params.clockwise) { + *theta = -(*theta); + } + if (!params.aligned) { + *roi_width = *roi_width > (T)1.0 ? *roi_width : (T)1.0; + *roi_height = *roi_height > (T)1.0 ? *roi_height : (T)1.0; + } +} + +template +__mlu_func__ void roiAlignRotatedForward(const T *input_dram, + const T *rois_dram, const int batch, + const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams ¶ms, + T *output_dram) { + int align_base_128 = NFU_ALIGN_SIZE / sizeof(T); + int channel_max_cap = MAX_NRAM_SIZE / sizeof(T) / (2 * SAMPLING_NUM + 1); + channel_max_cap = channel_max_cap / align_base_128 * align_base_128; + int channel_align = channel < channel_max_cap ? 
channel : channel_max_cap; + channel_align = CEIL_ALIGN(channel_align, align_base_128); + + T *nram_out = (T *)nram_buffer; + T *nram_ping = nram_out + channel_align; + T *nram_pong = nram_ping + channel_align * SAMPLING_NUM; + + int bin_first = taskId; + int bin_end = rois_num * params.pooled_height * params.pooled_width; + + for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) { + T roi_center_x, roi_center_y, roi_width, roi_height, theta; + int batch_idx, roi_n, pw, ph; + getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph, + &roi_center_x, &roi_center_y, &roi_width, &roi_height, + &theta); + T bin_size_h = roi_height / params.pooled_height; + T bin_size_w = roi_width / params.pooled_width; + + int roi_bin_grid_h = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_height / params.pooled_height); + int roi_bin_grid_w = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_width / params.pooled_width); + T roi_start_y = -roi_height / 2; + T roi_start_x = -roi_width / 2; + const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1 + ? roi_bin_grid_h * roi_bin_grid_w + : 1; + T cos_theta = std::cos(theta); + T sin_theta = std::sin(theta); + T zero_sign = 1.0f / bin_dim; + + bool is_first_sample = true; + int src_offset = 0; + int dst_offset = 0; + int c_rem, c_slice, c_slice_align, pongc_slice, pongc_slice_align; + for (int c_offset = 0; c_offset < channel; c_offset += channel_align) { + __nramset(nram_out, channel_align, (T)0); + c_rem = channel - c_offset; + c_slice = channel_align > c_rem ? c_rem : channel_align; + c_slice_align = CEIL_ALIGN(c_slice, align_base_128); + is_first_sample = true; + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T yy = roi_start_y + ph * bin_size_h + + T(iy + 0.5) * bin_size_h / roi_bin_grid_h; + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T xx = roi_start_x + pw * bin_size_w + + T(ix + 0.5) * bin_size_w / roi_bin_grid_w; + int sample_i = iy * roi_bin_grid_w + ix; + + T y = yy * cos_theta - xx * sin_theta + roi_center_y; + T x = yy * sin_theta + xx * cos_theta + roi_center_x; + T w1, w2, w3, w4; + bool empty = false; + int x_low, x_high, y_low, y_high; + bilinearInterpolate(height, width, x, y, zero_sign, &w1, &w2, &w3, + &w4, &x_low, &x_high, &y_low, &y_high, &empty); + int sample_wdim = x_high - x_low + 1; + /******************************************************* + | ping | pong | + |------|-----|-----|-----|-----|-----|-----|-----|-----| + |output| p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 | + |------|-----|-----|-----|-----|-----|-----|-----|-----| + ********************************************************/ + if (is_first_sample && !empty) { + // load input data from dram to nram + __nramset(nram_ping, SAMPLING_NUM * c_slice_align, (T)0); + for (int h = y_low; h <= y_high; ++h) { + src_offset = + (batch_idx * height * width + h * width + x_low) * channel + + c_offset; + dst_offset = (h - y_low) * SAMPLING_NUM * c_slice_align / 2; + if (c_slice_align == channel) { + __memcpy(nram_ping + dst_offset, input_dram + src_offset, + sample_wdim * channel * sizeof(T), GDRAM2NRAM); + } else { + __memcpy(nram_ping + dst_offset, input_dram + src_offset, + c_slice * sizeof(T), GDRAM2NRAM, + c_slice_align * sizeof(T), channel * sizeof(T), + sample_wdim - 1); + } + } + } + // load next input data to nram + if (sample_i + 1 < bin_dim) { + int p_iy = (sample_i + 1) / roi_bin_grid_w; + int p_ix = (sample_i + 1) % roi_bin_grid_w; + const T p_yy = roi_start_y + ph * bin_size_h + 
+ T(p_iy + 0.5) * bin_size_h / roi_bin_grid_h; + const T p_xx = roi_start_x + pw * bin_size_w + + T(p_ix + 0.5) * bin_size_w / roi_bin_grid_w; + T p_y = p_yy * cos_theta - p_xx * sin_theta + roi_center_y; + T p_x = p_yy * sin_theta + p_xx * cos_theta + roi_center_x; + T p_w1, p_w2, p_w3, p_w4; + bool p_empty = false; + int p_x_low, p_x_high, p_y_low, p_y_high; + bilinearInterpolate(height, width, p_x, p_y, zero_sign, &p_w1, + &p_w2, &p_w3, &p_w4, &p_x_low, &p_x_high, + &p_y_low, &p_y_high, &p_empty); + int p_sample_wdim = p_x_high - p_x_low + 1; + pongc_slice = c_slice; + pongc_slice_align = c_slice_align; + if (!p_empty) { + __nramset(nram_pong, SAMPLING_NUM * pongc_slice_align, (T)0); + for (int h = p_y_low; h <= p_y_high; ++h) { + src_offset = + (batch_idx * height * width + h * width + p_x_low) * + channel + + c_offset; + dst_offset = + (h - p_y_low) * SAMPLING_NUM * pongc_slice_align / 2; + if (pongc_slice_align == channel) { + __memcpy_async( + nram_pong + dst_offset, input_dram + src_offset, + p_sample_wdim * channel * sizeof(T), GDRAM2NRAM); + } else { + __memcpy_async(nram_pong + dst_offset, + input_dram + src_offset, + pongc_slice * sizeof(T), GDRAM2NRAM, + pongc_slice_align * sizeof(T), + channel * sizeof(T), p_sample_wdim - 1); + } + } + } + } + T *tmp_sum = nram_ping + 3 * c_slice_align; + if (empty) { + __nramset(tmp_sum, c_slice_align, T(0)); + } else { + __bang_mul_const(nram_ping, nram_ping, w1, c_slice_align); + __bang_mul_const(nram_ping + c_slice_align, + nram_ping + c_slice_align, w2, c_slice_align); + __bang_mul_const(nram_ping + 2 * c_slice_align, + nram_ping + 2 * c_slice_align, w3, c_slice_align); + __bang_mul_const(nram_ping + 3 * c_slice_align, + nram_ping + 3 * c_slice_align, w4, c_slice_align); + __bang_sumpool(tmp_sum, nram_ping, c_slice_align, 1, SAMPLING_NUM, + 1, SAMPLING_NUM, 1, 1); + } + __bang_add(nram_out, nram_out, tmp_sum, c_slice_align); + swap(nram_ping, nram_pong); + + __asm__ volatile("sync;"); + is_first_sample = false; + } + } + // store the result to dram + int output_offset = + ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * + channel + + c_offset; + __memcpy(output_dram + output_offset, nram_out, c_slice * sizeof(T), + NRAM2GDRAM); + } + } +} + +template +__mlu_func__ void roiAlignRotatedBackward(const T *top_grad_dram, + const T *rois_dram, const int batch, + const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams ¶ms, + T *bottom_grad_dram) { + int align_base_128 = NFU_ALIGN_SIZE / sizeof(T); + int channel_align = CEIL_ALIGN(channel, align_base_128); + + unsigned int max_element = MAX_NRAM_SIZE / sizeof(T); + int c_limit = max_element >> 2; + c_limit = c_limit > channel_align ? 
channel_align : c_limit; + + T *nram_ping = (T *)nram_buffer; + T *nram_pong = nram_ping + 2 * c_limit; + T *nram_output = nullptr; + + int bin_first = taskId; + int bin_end = rois_num * params.pooled_height * params.pooled_width; + bool is_first_bin = true; + T roi_center_x, roi_center_y, roi_width, roi_height, theta; + int batch_idx, roi_n, pw, ph; + T pong_roi_center_x, pong_roi_center_y, pong_roi_width, pong_roi_height, + pong_theta; + int pong_batch_idx, pong_roi_n, pong_pw, pong_ph; + for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) { + getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph, + &roi_center_x, &roi_center_y, &roi_width, &roi_height, + &theta); + T bin_size_h = roi_height / params.pooled_height; + T bin_size_w = roi_width / params.pooled_width; + + int roi_bin_grid_h = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_height / params.pooled_height); + int roi_bin_grid_w = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_width / params.pooled_width); + T roi_start_y = -roi_height / 2; + T roi_start_x = -roi_width / 2; + const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1 + ? roi_bin_grid_h * roi_bin_grid_w + : 1; + T cos_theta = std::cos(theta); + T sin_theta = std::sin(theta); + T zero_sign = 1.0f / bin_dim; + + int c_rem, c_slice, pongc_slice, c_offset; + c_rem = channel; + c_offset = 0; + /**************************************** + | ping | pong | + |---------|---------|---------|---------| + | input | output | input | output | + |---------|---------|---------|---------| + *****************************************/ + if (is_first_bin) { + // load the first top_grad to nram + c_slice = c_limit < c_rem ? c_limit : c_rem; + int top_grad_offset = + ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * + channel; + __memcpy(nram_ping, top_grad_dram + top_grad_offset, c_slice * sizeof(T), + GDRAM2NRAM); + } + nram_output = nram_ping + c_limit; + while (c_rem > 0) { + c_slice = c_slice < c_rem ? c_slice : c_rem; + // load the next top_grad to nram + if (c_rem - c_slice > 0) { + // load the rest channels to nram + pongc_slice = (c_rem - c_slice > c_slice) ? c_slice : c_rem - c_slice; + int top_grad_offset = + ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * + channel + + c_offset + c_slice; + __memcpy_async(nram_pong, top_grad_dram + top_grad_offset, + pongc_slice * sizeof(T), GDRAM2NRAM); + } else if (bin_i + taskDim < bin_end) { + // load next bin's data to nram + getRoiBinInfo(rois_dram, bin_i + taskDim, params, &pong_batch_idx, + &pong_roi_n, &pong_pw, &pong_ph, &pong_roi_center_x, + &pong_roi_center_y, &pong_roi_width, &pong_roi_height, + &pong_theta); + pongc_slice = c_limit < channel ? 
c_limit : channel; + int top_grad_offset = ((pong_roi_n * params.pooled_height + pong_ph) * + params.pooled_width + + pong_pw) * + channel; + __memcpy_async(nram_pong, top_grad_dram + top_grad_offset, + c_slice * sizeof(T), GDRAM2NRAM); + } + // comput the output in a single bin + + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T yy = roi_start_y + ph * bin_size_h + + T(iy + 0.5) * bin_size_h / roi_bin_grid_h; + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T xx = roi_start_x + pw * bin_size_w + + T(ix + 0.5) * bin_size_w / roi_bin_grid_w; + T y = yy * cos_theta - xx * sin_theta + roi_center_y; + T x = yy * sin_theta + xx * cos_theta + roi_center_x; + T w1, w2, w3, w4; + bool empty = false; + int x_low, x_high, y_low, y_high; + bilinearInterpolate(height, width, x, y, zero_sign, &w1, &w2, &w3, + &w4, &x_low, &x_high, &y_low, &y_high, &empty); + if (empty) { + continue; + } else { + __bang_mul_const(nram_output, nram_ping, w1, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_low * width * channel + x_low * channel + c_offset, + (T *)nram_output, c_slice); + __bang_mul_const(nram_output, nram_ping, w2, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_low * width * channel + x_high * channel + c_offset, + (T *)nram_output, c_slice); + __bang_mul_const(nram_output, nram_ping, w3, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_high * width * channel + x_low * channel + c_offset, + (T *)nram_output, c_slice); + __bang_mul_const(nram_output, nram_ping, w4, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_high * width * channel + x_high * channel + c_offset, + (T *)nram_output, c_slice); + } + } + } + swap(nram_ping, nram_pong); + c_rem -= c_slice; + c_offset += c_slice; + __asm__ volatile("sync;"); + } + is_first_bin = false; + } +} + +__mlu_global__ void MLUUnion1KernelRoiAlignRotatedForward( + const void *features, const void *rois, void *output, const int batch, + const int height, const int width, const int channel, const int rois_num, + const RoiAlignRotatedParams rroiAlignParams, + const cnrtDataType_t data_type) { + if (0x80 == coreId) { + return; + } + + if (data_type == CNRT_FLOAT32) { + roiAlignRotatedForward((float *)features, (float *)rois, batch, height, + width, channel, rois_num, rroiAlignParams, + (float *)output); + } else { + roiAlignRotatedForward((half *)features, (half *)rois, batch, height, width, + channel, rois_num, rroiAlignParams, (half *)output); + } +} + +__mlu_global__ void MLUUnion1KernelRoiAlignRotatedBackward( + const void *top_grad, const void *rois, void *bottom_grad, const int batch, + const int height, const int width, const int channel, const int rois_num, + const RoiAlignRotatedParams rroiAlignParams, + const cnrtDataType_t data_type) { + if (0x80 == coreId) { + return; + } + + if (data_type == CNRT_FLOAT32) { + roiAlignRotatedBackward((float *)top_grad, (float *)rois, batch, height, + width, channel, rois_num, rroiAlignParams, + (float *)bottom_grad); + } else { + roiAlignRotatedBackward((half *)top_grad, (half *)rois, batch, height, + width, channel, rois_num, rroiAlignParams, + (half *)bottom_grad); + } +} + +void KernelRoiAlignRotatedForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t d_type, const void *features, const void *rois, + 
void *output, const int batch, const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams roiAlignRotatedParams) { + MLUUnion1KernelRoiAlignRotatedForward<<>>( + features, rois, output, batch, height, width, channel, rois_num, + roiAlignRotatedParams, d_type); +} + +void KernelRoiAlignRotatedBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t d_type, const void *top_grad, const void *rois, + void *bottom_grad, const int batch, const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams roiAlignRotatedParams) { + MLUUnion1KernelRoiAlignRotatedBackward<<>>( + top_grad, rois, bottom_grad, batch, height, width, channel, rois_num, + roiAlignRotatedParams, d_type); +} diff --git a/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp b/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cd0ec02484fef395db7d401976d64f9c5ca59622 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp @@ -0,0 +1,24 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef ROI_ALIGN_ROTATED_UTILS_HPP_ +#define ROI_ALIGN_ROTATED_UTILS_HPP_ + +struct RoiAlignRotatedParams { + int pooled_height; + int pooled_width; + int sample_ratio; + float spatial_scale; + bool aligned; + bool clockwise; +}; + +#endif // ROI_ALIGN_ROTATED_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7186cdfac3e93677ed2727234a71def607fcd79b --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu @@ -0,0 +1,749 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define ALIGN_SIZE 64 +#define PIPELINE_COMMON_NUM 2 +#define PIPELINE_PINGPONG_NUM 10 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height, + int width, int channels, int p_height, + int p_width, T spatial_scale, int *bin_x1, + int *bin_y1, int *bin_x2, int *bin_y2, + int *bin_wdim, int *bin_hdim, int *bin_dims, + T **input_base, bool *is_empty) { + int pw = bin_i % p_width; + int ph = (bin_i / p_width) % p_height; + int roi_n = bin_i / p_width / p_height; + + /*roi*/ + const T *roi_info = rois_v + roi_n * 5; // {{batch, x1, y1, x2, y2},,,} + int batch_index = (int)roi_info[0]; + int roi_x1 = round(roi_info[1] * spatial_scale); + int roi_y1 = round(roi_info[2] * spatial_scale); + int roi_x2 = round(roi_info[3] * spatial_scale); + int roi_y2 = round(roi_info[4] * spatial_scale); + int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1; + int roi_h = roi_y2 - roi_y1 + 1 > 1 ? roi_y2 - roi_y1 + 1 : 1; + + /*bin*/ + T bin_w = (T)roi_w / (T)p_width; + T bin_h = (T)roi_h / (T)p_height; + + *bin_x1 = (int)floor((T)pw * bin_w) + roi_x1; + *bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0; + *bin_x1 = *bin_x1 < width ? *bin_x1 : width; + + *bin_y1 = (int)floor((T)ph * bin_h) + roi_y1; + *bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0; + *bin_y1 = *bin_y1 < height ? *bin_y1 : height; + + *bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1; + *bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0; + *bin_x2 = *bin_x2 < width ? *bin_x2 : width; + + *bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1; + *bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0; + *bin_y2 = *bin_y2 < height ? *bin_y2 : height; + + *input_base = input_v + batch_index * height * width * channels; + *bin_wdim = *bin_x2 - *bin_x1; + *bin_hdim = *bin_y2 - *bin_y1; + *bin_dims = (*bin_hdim) * (*bin_wdim); + *is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1); +} + +template +__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, + int channels, int height, int width, + int p_height, int p_width, int rois_num, + T spatial_scale, T *output_v, int *argmax) { + /* + * NRAM partition + * |---------------------------------------------------| + * | ping | + * |---------------------------------------------------| + * | pong | + * |---------------------------------------------------| + * | out | + * |---------------------------------------------------| + * | argmax | + * |---------------------------------------------------| + * | a | + * |---------------------------------------------------| + * | b | + * |---------------------------------------------------| + */ + uint32_t is_half = sizeof(T) == sizeof(half) ? 
true : false; + uint32_t t_size = sizeof(T); + uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float); + uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half); + + uint32_t channels_align = PAD_UP(channels, float_div); + uint32_t nram_limit = PAD_DOWN( + (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div); + + // nram PING/PONG, output, argamx, a, b + float *nram_ping = (float *)nram_buffer; + float *nram_pong = (float *)nram_buffer + nram_limit; + float *nram_out = (float *)nram_buffer + 2 * nram_limit; + float *nram_argmax = nram_out + channels_align; + float *nram_a = nram_out + 2 * channels_align; + float *nram_b = nram_out + 3 * channels_align; + + uint32_t c_bins_num = rois_num * p_height * p_width; + uint32_t task_bins = c_bins_num / taskDim; + uint32_t rem_bins = c_bins_num % taskDim; + if (taskId < rem_bins) { + task_bins += 1; + } + int bin_first = + (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId); + int bins_loop = bin_first + task_bins; + + T *input_base = NULL; + T *output_base = output_v + bin_first * channels; + int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL; + int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims; + int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims; + bool is_empty = false; + bool pong_is_empty = false; + bool is_first_bin = true; + uint32_t src_offset = 0; + uint32_t dst_offset = 0; + uint32_t nram_offset = 0; + uint32_t half_offset = + is_half ? (nram_limit / 2 / half_div * half_div) * 2 : 0; + float *nram_tmp = NULL; + + uint32_t c_slice = 0; + uint32_t c_slice_align = 0; + uint32_t pongc_slice = 0; + uint32_t pongc_slice_align = 0; + for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) { + getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels, + p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1, + &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims, + &input_base, &is_empty); + uint32_t c_rem = channels; + c_slice = nram_limit / bin_dims / float_div * float_div; + + if (is_first_bin && !is_empty) { + c_slice = c_slice > c_rem ? c_rem : c_slice; + c_slice_align = PAD_UP(c_slice, float_div); + for (int h = bin_y1; h < bin_y2; h++) { + src_offset = (h * width + bin_x1) * channels; + nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset; + if (c_slice_align == channels) { + __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, + bin_wdim * c_slice * t_size, GDRAM2NRAM); + } else { + __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, + c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size, + channels * t_size, bin_wdim - 1); + } + } + } + uint32_t c_offset = 0; + while (c_rem > 0) { + c_slice = c_slice > c_rem ? c_rem : c_slice; + c_slice_align = PAD_UP(c_slice, float_div); + + /*__memcpy_async*/ + if (c_rem - c_slice > 0 && !is_empty) { + pongc_slice = c_rem - c_slice > c_slice ? 
c_slice : c_rem - c_slice; + pongc_slice_align = PAD_UP(pongc_slice, float_div); + for (int h = bin_y1; h < bin_y2; h++) { + src_offset = (h * width + bin_x1) * channels + c_offset; + nram_offset = + (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset; + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset + c_slice, + pongc_slice * t_size, GDRAM2NRAM, + pongc_slice_align * t_size, channels * t_size, + bin_wdim - 1); + } + } else if (bin_i + 1 < bins_loop) { + getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width, + channels, p_height, p_width, (T)spatial_scale, &pbin_x1, + &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim, + &pbin_dims, &input_base, &pong_is_empty); + pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div); + pongc_slice = pongc_slice > channels ? channels : pongc_slice; + pongc_slice_align = PAD_UP(pongc_slice, float_div); + if (!pong_is_empty) { + for (int h = pbin_y1; h < pbin_y2; h++) { + src_offset = (h * width + pbin_x1) * channels; + nram_offset = + (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset; + if (pongc_slice_align == channels) { + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset, + pbin_wdim * pongc_slice * t_size, GDRAM2NRAM); + } else { + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset, pongc_slice * t_size, + GDRAM2NRAM, pongc_slice_align * t_size, + channels * t_size, pbin_wdim - 1); + } + } + } + } + + if (is_empty) { + __nramset((T *)nram_out, c_slice_align, (T)0); + __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, + c_slice * t_size, NRAM2GDRAM); + if (NULL != argmax) { + __nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); + __memcpy((int32_t *)argmax_base + dst_offset + c_offset, + (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); + } + } else { + if (is_half) { + uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div); + __bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset, + bin_align64); + } + __bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align, + bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1); + if (is_half) { + uint32_t c_align64 = PAD_UP(c_slice_align, half_div); + __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64); + } + __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, + c_slice * t_size, NRAM2GDRAM); + if (NULL != argmax) { + /*compute max_index*/ + __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping, + c_slice_align, bin_hdim, bin_wdim, bin_hdim, + bin_wdim, 1, 1); + convertInt2Float((float *)nram_argmax, (float *)nram_a, + (int32_t *)nram_out, (float *)nram_b, c_slice_align); + + /*compute input_h*/ + for (int i = 0; i < c_slice; i++) { + nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); + } + __bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1, + c_slice_align); + __bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width, + c_slice_align); + + /*compute input_w*/ + __bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim, + c_slice_align); + __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, + c_slice_align); + __bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1, + c_slice_align); + __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, + c_slice_align); + convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, + (float *)nram_out, (float *)nram_b, c_slice_align); + __memcpy((int32_t *)argmax_base + dst_offset + 
c_offset, + (int32_t *)nram_argmax, c_slice * sizeof(int32_t), + NRAM2GDRAM); + } + } + nram_tmp = nram_ping; + nram_ping = nram_pong; + nram_pong = nram_tmp; + c_offset += c_slice; + c_rem -= c_slice; + __asm__ volatile("sync;"); + } + dst_offset += channels; + is_first_bin = false; + } +} + +__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, + const void *input_data, + const void *input_rois, int batch, + int channels, int height, int width, + int pooled_height, int pooled_width, + int rois_num, float spatial_scale, + void *output_data, int *argmax) { + switch (data_type) { + case CNRT_FLOAT16: { + MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels, + height, width, pooled_height, pooled_width, rois_num, + (half)spatial_scale, (half *)output_data, argmax); + }; break; + case CNRT_FLOAT32: { + MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch, + channels, height, width, pooled_height, pooled_width, + rois_num, (float)spatial_scale, (float *)output_data, + argmax); + }; break; + default: { + break; + } + } +} +} // namespace forward + +namespace backward { +// Convert index of argmax from global grads_image to local bin in RoI. Vector +// operations do not support int type, so conversion from int to float is +// performed here. +__mlu_func__ void convertIndex( + int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1, + int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int, + int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w, + int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w, + float *nram_atomic_add, float *nram_grads_image, int width, int height, + int wstart, int hstart, int w_compute, int h_compute, int align_c, + int channels, int loop_flag, int loop_id, int true_limit) { + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); + + // This step uses scalar division, because the above vector division causes + // rounding accuracy problem. + for (int i = 0; i < channels; ++i) { + *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width; + } + + // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width' + // operation. 
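+  // Overall the global argmax index is split into h = argmax / width and
+  // w = argmax - h * width, then mapped to the bin-local offset
+  // (h - hstart) * w_compute + (w - wstart) that __bang_maxpool_bp expects.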
+ convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1, + (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2, + align_c); + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2, + align_c); + + // Perform 'temp_result - hstart' operation + __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, + align_c); + + // Perform 'temp_result1 - temp_result2 * width' operation + __bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, + align_c); + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); + __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, + (float *)nram_argmax_fp_w, align_c); + + // Perform 'temp_result - wstart' operation + __bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart, + align_c); + + // Perform 'temp_result = h * w_compute + w' operation + __bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + w_compute, align_c); + __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (float *)nram_argmax_fp_w, align_c); + + if (loop_flag == 1) { + __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (loop_id * true_limit), align_c); + } + convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, + (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, + align_c); +} + +template +__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, + const int32_t *argmax, T *grads_image, + int channels, int height, int width, + int pooled_height, int pooled_width, + int rois_num, const T spatial_scale, + int high_precision) { + // Calculate the number of rois processed by each core + int bin_num = rois_num * pooled_height * pooled_width; + int loop = + (bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim); + int tid = taskId * loop; + if (bin_num % taskDim != 0) { + if (tid >= bin_num) { + return; + } else { + // last part is (bin_num - tid). + loop = bin_num - tid < loop ? bin_num - tid : loop; + } + } + int align_c = PAD_UP(channels, ALIGN_SIZE); + // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM. + int data_size = + PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c - + (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) / + 2), + ALIGN_SIZE); + int hw_limit = data_size / align_c; + float *nram_grads = (float *)nram_buffer; + for (int idx = tid; idx < tid + loop; ++idx) { + // (n, ph, pw) is a C in the pooled output + int pw = idx % pooled_width; + int ph = (idx / pooled_width) % pooled_height; + int n = idx / pooled_width / pooled_height; + + const T *offset_rois = (const T *)(rois + n * 5); + int roi_batch_ind = int(offset_rois[0]); + // Calculate the roi region on feature maps + int roi_start_w = round(offset_rois[1] * spatial_scale); + int roi_start_h = round(offset_rois[2] * spatial_scale); + int roi_end_w = round(offset_rois[3] * spatial_scale); + int roi_end_h = round(offset_rois[4] * spatial_scale); + // Force malformed rois to 1x1 + int roi_width = + roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1; + int roi_height = + roi_end_h - roi_start_h + 1 > 1 ? 
roi_end_h - roi_start_h + 1 : 1; + T bin_size_h = (T)roi_height / (T)pooled_height; + T bin_size_w = (T)roi_width / (T)pooled_width; + + // The corresponding bin region + int hstart = int(floor((T)ph * bin_size_h)); + int wstart = int(floor((T)pw * bin_size_w)); + int hend = int(ceil((T)(ph + 1) * bin_size_h)); + int wend = int(ceil((T)(pw + 1) * bin_size_w)); + + // Add roi offsets and clip to input boundaries, min(max(A, B), C); + hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0; + hstart = hstart < height ? hstart : height; + hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0; + hend = hend < height ? hend : height; + wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0; + wstart = wstart < width ? wstart : width; + wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0; + wend = wend < width ? wend : width; + + bool is_empty = (hend <= hstart) || (wend <= wstart); + if (!is_empty) { + int h_compute = hend - hstart; + int w_compute = wend - wstart; + int true_limit = + hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute; + int loop_int = (h_compute * w_compute) / true_limit; + int rem = (h_compute * w_compute) % true_limit; + int32_t *nram_argmax = (int32_t *)nram_grads + align_c; + int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c; + int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; + int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; + int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; + int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; + int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; + int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; + int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; + float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c; + float *nram_grads_image = (float *)nram_atomic_add + align_c; + if (true_limit == h_compute * w_compute) { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |---------------------------------------------------| + * | argmax_temp | + * |---------------------------------------------------| + * | atomic_add | + * |---------------------------------------------------| + * | grads_image | + * |---------------------------------------------------| + */ + + // Load the data from GDRAM to NRAM. + __memcpy((T *)nram_grads + align_c * high_precision, + (const T *)grads + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + align_c * high_precision, + align_c); + } + + __memcpy((int32_t *)nram_argmax, + (const int32_t *)argmax + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(int32_t), GDRAM2NRAM); + + // Perform pooling operation on NRAM. 
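+        // convertIndex turns the global argmax indices into bin-local ones, and
+        // __bang_maxpool_bp then routes each channel's gradient to the position
+        // inside the h_compute x w_compute bin that produced the maximum.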
+ convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart, + hstart, w_compute, h_compute, align_c, channels, 0, 0, 0); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, align_c, h_compute, + w_compute, h_compute, w_compute, h_compute, + w_compute); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, + h_compute * w_compute * align_c); + } + + // Store the result on NRAM back to GDRAM. + for (int hc = 0; hc < h_compute; ++hc) { + for (int wc = 0; wc < w_compute; ++wc) { + T *dst = (T *)nram_atomic_add; + int grad_image_offset = (roi_batch_ind * height * width + + (hc + hstart) * width + wc + wstart) * + channels; + T *src1 = (T *)grads_image + grad_image_offset; + int nram_grads_image_offset = (hc * w_compute + wc) * align_c; + T *src2 = (T *)nram_grads_image + nram_grads_image_offset; + __bang_atomic_add(dst, src1, src2, channels); + } + } + } else if (true_limit > 0) { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |--------------------ping_pong----------------------| + * | argmax_temp | argmax_temp | + * |------------------------|--------------------------| + * | atomic_add | atomic_add | + * |------------------------|--------------------------| + * | grads_image | grads_image | + * |---------------------------------------------------| + */ + + // Load the data from GDRAM to NRAM. + __memcpy((T *)nram_grads + align_c * high_precision, + (const T *)grads + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + align_c * high_precision, + align_c); + } + __memcpy((int32_t *)nram_argmax, + (const int32_t *)argmax + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(int32_t), GDRAM2NRAM); + + int ping_pong = 0; + int ping_pong_offset = + (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2; + for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { + int size = (loop_id == loop_int) ? rem : true_limit; + if (size == 0) { + break; + } + // Perform pooling operation on NRAM. + nram_argmax_fp = + (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset; + nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; + nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; + nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; + nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; + nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; + nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; + nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; + nram_atomic_add = (float *)nram_argmax_fp_w + align_c; + nram_grads_image = (float *)nram_atomic_add + align_c; + int loop_id_1 = loop_id; + int size_1 = ((loop_id_1) == loop_int) ? 
rem : true_limit; + if (size_1 == 0) { + break; + } + convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart, + hstart, w_compute, h_compute, align_c, channels, 1, + loop_id_1, true_limit); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, align_c, size_1, 1, + size_1, 1, size_1, 1); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, size_1 * align_c); + } + + // Store the result on NRAM back to GDRAM. + for (int index_size = 0; index_size < size; ++index_size) { + int h = (loop_id * true_limit + index_size) / w_compute; + int w = (loop_id * true_limit + index_size) % w_compute; + T *dst = (T *)nram_atomic_add; + T *grads_image_n = + (T *)grads_image + roi_batch_ind * height * width * channels; + T *src1 = (T *)grads_image_n + + ((h + hstart) * width + (w + wstart)) * channels; + T *src2 = (T *)nram_grads_image + index_size * align_c; + __bang_atomic_add(dst, src1, src2, channels); + } + ping_pong = 1 - ping_pong; + } + } else { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |--------------------ping_pong----------------------| + * | argmax_temp | argmax_temp | + * |------------------------|--------------------------| + * | atomic_add | atomic_add | + * |------------------------|--------------------------| + * | grads_image | grads_image | + * |---------------------------------------------------| + */ + + int c_limit = + PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) / + (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2), + ALIGN_SIZE); + int loop_int = channels / c_limit; + int rem = channels % c_limit; + int ping_pong = 0; + int ping_pong_offset = + (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2; + for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { + int size = (loop_id == loop_int) ? rem : c_limit; + if (size == 0) { + break; + } + nram_argmax_fp = + (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset; + nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit; + nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit; + nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit; + nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit; + nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit; + nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit; + nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit; + nram_atomic_add = (float *)nram_argmax_fp_w + c_limit; + nram_grads_image = (float *)nram_atomic_add + c_limit; + + // This pipeline loads the data from GDRAM to NRAM. 
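          // Hedged reference for the GDRAM offset used in the copies below:
          // the gradient of this output bin starts at
          //   grads[(n * pooled_height * pooled_width + ph * pooled_width + pw) * channels
          //         + loop_id * c_limit]
          // and only `size` channels (c_limit, or the remainder on the last
          // iteration) are staged per loop, alternating between the two
          // ping-pong NRAM regions.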
+ __memcpy((T *)nram_grads + c_limit * high_precision, + (const T *)grads + + n * pooled_height * pooled_width * channels + + ph * pooled_width * channels + pw * channels + + loop_id * c_limit, + size * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + c_limit * high_precision, + c_limit); + } + __memcpy((int32_t *)nram_argmax, + (const int32_t *)argmax + + n * pooled_height * pooled_width * channels + + ph * pooled_width * channels + pw * channels + + loop_id * c_limit, + size * sizeof(int32_t), GDRAM2NRAM); + + for (int hc = 0; hc < h_compute; ++hc) { + for (int wc = 0; wc < w_compute; ++wc) { + // This pipeline performs pooling operation on NRAM. + convertIndex( + nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart + wc, + hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1, + 1, 1); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, c_limit); + } + // This pipeline stores the result on NRAM back to GDRAM. + T *dst = (T *)nram_atomic_add; + T *grads_image_n = + (T *)grads_image + roi_batch_ind * height * width * channels; + T *src1 = (T *)grads_image_n + + ((hc + hstart) * width + (wc + wstart)) * channels + + loop_id * c_limit; + T *src2 = (T *)nram_grads_image; + __bang_atomic_add(dst, src1, src2, size); + } + } + ping_pong = 1 - ping_pong; + } + } + } + } +} + +__mlu_global__ void MLUKernelRoiPoolBackward( + const void *grads, const void *rois, const int *argmax, void *grads_image, + int rois_num, int pooled_height, int pooled_width, int channels, int no, + int height, int width, const float spatial_scale, + const cnrtDataType_t k_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (k_dtype) { + case CNRT_FLOAT16: { + // Using the float type '__bang_max_pool_bp' instruction to increase the + // bit width. 
+ const int high_precision = 1; + MLUUnion1Roipool((const half *)rois, (const half *)grads, + (const int32_t *)argmax, (half *)grads_image, channels, + height, width, pooled_height, pooled_width, rois_num, + (const half)spatial_scale, high_precision); + }; break; + case CNRT_FLOAT32: { + const int high_precision = 0; + MLUUnion1Roipool((const float *)rois, (const float *)grads, + (const int32_t *)argmax, (float *)grads_image, channels, + height, width, pooled_height, pooled_width, rois_num, + (const float)spatial_scale, high_precision); + }; break; + default: { + break; + } + } +} +} // namespace backward + +void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, cnrtDataType_t data_type, + const void *input_data, const void *input_rois, + const int batch, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const int rois_num, + const float spatial_scale, void *output_data, + int *argmax) { + forward::MLUKernelRoiPool<<>>( + data_type, input_data, input_rois, batch, channels, height, width, + pooled_height, pooled_width, rois_num, spatial_scale, output_data, + argmax); +} + +void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, cnrtDataType_t k_dtype, + const void *grad_output_ptr, const void *rois_ptr, + const int *argmax_ptr, void *grad_input_ptr, + const int box_num, const int pooled_height, + const int pooled_width, const int channels, + const int batch, const int height, const int width, + const float spatial_scale) { + backward::MLUKernelRoiPoolBackward<<>>( + grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num, + pooled_height, pooled_width, channels, batch, height, width, + spatial_scale, k_dtype); +} diff --git a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7cb6df0e5d531afa6c2d548a6f3f7b8a8110da28 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu @@ -0,0 +1,307 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" + +__nram__ char data_nram[MAX_NRAM_SIZE]; + +template +__mlu_func__ void mluMultiKernelTinShift( + const T *input, const int *shifts, T *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel) { + for (int cur_channel_index = taskId; + cur_channel_index < batch_size * channel_size; + cur_channel_index += taskDim) { + int n_index = cur_channel_index / channel_size; + int group_id = cur_channel_index % channel_size / group_channel; + int t_shift = shifts[n_index * group_size + group_id]; + int index = cur_channel_index % channel_size * hw_size + + n_index * time_size * channel_size * hw_size; + __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __asm__ volatile("sync;"); + if (abs(t_shift) >= time_size) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } else { + if (t_shift > 0) { + __memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index, + hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), + channel_size * hw_size * sizeof(T), time_size - 1 - t_shift); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } else { + __memcpy(data_nram, input + (index - t_shift * channel_size * hw_size), + hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), + channel_size * hw_size * sizeof(T), time_size - 1 + t_shift); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } + } + __asm__ volatile("sync;"); + } +} + +template +__mlu_func__ void mluHwSplit(const T *input, const int t_shift, + const int time_size, const int hw_size, + const int channel_size, const int index, + const int cur_sequence_index, + const int max_length_per_core, T *output) { + for (int cur_index = index; cur_index < index + hw_size; + cur_index += max_length_per_core) { + int memcpy_size = max_length_per_core; + if (cur_index + max_length_per_core > index + hw_size) { + memcpy_size = index + hw_size - cur_index; + } + if (cur_sequence_index - t_shift < 0 || + cur_sequence_index - t_shift >= time_size) { + __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), + NRAM2GDRAM); + } else { + __memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size, + memcpy_size * sizeof(T), GDRAM2NRAM); + __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), + NRAM2GDRAM); + } + __asm__ volatile("sync;"); + } +} + +template +__mlu_func__ void mluMultiKernelTinShiftSplitSequence( + const T *input, const int *shifts, T *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const int max_number_hw_per_core, const int max_length_per_core) { + const int tmp_max_number_hw_per_core = + max_number_hw_per_core > 0 ? max_number_hw_per_core : 1; + const int loop_time = time_size / tmp_max_number_hw_per_core + + ((time_size % tmp_max_number_hw_per_core) > 0 ? 
1 : 0); + int segmentime_size = tmp_max_number_hw_per_core; + int res_segment = time_size % tmp_max_number_hw_per_core; + + for (int cur_segment_index = taskId; + cur_segment_index < loop_time * batch_size * channel_size; + cur_segment_index += taskDim) { + int n_index = cur_segment_index / loop_time / channel_size; + int group_id = cur_segment_index / loop_time % channel_size / group_channel; + int t_shift = shifts[n_index * group_size + group_id]; + int index = n_index * time_size * channel_size * hw_size + + (cur_segment_index / loop_time % channel_size) * hw_size + + cur_segment_index % loop_time * segmentime_size * hw_size * + channel_size; + char *dst_gdram2nram = data_nram; + const T *src_gdram2nram = input + index; + int count_gdram2nram = -1; + int count_nram2gdram = -1; + int next_sequence_index = + index / hw_size / channel_size % time_size + segmentime_size; + int cur_sequence_index = index / hw_size / channel_size % time_size; + __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __asm__ volatile("sync;"); + if (max_number_hw_per_core == 0) { + mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, + cur_sequence_index, max_length_per_core, output); + continue; + } + if (abs(t_shift) >= time_size) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + res_segment - 1); + } else { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + } + continue; + } + if (t_shift == 0) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index; + count_gdram2nram = res_segment - 1; + count_nram2gdram = res_segment - 1; + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index; + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } + } else if (t_shift > 0) { + int first_index_cur_channel = + n_index * time_size * channel_size * hw_size + + (cur_segment_index / loop_time % channel_size) * hw_size; + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + dst_gdram2nram = data_nram; + src_gdram2nram = + input + + (index - t_shift * channel_size * hw_size < first_index_cur_channel + ? 
first_index_cur_channel + : index - t_shift * channel_size * hw_size); + count_gdram2nram = res_segment - 1; + count_nram2gdram = res_segment - 1; + if (cur_sequence_index < t_shift && t_shift < next_sequence_index) { + dst_gdram2nram = + data_nram + t_shift % segmentime_size * hw_size * sizeof(T); + count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1; + } + } else { + if (t_shift >= next_sequence_index) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + continue; + } else if (cur_sequence_index < t_shift && + t_shift < next_sequence_index) { + dst_gdram2nram = + data_nram + t_shift % segmentime_size * hw_size * sizeof(T); + src_gdram2nram = input + first_index_cur_channel; + count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1; + count_nram2gdram = segmentime_size - 1; + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index - t_shift * channel_size * hw_size; + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } + } + } else { + int offset_index = time_size + t_shift; + if (cur_sequence_index >= offset_index) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + res_segment - 1); + continue; + } else { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + continue; + } + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index - t_shift * channel_size * hw_size; + if (cur_sequence_index - t_shift + segmentime_size < time_size) { + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } else { + count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1; + count_nram2gdram = + (segmentime_size - 1) < (time_size - cur_sequence_index - 1) + ? 
(segmentime_size - 1) + : (time_size - cur_sequence_index - 1); + } + } + } + __memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM, + hw_size * sizeof(T), channel_size * hw_size * sizeof(T), + count_gdram2nram); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + count_nram2gdram); + __asm__ volatile("sync;"); + } +} + +__mlu_entry__ void MLUUnion1KernelTinShift( + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const cnrtDataType_t data_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (data_dtype) { + case CNRT_FLOAT16: { + mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output, + batch_size, time_size, channel_size, hw_size, + group_size, group_channel); + }; break; + case CNRT_FLOAT32: { + mluMultiKernelTinShift((float *)input, (const int *)shifts, + (float *)output, batch_size, time_size, + channel_size, hw_size, group_size, group_channel); + }; break; + default: { return; } + } +} + +__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence( + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const int max_number_hw_per_core, const int max_length_per_core, + const cnrtDataType_t data_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (data_dtype) { + case CNRT_FLOAT16: { + mluMultiKernelTinShiftSplitSequence( + (half *)input, (const int *)shifts, (half *)output, batch_size, + time_size, channel_size, hw_size, group_size, group_channel, + max_number_hw_per_core, max_length_per_core); + }; break; + case CNRT_FLOAT32: { + mluMultiKernelTinShiftSplitSequence( + (float *)input, (const int *)shifts, (float *)output, batch_size, + time_size, channel_size, hw_size, group_size, group_channel, + max_number_hw_per_core, max_length_per_core); + }; break; + default: { return; } + } +} + +void KernelTinShiftForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core) { + if (channel_per_core >= 1) { + MLUUnion1KernelTinShift<<>>( + input, shifts, output, batch_size, time_size, channel_size, hw_size, + group_size, group_channel, data_dtype); + } else { + MLUUnion1KernelTinShiftSplitSequence<<>>( + input, shifts, output, batch_size, time_size, channel_size, hw_size, + group_size, group_channel, max_number_hw_per_core, max_length_per_core, + data_dtype); + } +} + +void KernelTinShiftBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *grad_output, const void *shifts, void *grad_input, + const int batch_size, const int time_size, const int channel_size, + const int hw_size, const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core) { + if (channel_per_core >= 1) { + MLUUnion1KernelTinShift<<>>( + grad_output, shifts, grad_input, 
batch_size, time_size, channel_size, + hw_size, group_size, group_channel, data_dtype); + } else { + MLUUnion1KernelTinShiftSplitSequence<<>>( + grad_output, shifts, grad_input, batch_size, time_size, channel_size, + hw_size, group_size, group_channel, max_number_hw_per_core, + max_length_per_core, data_dtype); + } +} diff --git a/mmcv/ops/csrc/common/mps/MPSDevice.h b/mmcv/ops/csrc/common/mps/MPSDevice.h new file mode 100644 index 0000000000000000000000000000000000000000..e1d9d49618d7aea6a30b42630350c5a7b77ea0ac --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSDevice.h @@ -0,0 +1,64 @@ +// Copyright © 2022 Apple Inc. + +// This file is modify from: +// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h + +#pragma once +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +typedef id MTLDevice_t; +#else +typedef void* MTLDevice; +typedef void* MTLDevice_t; +#endif + +using namespace std; + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSDevice +// +// MPSDevice is a singleton class that returns the default device +//----------------------------------------------------------------- + +class TORCH_API MPSDevice { + public: + /** + * MPSDevice should not be cloneable. + */ + MPSDevice(MPSDevice& other) = delete; + /** + * MPSDevice should not be assignable. + */ + void operator=(const MPSDevice&) = delete; + /** + * Gets single instance of the Device. + */ + static MPSDevice* getInstance(); + /** + * Returns the single device. + */ + MTLDevice_t device() { return _mtl_device; } + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MPSDevice(); +}; + +TORCH_API bool is_available(); + +TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +} // namespace mps +} // namespace at diff --git a/mmcv/ops/csrc/common/mps/MPSLibrary.h b/mmcv/ops/csrc/common/mps/MPSLibrary.h new file mode 100644 index 0000000000000000000000000000000000000000..41c33fba8cbdd43cc5b3285603c11c6f9eee617b --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSLibrary.h @@ -0,0 +1,61 @@ +#ifndef _MPS_LIBRARY_H_ +#define _MPS_LIBRARY_H_ + +#include +#include + +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLComputePipelineState_t; +typedef id MTLLibrary_t; +#else +typedef void* MTLComputePipelineState; +typedef void* MTLComputePipelineState_t; +typedef void* MTLLibrary; +typedef void* MTLLibrary_t; +#endif + +class MPSLibrary { + public: + // disable constructor for singleton + static MPSLibrary* createFromUrl(const std::string& library_url); + static MPSLibrary* createFromSource(const std::string& source); + ~MPSLibrary(); + + MTLLibrary_t library() { return _library; } + + MTLComputePipelineState_t getComputePipelineState( + const std::string& function_name); + + private: + MTLLibrary_t _library; + std::unordered_map _pso_map; +}; + +class MPSLibraryManager { + public: + // disable constructor for singleton + MPSLibraryManager(const MPSLibraryManager&) = delete; + MPSLibraryManager& operator=(const MPSLibraryManager&) = delete; + MPSLibraryManager(MPSLibraryManager&&) = delete; + MPSLibraryManager& operator=(MPSLibraryManager&&) = delete; + + static MPSLibraryManager* getInstance(); + + bool hasLibrary(const std::string& name); + + MPSLibrary* getLibrary(const std::string& library_url); + + MPSLibrary* createLibraryFromSouce(const std::string& name, + const std::string& sources); + + ~MPSLibraryManager(); + + 
private: + MPSLibraryManager(); + std::unordered_map> _library_map; +}; +#endif diff --git a/mmcv/ops/csrc/common/mps/MPSLibrary.mm b/mmcv/ops/csrc/common/mps/MPSLibrary.mm new file mode 100644 index 0000000000000000000000000000000000000000..1a3d635ca95666e110a94b33315d94af16888b7c --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSLibrary.mm @@ -0,0 +1,110 @@ +#include "MPSLibrary.h" +#include +#include "MPSDevice.h" + +static std::unique_ptr mps_library_manager; +static c10::once_flag mpsdev_init; + +MPSLibraryManager* MPSLibraryManager::getInstance() { + c10::call_once(mpsdev_init, [] { + mps_library_manager = std::unique_ptr(new MPSLibraryManager()); + }); + return mps_library_manager.get(); +} + +MPSLibraryManager::~MPSLibraryManager() {} + +MPSLibraryManager::MPSLibraryManager() {} + +bool MPSLibraryManager::hasLibrary(const std::string& name) { + return _library_map.find(name) != _library_map.end(); +} + +MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) { + if (_library_map.find(library_url) != _library_map.end()) { + return _library_map[library_url].get(); + } + _library_map.emplace(std::make_pair( + library_url, std::unique_ptr(MPSLibrary::createFromUrl(library_url)))); + return _library_map[library_url].get(); +} + +MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name, + const std::string& source) { + NSString* ns_name = [NSString stringWithCString:name.c_str()]; + if (_library_map.find(name) != _library_map.end()) { + NSLog(@"Library %@ already exist.", ns_name); + return nullptr; + } + + _library_map.emplace( + std::make_pair(name, std::unique_ptr(MPSLibrary::createFromSource(source)))); + return _library_map[name].get(); +} + +MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* utl_str = [NSString stringWithCString:library_url.c_str()]; + NSURL* metal_url = [NSURL fileURLWithPath:utl_str]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* code_str = [NSString stringWithCString:sources.c_str()]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str + options:nil + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary::~MPSLibrary() { + [_library release]; + _library = nil; +} + +MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) { + if (_pso_map.find(function_name) != _pso_map.end()) { + return _pso_map[function_name]; + } + + MTLComputePipelineState_t pso; + @autoreleasepool { + NSError* error = nil; + + // create function + NSString* function_name_str = [NSString stringWithCString:function_name.c_str()]; + id func = [_library newFunctionWithName:function_name_str]; + if (func == nil) { + NSLog(@"Failed to created pipeline state object, error %@.", error); + exit(1); + } + // create pipeline + pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func + error:&error]; + 
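    // Cache the pipeline state so later lookups of the same kernel name reuse
    // the compiled object instead of rebuilding it. A minimal (hypothetical)
    // call site, assuming the library was registered earlier:
    //   MPSLibrary *lib = MPSLibraryManager::getInstance()->getLibrary(library_url);
    //   MTLComputePipelineState_t pso = lib->getComputePipelineState("roi_align_forward");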
_pso_map.emplace(std::make_pair(function_name, pso)); + } + return _pso_map[function_name]; +} diff --git a/mmcv/ops/csrc/common/mps/MPSStream.h b/mmcv/ops/csrc/common/mps/MPSStream.h new file mode 100644 index 0000000000000000000000000000000000000000..54cd388494c8bbac636db44dd5c8afd1915357c6 --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSStream.h @@ -0,0 +1,132 @@ +// Copyright © 2022 Apple Inc. + +// This file is modify from: +// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h + +#pragma once + +#include +#include + +#include +#include +#include +#include "MPSDevice.h" + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef id MTLCommandQueue_t; +typedef id MTLCommandBuffer_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +#else +typedef void* MTLCommandQueue_t; +typedef void* MTLCommandQueue; +typedef void* MTLCommandBuffer_t; +typedef void* MTLCommandBuffer; +typedef void* MTLSharedEvent_t; +typedef void* dispatch_queue_t; +typedef void* MTLDevice_t; +#define nil NULL; +#endif + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +class TORCH_API MPSStream { + public: + enum Unchecked { UNCHECKED }; + /// Construct a MPSStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. + explicit MPSStream(Stream stream); + + ~MPSStream(); + MTLCommandQueue_t commandQueue() const { return _commandQueue; }; + dispatch_queue_t queue() const { return _serialQueue; } + + MTLCommandBuffer_t commandBuffer(); + void commit(bool flush); + void commitAndWait(); + void synchronize(); + + void flush(); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { return _stream.device_index(); } + + MTLCommandQueue_t stream() const { return _commandQueue; }; + + MTLDevice_t device() const { return [_commandQueue device]; } + + /// Explicit conversion to Stream. + Stream unwrap() const { return _stream; } + + private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MTLCommandBuffer_t _commandBuffer = nil; + void _flush(bool commitAndWait) const; + + dispatch_queue_t _serialQueue = nullptr; +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl { + public: + /** + * Gets single instance of the MPSStream. 
+ */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +struct TORCH_API MPSEvent { + MPSEvent(); + // MPSEvent(id device); + + ~MPSEvent(); + MTLSharedEvent_t event() const { return _event; } + + void recordEvent(MPSStream* stream); + void waitForEvent(MPSStream* queue); // waits on the cpu + bool queryEvent(); + uint64_t getCurrentValue() { return _currentValue; } + void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } + + private: + bool _isRecorded = false; + uint64_t _currentValue = 0; + MTLSharedEvent_t _event; +}; + +typedef MPSEvent* mpsEvent_t; + +} // namespace mps +} // namespace at diff --git a/mmcv/ops/csrc/common/mps/MPSUtils.h b/mmcv/ops/csrc/common/mps/MPSUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..2a4ce6d7978d566e88dd22ee4f9722df914ff0de --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSUtils.h @@ -0,0 +1,51 @@ +#ifndef _MPS_UTILS_H_ +#define _MPS_UTILS_H_ +#include +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLBuffer_t; +typedef id MTLComputeCommandEncoder_t; +#else +typedef void* MTLBuffer; +typedef void* MTLBuffer_t; +typedef void* MTLComputeCommandEncoder; +typedef void* MTLComputeCommandEncoder_t; +#endif + +// utils +static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) { + return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data()); +} + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t); + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index]; +} + +template , at::Tensor>::value, bool>> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBytes:&t length:sizeof(t) atIndex:index]; +} + +inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {} + +template +void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) { + setMTLArg(encoder, index, std::forward(t)); + setMTLArgsImpl(encoder, index + 1, std::forward(args)...); +} + +template +void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... 
args) { + [encoder setComputePipelineState:pso]; + setMTLArgsImpl(encoder, 0, std::forward(args)...); +} +#endif diff --git a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp index c7f9f35b7b0af6bc91052a1d038809e46c74effd..f68e8740561ef833c09e1ba9f999922f5d04bce5 100644 --- a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp +++ b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp @@ -1,22 +1,25 @@ #ifndef PYTORCH_CPP_HELPER #define PYTORCH_CPP_HELPER -#include +#include #include using namespace at; -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) - #define CHECK_CUDA(x) \ TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_MLU(x) \ + TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor") #define CHECK_CPU(x) \ - TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor") + TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_CUDA_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) +#define CHECK_MLU_INPUT(x) \ + CHECK_MLU(x); \ + CHECK_CONTIGUOUS(x) #define CHECK_CPU_INPUT(x) \ CHECK_CPU(x); \ CHECK_CONTIGUOUS(x) diff --git a/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..72dbe5880bfed2bcebaf6b20c6f169639e34fa38 --- /dev/null +++ b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp @@ -0,0 +1,28 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef PYTORCH_MLU_HELPER_HPP_ +#define PYTORCH_MLU_HELPER_HPP_ + +#ifdef MMCV_WITH_MLU +#include "aten.h" + +#define NFU_ALIGN_SIZE 128 + +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) + +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +#endif + +#endif // PYTORCH_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h b/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h new file mode 100644 index 0000000000000000000000000000000000000000..f23ff4482324c51012865c42f2a5f9e59d54848a --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h @@ -0,0 +1,70 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
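// paramsGrid builds the Cartesian product of its argument vectors as a vector
// of tuples, which is handy for sweeping operator configurations in tests.
// Minimal usage sketch (hypothetical values):
//   auto grid = paramsGrid(std::vector<int>{1, 2}, std::vector<float>{0.5f, 1.0f});
//   // grid.size() == 4 and grid[0] == std::make_tuple(1, 0.5f)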
+ +#ifndef PARAMS_GRID_H_ +#define PARAMS_GRID_H_ +#include +#include + +namespace detail { +template +int getTotalSize(std::vector arg) { + return arg.size(); +} + +template +int getTotalSize(std::vector arg, std::vector... args) { + return arg.size() * getTotalSize(args...); +} + +template +int getSize(std::vector arg) { + return arg.size(); +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg) { + std::get(src) = arg[counter[Idx]]; +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg, + std::vector &... args) { + std::get(src) = arg[counter[Idx]]; + assigner(src, counter, args...); +} +} // namespace detail + +template +std::vector> paramsGrid(std::vector... args) { + int length = detail::getTotalSize(args...); + std::vector sizes = {detail::getSize(args)...}; + int size = sizes.size(); + + std::vector> params(length); + std::vector counter(size); + for (int i = 0; i < length; ++i) { + detail::assigner<0>(params[i], counter, args...); + counter[size - 1] += 1; + for (int c = size - 1; c >= 0; --c) { + if (counter[c] == sizes[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return params; +} + +#endif diff --git a/mmcv/ops/csrc/common/utils/spconv/prettyprint.h b/mmcv/ops/csrc/common/utils/spconv/prettyprint.h new file mode 100644 index 0000000000000000000000000000000000000000..0a6bdc3361dc1ada31fdebef87989672c9aeb51c --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/prettyprint.h @@ -0,0 +1,493 @@ +// Copyright Louis Delacroix 2010 - 2014. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// A pretty printing library for C++ +// +// Usage: +// Include this header, and operator<< will "just work". + +#ifndef H_PRETTY_PRINT +#define H_PRETTY_PRINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pretty_print { +namespace detail { +// SFINAE type trait to detect whether T::const_iterator exists. + +struct sfinae_base { + using yes = char; + using no = yes[2]; +}; + +template +struct has_const_iterator : private sfinae_base { + private: + template + static yes &test(typename C::const_iterator *); + template + static no &test(...); + + public: + static const bool value = sizeof(test(nullptr)) == sizeof(yes); + using type = T; +}; + +template +struct has_begin_end : private sfinae_base { + private: + template + static yes & + f(typename std::enable_if< + std::is_same(&C::begin)), + typename C::const_iterator (C::*)() const>::value>::type *); + + template + static no &f(...); + + template + static yes &g(typename std::enable_if< + std::is_same(&C::end)), + typename C::const_iterator (C::*)() const>::value, + void>::type *); + + template + static no &g(...); + + public: + static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); + static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); +}; + +} // namespace detail + +// Holds the delimiter values for a specific character type + +template +struct delimiters_values { + using char_type = TChar; + const char_type *prefix; + const char_type *delimiter; + const char_type *postfix; +}; + +// Defines the delimiter values for a specific container and character type + +template +struct delimiters { + using type = delimiters_values; + static const type values; +}; + +// Functor to print containers. You can use this directly if you want +// to specify a non-default delimiters type. 
The printing logic can +// be customized by specializing the nested template. + +template , + typename TDelimiters = delimiters> +struct print_container_helper { + using delimiters_type = TDelimiters; + using ostream_type = std::basic_ostream; + + template + struct printer { + static void print_body(const U &c, ostream_type &stream) { + using std::begin; + using std::end; + + auto it = begin(c); + const auto the_end = end(c); + + if (it != the_end) { + for (;;) { + stream << *it; + + if (++it == the_end) break; + + if (delimiters_type::values.delimiter != NULL) + stream << delimiters_type::values.delimiter; + } + } + } + }; + + print_container_helper(const T &container) : container_(container) {} + + inline void operator()(ostream_type &stream) const { + if (delimiters_type::values.prefix != NULL) + stream << delimiters_type::values.prefix; + + printer::print_body(container_, stream); + + if (delimiters_type::values.postfix != NULL) + stream << delimiters_type::values.postfix; + } + + private: + const T &container_; +}; + +// Specialization for pairs + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + + static void print_body(const std::pair &c, ostream_type &stream) { + stream << c.first; + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + stream << c.second; + } +}; + +// Specialization for tuples + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + using element_type = std::tuple; + + template + struct Int {}; + + static void print_body(const element_type &c, ostream_type &stream) { + tuple_print(c, stream, Int<0>()); + } + + static void tuple_print(const element_type &, ostream_type &, + Int) {} + + static void tuple_print( + const element_type &c, ostream_type &stream, + typename std::conditional, + std::nullptr_t>::type) { + stream << std::get<0>(c); + tuple_print(c, stream, Int<1>()); + } + + template + static void tuple_print(const element_type &c, ostream_type &stream, Int) { + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + + stream << std::get(c); + + tuple_print(c, stream, Int()); + } +}; + +// Prints a print_container_helper to the specified stream. 
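// Combined with the std::operator<< overload at the end of this header, any
// type for which is_container<T>::value holds can be streamed directly with
// the default "[", ", ", "]" delimiters, e.g. (hypothetical):
//   std::vector<int> v{1, 2, 3};
//   std::cout << v;   // prints [1, 2, 3]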
+ +template +inline std::basic_ostream &operator<<( + std::basic_ostream &stream, + const print_container_helper &helper) { + helper(stream); + return stream; +} + +// Basic is_container template; specialize to derive from std::true_type for all +// desired container types + +template +struct is_container + : public std::integral_constant::value && + detail::has_begin_end::beg_value && + detail::has_begin_end::end_value> {}; + +template +struct is_container : std::true_type {}; + +template +struct is_container : std::false_type {}; + +template +struct is_container> : std::true_type {}; + +template +struct is_container> : std::true_type {}; + +template +struct is_container> : std::true_type {}; + +// Default delimiters + +template +struct delimiters { + static const delimiters_values values; +}; +template +const delimiters_values delimiters::values = {"[", ", ", "]"}; +template +struct delimiters { + static const delimiters_values values; +}; +template +const delimiters_values delimiters::values = {L"[", L", ", + L"]"}; + +// Delimiters for (multi)set and unordered_(multi)set + +template +struct delimiters<::std::set, char> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::set, char>::values = {"{", ", ", + "}"}; + +template +struct delimiters<::std::set, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::set, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::multiset, char> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::multiset, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::multiset, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::multiset, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::unordered_set, char> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_set, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::unordered_set, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_set, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::unordered_multiset, + char> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_multiset, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::unordered_multiset, + wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::unordered_multiset, + wchar_t>::values = {L"{", L", ", L"}"}; + +// Delimiters for pair and tuple + +template +struct delimiters, char> { + static const delimiters_values values; +}; +template +const delimiters_values delimiters, char>::values = { + "(", ", ", ")"}; +template +struct delimiters<::std::pair, wchar_t> { + static const delimiters_values values; +}; +template +const delimiters_values + delimiters<::std::pair, wchar_t>::values = {L"(", L", ", L")"}; + +template +struct delimiters, char> { + static const delimiters_values values; +}; +template +const delimiters_values delimiters, char>::values = { + "(", ", ", ")"}; +template +struct delimiters<::std::tuple, wchar_t> { + static const delimiters_values values; +}; +template +const delimiters_values + delimiters<::std::tuple, 
wchar_t>::values = {L"(", L", ", L")"}; + +// Type-erasing helper class for easy use of custom delimiters. +// Requires TCharTraits = std::char_traits and TChar = char or wchar_t, +// and MyDelims needs to be defined for TChar. Usage: "cout << +// pretty_print::custom_delims(x)". + +struct custom_delims_base { + virtual ~custom_delims_base() {} + virtual std::ostream &stream(::std::ostream &) = 0; + virtual std::wostream &stream(::std::wostream &) = 0; +}; + +template +struct custom_delims_wrapper : custom_delims_base { + custom_delims_wrapper(const T &t_) : t(t_) {} + + std::ostream &stream(std::ostream &s) { + return s << print_container_helper, Delims>( + t); + } + + std::wostream &stream(std::wostream &s) { + return s << print_container_helper, + Delims>(t); + } + + private: + const T &t; +}; + +template +struct custom_delims { + template + custom_delims(const Container &c) + : base(new custom_delims_wrapper(c)) {} + + std::unique_ptr base; +}; + +template +inline std::basic_ostream &operator<<( + std::basic_ostream &s, const custom_delims &p) { + return p.base->stream(s); +} + +// A wrapper for a C-style array given as pointer-plus-size. +// Usage: std::cout << pretty_print_array(arr, n) << std::endl; + +template +struct array_wrapper_n { + typedef const T *const_iterator; + typedef T value_type; + + array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {} + inline const_iterator begin() const { return _array; } + inline const_iterator end() const { return _array + _n; } + + private: + const T *const _array; + size_t _n; +}; + +// A wrapper for hash-table based containers that offer local iterators to each +// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket +// 5 of container m.) + +template +struct bucket_print_wrapper { + typedef typename T::const_local_iterator const_iterator; + typedef typename T::size_type size_type; + + const_iterator begin() const { return m_map.cbegin(n); } + + const_iterator end() const { return m_map.cend(n); } + + bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {} + + private: + const T &m_map; + const size_type n; +}; + +} // namespace pretty_print + +// Global accessor functions for the convenience wrappers + +template +inline pretty_print::array_wrapper_n pretty_print_array(const T *const a, + size_t n) { + return pretty_print::array_wrapper_n(a, n); +} + +template +pretty_print::bucket_print_wrapper bucket_print(const T &m, + typename T::size_type n) { + return pretty_print::bucket_print_wrapper(m, n); +} + +// Main magic entry point: An overload snuck into namespace std. +// Can we do better? + +namespace std { +// Prints a container to the stream using default delimiters + +template +inline typename enable_if<::pretty_print::is_container::value, + basic_ostream &>::type +operator<<(basic_ostream &stream, const T &container) { + return stream + << ::pretty_print::print_container_helper( + container); +} +} // namespace std + +#endif // H_PRETTY_PRINT diff --git a/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h b/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..026e35b1a6b52ec74fee27fbccd2dfda5ef845ce --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h @@ -0,0 +1,60 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace py = pybind11; + +template +std::vector array2Vector(TPyObject arr) { + py::array arr_np = arr; + size_t size = arr.attr("size").template cast(); + py::array_t arr_cc = arr_np; + std::vector data(arr_cc.data(), arr_cc.data() + size); + return data; +} + +template +std::vector arrayT2Vector(py::array_t arr) { + std::vector data(arr.data(), arr.data() + arr.size()); + return data; +} + +template +tv::TensorView array2TensorView(TPyObject arr) { + py::array arr_np = arr; + py::array_t arr_cc = arr_np; + tv::Shape shape; + for (int i = 0; i < arr_cc.ndim(); ++i) { + shape.push_back(arr_cc.shape(i)); + } + return tv::TensorView(arr_cc.mutable_data(), shape); +} +template +tv::TensorView arrayT2TensorView(py::array_t arr) { + tv::Shape shape; + for (int i = 0; i < arr.ndim(); ++i) { + shape.push_back(arr.shape(i)); + } + return tv::TensorView(arr.mutable_data(), shape); +} diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h b/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h new file mode 100644 index 0000000000000000000000000000000000000000..e5e093fbbed4f0485559d9860b291e258337443f --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h @@ -0,0 +1,297 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
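// This header builds the indice "rulebook" for sparse convolution on the host:
// getValidOutPos / getValidOutPosTranspose enumerate, for a single active input
// site, every reachable output site together with its flattened kernel offset,
// and the getIndicePairs* routines accumulate those pairs into the
// [kernelVolume, 2, L] indicePairs / indiceNum tensors consumed by the sparse
// convolution functors declared in indice.h.
// Worked 1-D sketch (hypothetical numbers): kernelSize = 3, stride = 1,
// padding = 1, dilation = 1 and an input position x = 4 give
//   lowers = (4 - 2 - 1 + 1 + 1) / 1 = 3,  uppers = (4 + 1) / 1 = 5,
// so the enumeration visits output positions {5, 4, 3} paired with kernel
// offsets {0, 1, 2} (assuming all of them lie inside outSpatialShape).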
+ +#ifndef SPCONV_GEOMETRY_H_ +#define SPCONV_GEOMETRY_H_ + +#include + +#include +#include + +template +TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, + const Index *kernelSize, + const Index *stride, const Index *padding, + const Index *dilation, + const Index *outSpatialShape, Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + + stride[i] + padding[i]) / + stride[i]; + uppers[i] = (input_pos[i] + padding[i]) / stride[i]; + } + +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } + +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + // break; + } + offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; + m *= kernelSize[j]; + } + + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +TV_HOST_DEVICE Index getValidOutPosTranspose( + const Index *input_pos, const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, const Index *outSpatialShape, + Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = input_pos[i] * stride[i] - padding[i]; + uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; + } +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + } + offset += m * (val - lowers[j]) / dilation[j]; + m *= kernelSize[j]; + } + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +Index getIndicePairsConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + // indicesOut: num_active * kernelVolume * (NDim + 1) + Index numAct = 0; + auto numActIn = indicesIn.dim(0); 
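  // Sketch of the loop below: for every active input j, enumerate its valid
  // output positions; an output seen for the first time gets the next row of
  // indicesOut and its running index is memoized in gridsOut (a dense volume
  // assumed to be pre-filled with -1 by the caller). Each (input, output) pair
  // is then appended to indicePairs[offset] at slot indiceNum[offset]++, where
  // offset is the flattened kernel-element index.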
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
+  Index *pointPtr = nullptr;
+  for (int j = 0; j < numActIn; ++j) {
+    batchIdx = indicesIn(j, 0);
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+                   spatialVolume * batchIdx;
+      if (gridsOut[index] == -1) {
+        for (unsigned k = 1; k < NDim + 1; ++k) {
+          indicesOut(numAct, k) = pointPtr[k - 1];
+        }
+        indicesOut(numAct, 0) = batchIdx;
+        gridsOut[index] = numAct++;
+      }
+      // indicePairs: [K, 2, L]
+      indicePairs(offset, 0, indiceNum[offset]) = j;
+      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+    }
+  }
+  return numAct;
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
+                           tv::TensorView<Index> indicesOut,
+                           tv::TensorView<IndexGrid> gridsOut,
+                           tv::TensorView<Index> indicePairs,
+                           tv::TensorView<Index> indiceNum,
+                           const Index *kernelSize, const Index *stride,
+                           const Index *padding, const Index *dilation,
+                           const Index *outSpatialShape) {
+  Index numAct = 0;
+  auto numActIn = indicesIn.dim(0);
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
+  Index *pointPtr = nullptr;
+  for (int j = 0; j < numActIn; ++j) {
+    batchIdx = indicesIn(j, 0);
+    numValidPoints = getValidOutPosTranspose<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+                   spatialVolume * batchIdx;
+      if (gridsOut[index] == -1) {
+        for (unsigned k = 1; k < NDim + 1; ++k) {
+          indicesOut(numAct, k) = pointPtr[k - 1];
+        }
+        indicesOut(numAct, 0) = batchIdx;
+        gridsOut[index] = numAct++;
+      }
+      // indicePairs: [K, 2, L]
+      indicePairs(offset, 0, indiceNum[offset]) = j;
+      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+    }
+  }
+  return numAct;
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
+                         tv::TensorView<IndexGrid> gridsOut,
+                         tv::TensorView<Index> indicePairs,
+                         tv::TensorView<Index> indiceNum,
+                         const Index *const kernelSize,
+                         const Index *const stride, const Index *const padding,
+                         const Index *dilation,
+                         const Index *const outSpatialShape) {
+  Index numAct = 0;
+  auto numActIn = indicesIn.dim(0);
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  // Index validPoints[kernelVolume * (NDim + 1)];
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
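+  // Submanifold convolution keeps the output sparsity pattern identical to
+  // the input: gridsOut is first filled with the row index of every active
+  // input site, and a pair is emitted only when a kernel offset lands on one
+  // of those pre-existing sites.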
+  Index *pointPtr = nullptr;
+  Index index = 0;
+  for (int j = 0; j < numActIn; ++j) {
+    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
+                                         outSpatialShape) +
+            spatialVolume * indicesIn(j, 0);
+    gridsOut[index] = j;
+  }
+  for (int j = 0; j < numActIn; ++j) {
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+              spatialVolume * indicesIn(j, 0);
+      if (gridsOut[index] > -1) {
+        indicePairs(offset, 0, indiceNum[offset]) = j;
+        indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+      }
+    }
+  }
+  return numActIn;
+}
+
+#endif
diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h b/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h
new file mode 100644
index 0000000000000000000000000000000000000000..96ce34e3b456f0c999002bd53b8b1a6ab082edae
--- /dev/null
+++ b/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h
@@ -0,0 +1,78 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
+#define SPARSE_CONV_INDICE_FUNCTOR_H_
+#include <tensorview/tensorview.h>
+
+namespace functor {
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP1 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP2 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateSubMIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+}  // namespace functor
+
+#endif
diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h b/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..78f32edd4db70724d38826809672aa461a6d065e
--- /dev/null
+++ b/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h
@@ -0,0 +1,37 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
+#define SPARSE_MAXPOOL_FUNCTOR_H_
+#include <tensorview/tensorview.h>
+
+namespace functor {
+template <typename Device, typename T, typename Index>
+struct SparseMaxPoolForwardFunctor {
+  void operator()(const Device& d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const Index> indices, int size);
+};
+
+template <typename Device, typename T, typename Index>
+struct SparseMaxPoolBackwardFunctor {
+  void operator()(const Device& d, tv::TensorView<const T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const T> fout,
+                  tv::TensorView<T> fin,
+                  tv::TensorView<const Index> indices, int size);
+};
+}  // namespace functor
+
+#endif
diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h b/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..8262b30efb5e127d7e079ebdde0693c671fb96d6
--- /dev/null
+++ b/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h
@@ -0,0 +1,50 @@
+#ifndef MP_HELPER_H_
+#define MP_HELPER_H_
+#include <functional>
+#include <type_traits>
+
+template <class... T>
+struct mp_list {};
+
+template <class T, T... I>
+using mp_list_c = mp_list<std::integral_constant<T, I>...>;
+
+namespace detail {
+
+template <class... T, class F>
+constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
+  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
+}
+
+template <class F>
+constexpr F mp_for_each_impl(mp_list<>, F &&f) {
+  return std::forward<F>(f);
+}
+
+}  // namespace detail
+
+namespace detail {
+
+template <class A, template <class...> class B>
+struct mp_rename_impl {
+  // An error "no type named 'type'" here means that the first argument to
+  // mp_rename is not a list
+};
+
+template