diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2fdf8a2d23cff3f69ea753466370b6dc3c719686..eea0b2544fd606d8593f1b2f12008a76673829d1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,6 +16,7 @@ All kinds of contributions are welcome, including but not limited to the followi ```{note} If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. ``` + ### Code style #### Python @@ -24,10 +25,11 @@ We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code We use the following tools for linting and formatting: -- [flake8](http://flake8.pycqa.org/en/latest/): A wrapper around some linter tools. -- [yapf](https://github.com/google/yapf): A formatter for Python files. +- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. - [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. -- [markdownlint](https://github.com/markdownlint/markdownlint): A linter to check markdown files and flag style issues. +- [yapf](https://github.com/google/yapf): A formatter for Python files. +- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. +- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. - [docformatter](https://github.com/myint/docformatter): A formatter to format docstring. Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). @@ -48,23 +50,9 @@ From the repository folder pre-commit install ``` -Try the following steps to install ruby when you encounter an issue on installing markdownlint - -```shell -# install rvm -curl -L https://get.rvm.io | bash -s -- --autolibs=read-fail -[[ -s "$HOME/.rvm/scripts/rvm" ]] && source "$HOME/.rvm/scripts/rvm" -rvm autolibs disable - -# install ruby -rvm install 2.7.1 -``` - -Or refer to [this repo](https://github.com/innerlee/setup) and take [`zzruby.sh`](https://github.com/innerlee/setup/blob/master/zzruby.sh) according its instruction. - After this on every commit check code linters and formatter will be enforced. ->Before you create a PR, make sure that your code lints and is formatted by yapf. +> Before you create a PR, make sure that your code lints and is formatted by yapf. #### C++ and CUDA diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index e163b312ca5b45dac195232979fa31024ff55ef2..0000000000000000000000000000000000000000 --- a/Dockerfile +++ /dev/null @@ -1,7 +0,0 @@ -FROM python:3.7 - -WORKDIR /mmcv - -COPY . /mmcv - -RUN pip install -e . diff --git a/LICENSES.md b/LICENSES.md index 9bb0c8cafa72033f503fd3f46b98d30dcfd75c29..5de8358331f4d21529e016807b86b66dc6ca29da 100644 --- a/LICENSES.md +++ b/LICENSES.md @@ -2,7 +2,7 @@ In this file, we list the operations with other licenses instead of Apache 2.0. Users should be careful about adopting these operations in any commercial matters. 
-| Operation | Files | License | -| :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | +| Operation | Files | License | +| :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: | | upfirdn2d | [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu) | NVIDIA License | | fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License | diff --git a/MANIFEST.in b/MANIFEST.in index 65f232e070d43ce40d0fd425201e3b140b5af551..5de8494b5df3656a4f6a09da26d9f4bb27ed69a5 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,7 @@ include requirements/runtime.txt -include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json +include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json mmcv/model_zoo/torchvision_0.12.json include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp +include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm +recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm diff --git a/README.md b/README.md index 9b64100479f8f8030f1736173aa6ee3e25be8f8a..1a6541a689a48944394db84b48d5b484e63a8708 100644 --- a/README.md +++ b/README.md @@ -1,61 +1,274 @@ -#
MMCV
-## 简介 -MMCV是计算机视觉研究的基础库,主要提供以下功能:图像处理、图像和标注结果可视化、图像转换、多种CNN网络结构、高质量实现的常见CUDA算子。 +
+ +
 
+
+ OpenMMLab website + + + HOT + + +      + OpenMMLab platform + + + TRY IT OUT + + +
+
 
+
-## 安装 -组件支持 -+ Python 3.7、3.8、3.9 +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmcv.readthedocs.io/en/latest/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) +[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) +[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) +[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) + +English | [简体中文](README_zh-CN.md) + +## Introduction + +MMCV is a foundational library for computer vision research and supports many +research projects as below: + +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. +- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. + +It provides the following functionalities. + +- Universal IO APIs +- Image/Video processing +- Image and annotation visualization +- Useful utilities (progress bar, timer, ...) +- PyTorch runner with hooking mechanism +- Various CNN architectures +- High-quality implementation of common CUDA ops + +It supports the following systems. + +- Linux +- Windows +- macOS + +See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage. + +Note: MMCV requires Python 3.6+. + +## Installation + +There are two versions of MMCV: + +- **mmcv-full**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. 
+- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops. + +**Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`. + +a. Install the full version. + +Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). + +We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building for **Linux and Windows systems**. In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. + +i. Install the latest version. + +The rule for installing the latest `mmcv-full` is as follows: -### 1、使用pip方式安装 -mmcv whl包下载目录:[https://cancon.hpccube.com:65024/4/main/mmcv](https://cancon.hpccube.com:65024/4/main/mmcv),选择对应的pytorch版本和python版本下载对应mmcv的whl包 ```shell -pip install mmcv* (下载的mmcv的whl包) +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -### 2、使用源码编译方式安装 -#### 编译环境准备 -提供2种环境准备方式: +Please replace `{cu_version}` and `{torch_version}` in the url to your desired one. For example, +to install the latest `mmcv-full` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: + +```shell +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html +``` -1. 基于光源pytorch基础镜像环境:镜像下载地址:[https://sourcefind.cn/#/image/dcu/pytorch](https://sourcefind.cn/#/image/dcu/pytorch),根据pytorch、python、dtk及系统下载对应的镜像版本。 +**Note**: mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. For example, if your PyTorch version is 1.8.1 and CUDA version is 11.1, you can use the following command to install mmcv-full. -2. 基于现有python环境:安装pytorch,pytorch whl包下载目录:[https://cancon.hpccube.com:65024/4/main/pytorch/dtk24.04.1](https://cancon.hpccube.com:65024/4/main/pytorch/dtk24.04.1),根据python、dtk版本,下载对应pytorch的whl包。安装命令如下: ```shell -pip install torch* (下载的torch的whl包) -pip install setuptools==59.5.0 wheel +pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html ``` -#### 源码编译安装 -- 代码下载 +For more details, please refer the the following tables and delete `=={mmcv_version}`. + +ii. Install a specified version. + +The rule for installing a specified `mmcv-full` is as follows: + ```shell -git clone https://developer.hpccube.com/codes/aicomponent/mmcv # 根据编译需要切换分支 +pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -- 提供2种源码编译方式(进入mmcv目录): + +First of all, please refer to the Releases and replace `{mmcv_version}` a specified one. e.g. `1.3.9`. +Then replace `{cu_version}` and `{torch_version}` in the url to your desired versions. For example, +to install `mmcv-full==1.3.9` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: + +```shell +pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html ``` -1. 编译whl包并安装 -MMCV_WITH_OPS=1 python3 setup.py -v bdist_wheel -pip install dist/mmcv* -2. 
源码编译安装 -MMCV_WITH_OPS=1 python3 setup.py install +For more details, please refer the the following tables. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CUDA | torch 1.11 | torch 1.10 | torch 1.9 | torch 1.8 | torch 1.7 | torch 1.6 | torch 1.5
11.5
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
11.3
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
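Whichever cell of the table above applies to you, a quick sanity check after installation helps catch a CUDA/PyTorch mismatch early. The snippet below is only a minimal sketch of such a check (the `check_installation.py` script mentioned earlier is the more thorough option); it assumes mmcv-full was installed with its compiled ops.

```bash
# Minimal post-install sanity check (sketch): confirm mmcv imports and that
# the compiled CUDA ops are present in this environment.
python -c "import mmcv; print(mmcv.__version__)"
python -c "from mmcv.ops import get_compiling_cuda_version, get_compiler_version; print(get_compiling_cuda_version()); print(get_compiler_version())"
```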
+ +**Note**: The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, you can click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html) and you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./docs/en/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. + +**Note**: mmcv-full does not provide pre-built packages for `cu102-torch1.11` and `cu92-torch*` on Windows. + +Another way is to compile locally by running + +```python +pip install mmcv-full ``` -3. 测试验证 + +Note that the local compiling may take up to 10 mins. + +b. Install the lite version. + +```python +pip install mmcv ``` -cd test -pytest -s ./test_arraymisc.py -pytest -s ./test_ops + +c. Install full version with custom operators for onnxruntime + +- Check [here](docs/en/deployment/onnxruntime_op.md) for detailed instruction. + +If you would like to build MMCV from source, please refer to the [guide](https://mmcv.readthedocs.io/en/latest/get_started/build.html). + +## FAQ + +If you face some installation issues, CUDA related issues or RuntimeErrors, +you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html). + +## Citation + +If you find this project useful in your research, please consider cite: + +```latex +@misc{mmcv, + title={{MMCV: OpenMMLab} Computer Vision Foundation}, + author={MMCV Contributors}, + howpublished = {\url{https://github.com/open-mmlab/mmcv}}, + year={2018} +} ``` -#### 注意事项 -+ 若使用pip install下载安装过慢,可添加pypi清华源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ -+ ROCM_PATH为dtk的路径,默认为/opt/dtk -## 验证 -- python -c "import mmcv; mmcv.\_\_version__",版本号与官方版本同步,查询该软件的版本号,例如2.0.0; +## Contributing -## Known Issue -- 无 +We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline. -## 参考资料 -- [README_ORIGIN](README_ORIGIN.md) -- [README_zh-CN](README_zh-CN.md) -- [https://github.com/open-mmlab/mmcv](https://github.com/open-mmlab/mmcv) +## License +MMCV is released under the Apache 2.0 license, while some specific operations in this library are with other licenses. Please refer to [LICENSES.md](LICENSES.md) for the careful check, if you are using our code for commercial matters. diff --git a/README_ORIGIN.md b/README_ORIGIN.md deleted file mode 100644 index e9e3f8efaf86059c8e7bef3fec73513b69e31442..0000000000000000000000000000000000000000 --- a/README_ORIGIN.md +++ /dev/null @@ -1,222 +0,0 @@ -
- -
- -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) - -English | [简体中文](README_zh-CN.md) - -## Introduction - -MMCV is a foundational library for computer vision research and supports many -research projects as below: - -- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. -- [MIM](https://github.com/open-mmlab/mim): MIM Installs OpenMMLab Packages. -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. -- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. -- [MMOCR](https://github.com/open-mmlab/mmocr): A Comprehensive Toolbox for Text Detection, Recognition and Understanding. -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. -- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab FewShot Learning Toolbox and Benchmark. - -It provides the following functionalities. - -- Universal IO APIs -- Image/Video processing -- Image and annotation visualization -- Useful utilities (progress bar, timer, ...) -- PyTorch runner with hooking mechanism -- Various CNN architectures -- High-quality implementation of common CUDA ops - -See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage. - -Note: MMCV requires Python 3.6+. - -## Installation - -There are two versions of MMCV: - -- **mmcv-full**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. -- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv<1.0.0. It is useful when you do not need those CUDA ops. - -**Note**: Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is available`. - -a. Install the full version. - -Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). - -We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building. 
In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. - -i. Install the latest version. - -The rule for installing the latest ``mmcv-full`` is as follows: - -```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html -``` - -Please replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired one. For example, -to install the latest ``mmcv-full`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: - -```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -``` - -**Note**: mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. For example, if your PyTorch version is 1.8.1 and CUDA version is 11.1, you can use the following command to install mmcv-full. - -```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html -``` - -For more details, please refer the the following tables and delete ``=={mmcv_version}``. - -ii. Install a specified version. - -The rule for installing a specified ``mmcv-full`` is as follows: - -```shell -pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html -``` - -First of all, please refer to the Releases and replace ``{mmcv_version}`` a specified one. e.g. ``1.3.9``. -Then replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired versions. For example, -to install ``mmcv-full==1.3.9`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: - -```shell -pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html -``` - -For more details, please refer the the following tables. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
CUDA | torch1.10 | torch1.9 | torch1.8 | torch1.7 | torch1.6 | torch1.5
11.3
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
11.1
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
11.0
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.6.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.5.0/index.html
10.1
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.5.0/index.html
9.2
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.5.0/index.html
- -**Note**: The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, you can click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html) and you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./docs/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. - -Another way is to compile locally by running - -```python -pip install mmcv-full -``` - -Note that the local compiling may take up to 10 mins. - -b. Install the lite version. - -```python -pip install mmcv -``` - -c. Install full version with custom operators for onnxruntime - -- Check [here](docs/deployment/onnxruntime_op.md) for detailed instruction. - -If you would like to build MMCV from source, please refer to the [guide](https://mmcv.readthedocs.io/en/latest/get_started/build.html). - -## FAQ - -If you face some installation issues, CUDA related issues or RuntimeErrors, -you may first refer to this [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html). - -## Citation - -If you find this project useful in your research, please consider cite: - -```latex -@misc{mmcv, - title={{MMCV: OpenMMLab} Computer Vision Foundation}, - author={MMCV Contributors}, - howpublished = {\url{https://github.com/open-mmlab/mmcv}}, - year={2018} -} -``` - -## Contributing - -We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline. - -## License - -MMCV is released under the Apache 2.0 license, while some specific operations in this library are with other licenses. Please refer to [LICENSES.md](LICENSES.md) for the careful check, if you are using our code for commercial matters. diff --git a/README_zh-CN.md b/README_zh-CN.md index e3288ee31403d02c6d4c2c9335aff556c2c3d23c..8c768c837ecddc7f6c4d7e036f590d9d2b96fa64 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -1,8 +1,30 @@
- + +
 
+
+ OpenMMLab 官网 + + + HOT + + +      + OpenMMLab 开放平台 + + + TRY IT OUT + + +
+
 
-[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) [![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) [![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) [![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) +[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmcv.readthedocs.io/zh_CN/latest/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/) +[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv) +[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions) +[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv) +[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE) [English](README.md) | 简体中文 @@ -10,20 +32,24 @@ MMCV 是一个面向计算机视觉的基础库,它支持了很多开源项目,例如: -- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 -- [MIM](https://github.com/open-mmlab/mim): OpenMMLab 项目、算法、模型的统一入口 -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱与测试基准 -- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 检测工具箱与测试基准 -- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用3D目标检测平台 -- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱与测试基准 -- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱与测试基准 +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 +- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 +- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 +- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱与测试基准 -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 新一代生成模型工具箱 - [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 -- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 +- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 +- 
[MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 MMCV 提供了如下众多功能: @@ -35,7 +61,13 @@ MMCV 提供了如下众多功能: - 多种 CNN 网络结构 - 高质量实现的常见 CUDA 算子 -如想了解更多特性和使用,请参考[文档](http://mmcv.readthedocs.io/en/latest)。 +MMCV 支持以下的系统: + +- Linux +- Windows +- macOS + +如想了解更多特性和使用,请参考[文档](http://mmcv.readthedocs.io/zh_CN/latest)。 提示: MMCV 需要 Python 3.6 以上版本。 @@ -50,19 +82,19 @@ MMCV 有两个版本: a. 安装完整版 -在安装 mmcv-full 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 PyTorch 官方[文档](https://pytorch.org/)。 +在安装 mmcv-full 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 PyTorch [官方文档](https://pytorch.org/)。 -我们提供了不同 PyTorch 和 CUDA 版本的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 +我们提供了 **Linux 和 Windows 平台** PyTorch 和 CUDA 版本组合的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 i. 安装最新版本 -如下是安装最新版 ``mmcv-full`` 的命令 +如下是安装最新版 `mmcv-full` 的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -请将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的最新版 ``mmcv-full``,使用如下替换过的命令 +请将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的最新版 `mmcv-full`,使用如下替换过的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -74,18 +106,18 @@ pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9 pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html ``` -如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 ``=={mmcv_version}`` 删去即可。 +如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 `=={mmcv_version}` 删去即可。 ii. 安装特定的版本 -如下是安装特定版本 ``mmcv-full`` 的命令 +如下是安装特定版本 `mmcv-full` 的命令 ```shell pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -首先请参考版本发布信息找到想要安装的版本号,将 ``{mmcv_version}`` 替换成该版本号,例如 ``1.3.9``。 -然后将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的 ``mmcv-full`` 1.3.9 版本,使用如下替换过的命令 +首先请参考版本发布信息找到想要安装的版本号,将 `{mmcv_version}` 替换成该版本号,例如 `1.3.9`。 +然后将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的 `mmcv-full` 1.3.9 版本,使用如下替换过的命令 ```shell pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -97,15 +129,27 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t CUDA - torch1.10 - torch1.9 - torch1.8 - torch1.7 - torch1.6 - torch1.5 + torch 1.11 + torch 1.10 + torch 1.9 + torch 1.8 + torch 1.7 + torch 1.6 + torch 1.5 + + + 11.5 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
+ + + + + + 11.3 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
@@ -115,6 +159,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 11.1 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
@@ -127,12 +172,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
@@ -144,6 +191,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 10.1 +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
@@ -154,12 +202,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
@@ -170,7 +220,9 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t -**注意**:以上提供的预编译包并不囊括所有的 mmcv-full 版本,你可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./docs_zh_CN/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 +**注意**:以上提供的预编译包并不囊括所有的 mmcv-full 版本,你可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./docs/zh_cn/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 + +**注意**:mmcv-full 没有提供 Windows 平台 `cu102-torch1.8.0` 和 `cu92-torch*` 的预编译包。 除了使用预编译包之外,另一种方式是在本地进行编译,直接运行下述命令 @@ -188,13 +240,13 @@ pip install mmcv c. 安装完整版并且编译 onnxruntime 的自定义算子 -- 详细的指南请查看 [这里](docs/deployment/onnxruntime_op.md)。 +- 详细的指南请查看[这里](docs/zh_cn/deployment/onnxruntime_op.md)。 -如果想从源码编译 MMCV,请参考[该文档](https://mmcv.readthedocs.io/en/latest/get_started/build.html)。 +如果想从源码编译 MMCV,请参考[该文档](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html)。 ## FAQ -如果你遇到了安装问题,CUDA 相关的问题或者 RuntimeErrors,可以首先参考[问题解决页面](https://mmcv.readthedocs.io/en/latest/faq.html) 看是否已经有解决方案。 +如果你遇到了安装问题,CUDA 相关的问题或者 RuntimeErrors,可以首先参考[问题解决页面](https://mmcv.readthedocs.io/zh_CN/latest/faq.html) 看是否已经有解决方案。 ## 贡献指南 @@ -203,12 +255,13 @@ c. 安装完整版并且编译 onnxruntime 的自定义算子 ## 许可证 `MMCV` 目前以 Apache 2.0 的许可证发布,但是其中有一部分功能并不是使用的 Apache2.0 许可证,我们在 [许可证](LICENSES.md) 中详细地列出了这些功能以及他们对应的许可证,如果您正在从事盈利性活动,请谨慎参考此文档。 + ## 欢迎加入 OpenMMLab 社区 -扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=GJP18SjI) +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=3ijNTqfg),或添加微信小助手”OpenMMLabwx“加入官方交流微信群。
- +
我们会在 OpenMMLab 社区为大家 diff --git a/TERMINOLOGY.md b/TERMINOLOGY.md index 61941e3306c7dc2c0f7b0e181248cac841571a7a..07411b7774c2ed713f472c1287b98b871c7f4d02 100644 --- a/TERMINOLOGY.md +++ b/TERMINOLOGY.md @@ -4,27 +4,27 @@ This document is used as a reference for English-Chinese terminology translation 该文档用作中英文翻译对照参考。 -| English | 中文 | -| :-----: | :---:| -| annotation | 标注 | -| backbone | 主干网络 | -| benchmark | 基准测试 | -| checkpoint | 模型权重文件 | -| classifier | 分类器 | -| cls_head | 分类头 | -| decoder | 解码器 | -| detector | 检测器 | -| encoder | 编码器 | -| finetune | 微调 | -| ground truth | 真实标签 | -| hook | 钩子 | -| localizer | 定位器 | -| neck | 模型颈部 | -| pipeline | 流水线 | -| recognizer | 识别器 | -| register | 注册器 | -| schedule | 调整 | -| scheduler | 调度器 | -| segmentor | 分割器 | -| tensor | 张量 | -| training schedule | 训练策略 | +| English | 中文 | +| :---------------: | :----------: | +| annotation | 标注 | +| backbone | 主干网络 | +| benchmark | 基准测试 | +| checkpoint | 模型权重文件 | +| classifier | 分类器 | +| cls_head | 分类头 | +| decoder | 解码器 | +| detector | 检测器 | +| encoder | 编码器 | +| finetune | 微调 | +| ground truth | 真实标签 | +| hook | 钩子 | +| localizer | 定位器 | +| neck | 模型颈部 | +| pipeline | 流水线 | +| recognizer | 识别器 | +| register | 注册器 | +| schedule | 调整 | +| scheduler | 调度器 | +| segmentor | 分割器 | +| tensor | 张量 | +| training schedule | 训练策略 | diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e9985b4ca645a14c9e3f18bf7afcc0cb4f52bf73 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,70 @@ +# Docker images + +There are two `Dockerfile` files to build docker images, one to build an image with the mmcv-full pre-built package and the other with the mmcv development environment. + +```text +. +|-- README.md +|-- dev # build with mmcv development environment +| `-- Dockerfile +`-- release # build with mmcv pre-built package + `-- Dockerfile +``` + +## Build docker images + +### Build with mmcv pre-built package + +Build with local repository + +```bash +git clone https://github.com/open-mmlab/mmcv.git && cd mmcv +docker build -t mmcv -f docker/release/Dockerfile . +``` + +Or build with remote repository + +```bash +docker build -t mmcv https://github.com/open-mmlab/mmcv.git#master:docker/release +``` + +The [Dockerfile](release/Dockerfile) installs latest released version of mmcv-full by default, but you can specify mmcv versions to install expected versions. + +```bash +docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=1.5.0 . +``` + +If you also want to use other versions of PyTorch and CUDA, you can also pass them when building docker images. + +An example to build an image with PyTorch 1.11 and CUDA 11.3. + +```bash +docker build -t mmcv -f docker/release/Dockerfile \ + --build-arg PYTORCH=1.9.0 \ + --build-arg CUDA=11.1 \ + --build-arg CUDNN=8 \ + --build-arg MMCV=1.5.0 . +``` + +More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags). + +### Build with mmcv development environment + +If you want to build an docker image with the mmcv development environment, you can use the following command + +```bash +git clone https://github.com/open-mmlab/mmcv.git && cd mmcv +docker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 . +``` + +Note that `CUDA_ARCH` is the cumpute capability of your GPU and you can find it at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute). + +The building process may take 10 minutes or more. 
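If you are not sure which value to pass for `CUDA_ARCH`, besides the NVIDIA page linked above you can ask PyTorch for the compute capability of the visible GPU. This is just one way to look it up and assumes a CUDA-enabled PyTorch is already installed on the host:

```bash
# Print the compute capability of GPU 0, e.g. (7, 5) -> pass CUDA_ARCH=7.5
python -c "import torch; print(torch.cuda.get_device_capability(0))"
```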
+ +## Run images + +```bash +docker run --gpus all --shm-size=8g -it mmcv +``` + +See [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usages. diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..0c673e958f2909cd80f589100c2b7cbfa726c499 --- /dev/null +++ b/docker/dev/Dockerfile @@ -0,0 +1,32 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# Install git and system dependencies for opencv-python +RUN apt-get update && apt-get install -y git \ + && apt-get update && apt-get install -y libgl1 libglib2.0-0 + +# Install system dependencies for unit tests +RUN apt-get install -y ffmpeg libturbojpeg \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# build mmcv-full from source with develop mode +ARG HTTPS_PROXY="" +ENV https_proxy=${HTTPS_PROXY} +ENV FORCE_CUDA="1" +ENV MMCV_WITH_OPS="1" +ARG CUDA_ARCH="" +ENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH} +RUN git clone https://github.com/open-mmlab/mmcv.git /mmcv +WORKDIR /mmcv +RUN git rev-parse --short HEAD +RUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install diff --git a/docker/release/Dockerfile b/docker/release/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..493aa6d1625c9bdee1b9f3bd8121c6ff2f723d4a --- /dev/null +++ b/docker/release/Dockerfile @@ -0,0 +1,20 @@ +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN rm /etc/apt/sources.list.d/cuda.list \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +# Install system dependencies for opencv-python +RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +# Install mmcv-full +ARG MMCV="1.5.1" +RUN pip install openmim && mim install mmcv-full==${MMCV} && python -c 'import mmcv;print(mmcv.__version__)' diff --git a/docs/_static/qq_group_qrcode.jpg b/docs/_static/qq_group_qrcode.jpg deleted file mode 100644 index 7c6b04f561da283ae622f4219ea9b8cabf8f301a..0000000000000000000000000000000000000000 Binary files a/docs/_static/qq_group_qrcode.jpg and /dev/null differ diff --git a/docs/_static/zhihu_qrcode.jpg b/docs/_static/zhihu_qrcode.jpg deleted file mode 100644 index c745fb027f06564d41794e9a40069b06c34e2bb5..0000000000000000000000000000000000000000 Binary files a/docs/_static/zhihu_qrcode.jpg and /dev/null differ diff --git a/docs/community/contributing.md b/docs/community/contributing.md deleted file mode 120000 index f939e75f21a8badb5c40f527abd0e098fe9bc472..0000000000000000000000000000000000000000 --- a/docs/community/contributing.md +++ /dev/null @@ -1 +0,0 @@ -../../CONTRIBUTING.md \ No newline at end of file diff --git 
a/docs/community/pr.md b/docs/community/pr.md deleted file mode 100644 index 77bdbf77080577d48ca734ffeb45d12269a166e4..0000000000000000000000000000000000000000 --- a/docs/community/pr.md +++ /dev/null @@ -1,94 +0,0 @@ -## Pull Request (PR) - -### What is PR - -`PR` is the abbreviation of `Pull Request`. Here's the definition of `PR` in the [official document](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) of Github. - -> Pull requests let you tell others about changes you've pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch. - -### Basic Workflow - -1. Get the most recent codebase -2. Checkout a new branch from the master branch -3. Commit your changes -4. Push your changes and create a PR -5. Discuss and review your code -6. Merge your branch to the master branch - -### Procedures in detail - -1. Get the most recent codebase - + When you work on your first PR - - Fork the OpenMMLab repository: click the **fork** button at the top right corner of Github page - ![avatar](../_static/community/1.png) - - - Clone forked repository to local - ```bash - git clone git@github.com:XXX/mmcv.git - ``` - - - Add source repository to upstream - ```bash - git remote add upstream git@github.com:open-mmlab/mmcv - ``` - - + After your first PR - - Checkout master branch of the local repository and pull the latest master branch of the source repository - ```bash - git checkout master - git pull upstream master - ``` - -2. Checkout a new branch from the master branch - ```bash - git checkout -b branchname - ``` - -```{tip} -To make commit history clear, we strongly recommend you checkout the master branch before create a new branch. -``` - -3. Commit your changes - ```bash - # coding - git add [files] - git commit -m 'messages' - ``` - -4. Push your changes to the forked repository and create a PR - + Push the branch to your forked remote repository - ```bash - git push origin branchname - ``` - - + Create a PR - ![avatar](../_static/community/2.png) - - + Revise PR message template to describe your motivation and modifications made in this PR. You can also link the related issue to the PR manually in the PR message (For more information, checkout the [official guidance](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)). - -5. Discuss and review your code - + After creating a pull request, you can ask a specific person to review the changes you've proposed - ![avatar](../_static/community/3.png) - - + Modify your codes according to reviewers' suggestions and then push your changes - -6. Merge your branch to the master branch and delete the branch - ```bash - git branch -d branchname # delete local branch - git push origin --delete branchname # delete remote branch - ``` - -### PR Specs - -1. Use [pre-commit](https://pre-commit.com) hook to avoid issues of code style -2. One short-time branch should be matched with only one PR -3. Accomplish a detailed change in one PR. Avoid large PR - >- Bad: Support Faster R-CNN - >- Acceptable: Add a box head to Faster R-CNN - >- Good: Add a parameter to box head to support custom conv-layer number -4. Provide clear and significant commit message -5. Provide clear and meaningful PR description - >- Task name should be clarified in title. 
The general format is: [Prefix] Short description of the PR (Suffix) - >- Prefix: add new feature [Feature], fix bug [Fix], related to documents [Docs], in developing [WIP] (which will not be reviewed temporarily) - >- Introduce main changes, results and influences on other modules in short description - >- Associate related issues and pull requests with a milestone diff --git a/docs/deployment/onnx.md b/docs/deployment/onnx.md deleted file mode 100644 index be6c59c5c5dbe3d17d62f4c01c79df35afb19d6d..0000000000000000000000000000000000000000 --- a/docs/deployment/onnx.md +++ /dev/null @@ -1,19 +0,0 @@ -## Introduction of onnx module in MMCV (Experimental) - -### register_extra_symbolics - -Some extra symbolic functions need to be registered before exporting PyTorch model to ONNX. - -#### Example - -```python -import mmcv -from mmcv.onnx import register_extra_symbolics - -opset_version = 11 -register_extra_symbolics(opset_version) -``` - -#### FAQs - -- None diff --git a/docs/Makefile b/docs/en/Makefile similarity index 100% rename from docs/Makefile rename to docs/en/Makefile diff --git a/docs/_static/community/1.png b/docs/en/_static/community/1.png similarity index 100% rename from docs/_static/community/1.png rename to docs/en/_static/community/1.png diff --git a/docs/_static/community/2.png b/docs/en/_static/community/2.png similarity index 100% rename from docs/_static/community/2.png rename to docs/en/_static/community/2.png diff --git a/docs/_static/community/3.png b/docs/en/_static/community/3.png similarity index 100% rename from docs/_static/community/3.png rename to docs/en/_static/community/3.png diff --git a/docs/_static/css/readthedocs.css b/docs/en/_static/css/readthedocs.css similarity index 100% rename from docs/_static/css/readthedocs.css rename to docs/en/_static/css/readthedocs.css diff --git a/docs/_static/flow_img2toimg1.png b/docs/en/_static/flow_img2toimg1.png similarity index 100% rename from docs/_static/flow_img2toimg1.png rename to docs/en/_static/flow_img2toimg1.png diff --git a/docs/_static/flow_raw_images.png b/docs/en/_static/flow_raw_images.png similarity index 100% rename from docs/_static/flow_raw_images.png rename to docs/en/_static/flow_raw_images.png diff --git a/docs/_static/flow_visualization.png b/docs/en/_static/flow_visualization.png similarity index 100% rename from docs/_static/flow_visualization.png rename to docs/en/_static/flow_visualization.png diff --git a/docs/_static/flow_warp.png b/docs/en/_static/flow_warp.png similarity index 100% rename from docs/_static/flow_warp.png rename to docs/en/_static/flow_warp.png diff --git a/docs/_static/flow_warp_diff.png b/docs/en/_static/flow_warp_diff.png similarity index 100% rename from docs/_static/flow_warp_diff.png rename to docs/en/_static/flow_warp_diff.png diff --git a/docs/_static/image/mmcv-logo.png b/docs/en/_static/image/mmcv-logo.png similarity index 100% rename from docs/_static/image/mmcv-logo.png rename to docs/en/_static/image/mmcv-logo.png diff --git a/docs/_static/parallel_progress.gif b/docs/en/_static/parallel_progress.gif similarity index 100% rename from docs/_static/parallel_progress.gif rename to docs/en/_static/parallel_progress.gif diff --git a/docs/_static/parallel_progress.png b/docs/en/_static/parallel_progress.png similarity index 100% rename from docs/_static/parallel_progress.png rename to docs/en/_static/parallel_progress.png diff --git a/docs/_static/progress.gif b/docs/en/_static/progress.gif similarity index 100% rename from docs/_static/progress.gif rename to 
docs/en/_static/progress.gif diff --git a/docs/_static/progress.png b/docs/en/_static/progress.png similarity index 100% rename from docs/_static/progress.png rename to docs/en/_static/progress.png diff --git a/docs/en/_static/qq_group_qrcode.jpg b/docs/en/_static/qq_group_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..8216326ad442c37c706bdf6dc8f7203c532849d2 Binary files /dev/null and b/docs/en/_static/qq_group_qrcode.jpg differ diff --git a/docs/en/_static/wechat_qrcode.jpg b/docs/en/_static/wechat_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1f453ab91436264e5795569e8e3fdc86204024d5 Binary files /dev/null and b/docs/en/_static/wechat_qrcode.jpg differ diff --git a/docs/en/_static/zhihu_qrcode.jpg b/docs/en/_static/zhihu_qrcode.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f71e00615174516f9befa43ef20eff4216bded4c Binary files /dev/null and b/docs/en/_static/zhihu_qrcode.jpg differ diff --git a/docs/api.rst b/docs/en/api.rst similarity index 90% rename from docs/api.rst rename to docs/en/api.rst index 8ca9118c3b033f1b7311ec3c1533ce9c93fa1aa2..5d3e623037e3fb102f8c927ff5909d478a46cab9 100644 --- a/docs/api.rst +++ b/docs/en/api.rst @@ -38,6 +38,11 @@ runner .. automodule:: mmcv.runner :members: +engine +------ +.. automodule:: mmcv.engine + :members: + ops ------ .. automodule:: mmcv.ops diff --git a/docs/en/community/contributing.md b/docs/en/community/contributing.md new file mode 120000 index 0000000000000000000000000000000000000000..72723396444c0a6cc0516f6f2379b2d868ba59f7 --- /dev/null +++ b/docs/en/community/contributing.md @@ -0,0 +1 @@ +../../../CONTRIBUTING.md diff --git a/docs/en/community/pr.md b/docs/en/community/pr.md new file mode 100644 index 0000000000000000000000000000000000000000..12b7535e749109820b60d59776c91f6be25c2fa3 --- /dev/null +++ b/docs/en/community/pr.md @@ -0,0 +1,114 @@ +## Pull Request (PR) + +### What is PR + +`PR` is the abbreviation of `Pull Request`. Here's the definition of `PR` in the [official document](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests) of Github. + +``` +Pull requests let you tell others about changes you have pushed to a branch in a repository on GitHub. Once a pull request is opened, you can discuss and review the potential changes with collaborators and add follow-up commits before your changes are merged into the base branch. +``` + +### Basic Workflow + +1. Get the most recent codebase +2. Checkout a new branch from the master branch +3. Commit your changes +4. Push your changes and create a PR +5. Discuss and review your code +6. Merge your branch to the master branch + +### Procedures in detail + +#### 1. Get the most recent codebase + +- When you work on your first PR + + Fork the OpenMMLab repository: click the **fork** button at the top right corner of Github page + ![avatar](../_static/community/1.png) + + Clone forked repository to local + + ```bash + git clone git@github.com:XXX/mmcv.git + ``` + + Add source repository to upstream + + ```bash + git remote add upstream git@github.com:open-mmlab/mmcv + ``` + +- After your first PR + + Checkout master branch of the local repository and pull the latest master branch of the source repository + + ```bash + git checkout master + git pull upstream master + ``` + +#### 2. 
Checkout a new branch from the master branch + +```bash +git checkout -b branchname +``` + +```{tip} +To make commit history clear, we strongly recommend you check out the master branch before creating a new branch. +``` + +#### 3. Commit your changes + +```bash +# coding +git add [files] +git commit -m 'messages' +``` + +#### 4. Push your changes to the forked repository and create a PR + +- Push the branch to your forked remote repository + + ```bash + git push origin branchname + ``` + +- Create a PR + ![avatar](../_static/community/2.png) + +- Revise the PR message template to describe your motivation and the modifications made in this PR. You can also link the related issue to the PR manually in the PR message (For more information, check out the [official guidance](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue)). + +#### 5. Discuss and review your code + +- After creating a pull request, you can ask a specific person to review the changes you've proposed + ![avatar](../_static/community/3.png) + +- Modify your code according to the reviewers' suggestions and then push your changes + +#### 6. Merge your branch to the master branch and delete the branch + +```bash +git branch -d branchname # delete local branch +git push origin --delete branchname # delete remote branch +``` + +### PR Specs + +1. Use the [pre-commit](https://pre-commit.com) hook to avoid issues of code style + +2. One short-lived branch should be matched with only one PR + +3. Accomplish a detailed change in one PR. Avoid large PRs + + - Bad: Support Faster R-CNN + - Acceptable: Add a box head to Faster R-CNN + - Good: Add a parameter to box head to support custom conv-layer number + +4. Provide clear and significant commit messages + +5. Provide a clear and meaningful PR description + + - Task name should be clarified in title. 
The general format is: \[Prefix\] Short description of the PR (Suffix) + - Prefix: add new feature \[Feature\], fix bug \[Fix\], related to documents \[Docs\], in developing \[WIP\] (which will not be reviewed temporarily) + - Introduce main changes, results and influences on other modules in short description + - Associate related issues and pull requests with a milestone diff --git a/docs/compatibility.md b/docs/en/compatibility.md similarity index 100% rename from docs/compatibility.md rename to docs/en/compatibility.md diff --git a/docs_zh_CN/conf.py b/docs/en/conf.py similarity index 61% rename from docs_zh_CN/conf.py rename to docs/en/conf.py index e0c65d0eeca3bc99ef827b3fa36fc903422e8832..e38dfab1d2673a9bf07dcb9635cab4096e1960c8 100644 --- a/docs_zh_CN/conf.py +++ b/docs/en/conf.py @@ -15,21 +15,19 @@ import os import sys import pytorch_sphinx_theme -from m2r import MdInclude -from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) -version_file = '../mmcv/version.py' -with open(version_file, 'r') as f: +version_file = '../../mmcv/version.py' +with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' -copyright = '2018-2021, OpenMMLab' +copyright = '2018-2022, OpenMMLab' author = 'MMCV Authors' # The short X.Y version @@ -51,14 +49,14 @@ extensions = [ 'sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'sphinx.ext.viewcode', - 'sphinx.ext.autosectionlabel', 'sphinx_markdown_tables', 'myst_parser', 'sphinx_copybutton', ] # yapf: disable +myst_heading_anchors = 4 + autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] -autosectionlabel_prefix_document = True # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -79,7 +77,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'zh_CN' +language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
@@ -108,94 +106,9 @@ html_theme_options = { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, - { - 'name': - '文档', - 'children': [ - { - 'name': 'MMCV', - 'url': 'https://mmcv.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MIM', - 'url': 'https://openmim.readthedocs.io/en/latest/' - }, - { - 'name': 'MMAction2', - 'url': 'https://mmaction2.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMClassification', - 'url': - 'https://mmclassification.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMDetection', - 'url': 'https://mmdetection.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMDetection3D', - 'url': - 'https://mmdetection3d.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMEditing', - 'url': 'https://mmediting.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMGeneration', - 'url': 'https://mmgeneration.readthedocs.io/en/latest/', - }, - { - 'name': 'MMOCR', - 'url': 'https://mmocr.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMPose', - 'url': 'https://mmpose.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMSegmentation', - 'url': - 'https://mmsegmentation.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMTracking', - 'url': 'https://mmtracking.readthedocs.io/zh_CN/latest/', - }, - { - 'name': 'MMFlow', - 'url': 'https://mmflow.readthedocs.io/en/latest/', - }, - { - 'name': 'MMFewShot', - 'url': 'https://mmfewshot.readthedocs.io/zh_CN/latest/', - }, - ] - }, - { - 'name': - 'OpenMMLab', - 'children': [ - { - 'name': '主页', - 'url': 'https://openmmlab.com/' - }, - { - 'name': 'GitHub', - 'url': 'https://github.com/open-mmlab/' - }, - { - 'name': '推特', - 'url': 'https://twitter.com/OpenMMLab' - }, - { - 'name': '知乎', - 'url': 'https://zhihu.com/people/openmmlab' - }, - ] - }, - ] + ], + # Specify the language of shared menu + 'menu_lang': 'en', } # Add any paths that contain custom static files (such as style sheets) here, @@ -288,16 +201,3 @@ StandaloneHTMLBuilder.supported_image_types = [ # Ignore >>> when copying code copybutton_prompt_text = r'>>> |\.\.\. ' copybutton_prompt_is_regexp = True - - -def setup(app): - app.add_config_value('no_underscore_emphasis', False, 'env') - app.add_config_value('m2r_parse_relative_links', False, 'env') - app.add_config_value('m2r_anonymous_references', False, 'env') - app.add_config_value('m2r_disable_inline_math', False, 'env') - app.add_directive('mdinclude', MdInclude) - app.add_config_value('recommonmark_config', { - 'auto_toc_tree_section': 'Contents', - 'enable_eval_rst': True, - }, True) - app.add_transform(AutoStructify) diff --git a/docs/deployment/mmcv_ops_definition.md b/docs/en/deployment/mmcv_ops_definition.md similarity index 80% rename from docs/deployment/mmcv_ops_definition.md rename to docs/en/deployment/mmcv_ops_definition.md index 5696316be5b1fb9234faab74cd83ad579655724e..d7eabb33fd41855116ed975d4e48daea81e4d74d 100644 --- a/docs/deployment/mmcv_ops_definition.md +++ b/docs/en/deployment/mmcv_ops_definition.md @@ -1,7 +1,10 @@ -# Definition of custom operators in MMCV +# MMCV Operators + +To make custom operators in MMCV more standard, precise definitions of each operator are listed in this document. 
-- [Definition of custom operators in MMCV](#definition-of-custom-operators-in-mmcv) + +- [MMCV Operators](#mmcv-operators) - [MMCVBorderAlign](#mmcvborderalign) - [Description](#description) - [Parameters](#parameters) @@ -80,25 +83,26 @@ - [Inputs](#inputs-12) - [Outputs](#outputs-12) - [Type Constraints](#type-constraints-12) -- [torch](#torch) - - [grid_sampler](#grid_sampler) + - [grid_sampler\*](#grid_sampler) - [Description](#description-13) - [Parameters](#parameters-13) - [Inputs](#inputs-13) - [Outputs](#outputs-13) - [Type Constraints](#type-constraints-13) - - [cummax](#cummax) + - [cummax\*](#cummax) - [Description](#description-14) - [Parameters](#parameters-14) - [Inputs](#inputs-14) - [Outputs](#outputs-14) - [Type Constraints](#type-constraints-14) - - [cummin](#cummin) + - [cummin\*](#cummin) - [Description](#description-15) - [Parameters](#parameters-15) - [Inputs](#inputs-15) - [Outputs](#outputs-15) - [Type Constraints](#type-constraints-15) + - [Reminders](#reminders) + ## MMCVBorderAlign @@ -118,9 +122,9 @@ Read [BorderDet: Border Feature for Dense Object Detection](ttps://arxiv.org/abs ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). | +| Type | Parameter | Description | +| ----- | ----------- | ----------------------------------------------------------------------------------- | +| `int` | `pool_size` | number of positions sampled over the boxes' borders(e.g. top, bottom, left, right). | ### Inputs @@ -152,11 +156,11 @@ Read [CARAFE: Content-Aware ReAssembly of FEatures](https://arxiv.org/abs/1905.0 ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `int` | `kernel_size` | reassemble kernel size, should be odd integer| -| `int` | `group_size` | reassemble group size | -| `float` | `scale_factor` | upsample ratio(>=1) | +| Type | Parameter | Description | +| ------- | -------------- | --------------------------------------------- | +| `int` | `kernel_size` | reassemble kernel size, should be odd integer | +| `int` | `group_size` | reassemble group size | +| `float` | `scale_factor` | upsample ratio(>=1) | ### Inputs @@ -187,8 +191,7 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +None ### Inputs @@ -219,8 +222,7 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +None ### Inputs @@ -242,7 +244,6 @@ Read [CCNet: Criss-Cross Attention for SemanticSegmentation](https://arxiv.org/p - T:tensor(float32) - ## MMCVCornerPool ### Description @@ -251,9 +252,9 @@ Perform CornerPool on `input` features. 
Read [CornerNet -- Detecting Objects as ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------- | -| `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | +| Type | Parameter | Description | +| ----- | --------- | ---------------------------------------------------------------- | +| `int` | `mode` | corner pool mode, (0: `top`, 1: `bottom`, 2: `left`, 3: `right`) | ### Inputs @@ -283,15 +284,15 @@ Read [Deformable Convolutional Networks](https://arxiv.org/pdf/1703.06211.pdf) f ### Parameters -| Type | Parameter | Description | -| -------------- | ------------------ | ------------------------------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`. | -| `list of ints` | `padding` | Paddings on both sides of the input, (padH, padW). Defaults to `(0, 0)`. | -| `list of ints` | `dilation` | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`. | -| `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`.| -| `int` | `deformable_groups` | Groups of deformable offset. Defaults to `1`. | -| `int` | `bias` | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. | -| `int` | `im2col_step` | Groups of deformable offset. Defaults to `32`. | +| Type | Parameter | Description | +| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel, (sH, sW). Defaults to `(1, 1)`. | +| `list of ints` | `padding` | Paddings on both sides of the input, (padH, padW). Defaults to `(0, 0)`. | +| `list of ints` | `dilation` | The spacing between kernel elements (dH, dW). Defaults to `(1, 1)`. | +| `int` | `groups` | Split input into groups. `input_channel` should be divisible by the number of groups. Defaults to `1`. | +| `int` | `deformable_groups` | Groups of deformable offset. Defaults to `1`. | +| `int` | `bias` | Whether to add a learnable bias to the output. `0` stands for `False` and `1` stands for `True`. Defaults to `0`. | +| `int` | `im2col_step` | Groups of deformable offset. Defaults to `32`. | ### Inputs @@ -323,11 +324,11 @@ Perform Modulated Deformable Convolution on input feature, read [Deformable Conv ### Parameters -| Type | Parameter | Description | -| -------------- | ------------------ | ------------------------------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | -| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | -| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | +| Type | Parameter | Description | +| -------------- | ------------------- | ------------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW) | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW) | +| `list of ints` | `dilation` | The spacing between kernel elements. (dH, dW) | | `int` | `deformable_groups` | Groups of deformable offset. | | `int` | `groups` | Split input into groups. 
`input_channel` should be divisible by the number of groups. | @@ -365,13 +366,13 @@ Deformable roi pooling layer ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +| Type | Parameter | Description | +| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | -| `float` | `gamma` | gamma | +| `float` | `gamma` | gamma | ### Inputs @@ -404,10 +405,10 @@ Read [Pixel Recurrent Neural Networks](https://arxiv.org/abs/1601.06759) for mor ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** | -| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`. | +| Type | Parameter | Description | +| -------------- | --------- | -------------------------------------------------------------------------------- | +| `list of ints` | `stride` | The stride of the convolving kernel. (sH, sW). **Only support stride=1 in mmcv** | +| `list of ints` | `padding` | Paddings on both sides of the input. (padH, padW). Defaults to `(0, 0)`. | ### Inputs @@ -443,10 +444,10 @@ Read [PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://hs ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | -| `int` | `psa_type` | `0` means collect and `1` means `distribute` | -| `list of ints` | `mask_size` | The size of mask | +| Type | Parameter | Description | +| -------------- | ----------- | -------------------------------------------- | +| `int` | `psa_type` | `0` means collect and `1` means `distribute` | +| `list of ints` | `mask_size` | The size of mask | ### Inputs @@ -478,9 +479,9 @@ Note this definition is slightly different with [onnx: NonMaxSuppression](https: | Type | Parameter | Description | | ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `int` | `center_point_box` | 0 - the box data is supplied as [y1, x1, y2, x2], 1-the box data is supplied as [x_center, y_center, width, height]. | +| `int` | `center_point_box` | 0 - the box data is supplied as \[y1, x1, y2, x2\], 1-the box data is supplied as \[x_center, y_center, width, height\]. | | `int` | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. | -| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | | `float` | `score_threshold` | The threshold for deciding when to remove boxes based on score. 
| | `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | @@ -543,7 +544,6 @@ Perform RoIAlign on output feature, used in bbox_head of most two-stage detector - T:tensor(float32) - ## MMCVRoIAlignRotated ### Description @@ -552,15 +552,15 @@ Perform RoI align pooling for rotated proposals ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | -------------------------------------------------------------- | +| Type | Parameter | Description | +| ------- | ---------------- | ------------------------------------------------------------------------------------------------------------- | | `int` | `output_height` | height of output roi | | `int` | `output_width` | width of output roi | | `float` | `spatial_scale` | used to scale the input boxes | | `int` | `sampling_ratio` | number of input samples to take for each output sample. `0` means to take samples densely for current models. | | `str` | `mode` | pooling mode in each bin. `avg` or `max` | | `int` | `aligned` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | -| `int` | `clockwise` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | +| `int` | `clockwise` | If `aligned=0`, use the legacy implementation in MMDetection. Else, align the results more perfectly. | ### Inputs @@ -581,9 +581,7 @@ Perform RoI align pooling for rotated proposals - T:tensor(float32) -# torch - -## grid_sampler +## grid_sampler\* ### Description @@ -619,7 +617,7 @@ Check [torch.nn.functional.grid_sample](https://pytorch.org/docs/stable/generate - T:tensor(float32, Linear) -## cummax +## cummax\* ### Description @@ -627,9 +625,9 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum e ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------- | -| `int` | `dim` | the dimension to do the operation over | +| Type | Parameter | Description | +| ----- | --------- | -------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | ### Inputs @@ -651,7 +649,7 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative maximum e - T:tensor(float32) -## cummin +## cummin\* ### Description @@ -659,9 +657,9 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum e ### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------- | -| `int` | `dim` | the dimension to do the operation over | +| Type | Parameter | Description | +| ----- | --------- | -------------------------------------- | +| `int` | `dim` | the dimension to do the operation over | ### Inputs @@ -682,3 +680,7 @@ Returns a tuple (`values`, `indices`) where `values` is the cumulative minimum e ### Type Constraints - T:tensor(float32) + +## Reminders + +- Operators endwith `*` are defined in Torch and are included here for the conversion to ONNX. diff --git a/docs/en/deployment/onnx.md b/docs/en/deployment/onnx.md new file mode 100644 index 0000000000000000000000000000000000000000..528a9fdb91a4306bb41edf242efa9705a8a52c37 --- /dev/null +++ b/docs/en/deployment/onnx.md @@ -0,0 +1,28 @@ +## Introduction of mmcv.onnx module + +### DeprecationWarning + +ONNX support will be deprecated in the future. 
+Welcome to use the unified model deployment toolbox MMDeploy: https://github.com/open-mmlab/mmdeploy + +### register_extra_symbolics + +Some extra symbolic functions need to be registered before exporting PyTorch model to ONNX. + +#### Example + +```python +import mmcv +from mmcv.onnx import register_extra_symbolics + +opset_version = 11 +register_extra_symbolics(opset_version) +``` + +#### Reminder + +- *Please note that this feature is experimental and may change in the future.* + +#### FAQs + +- None diff --git a/docs/deployment/onnxruntime_custom_ops.md b/docs/en/deployment/onnxruntime_custom_ops.md similarity index 98% rename from docs/deployment/onnxruntime_custom_ops.md rename to docs/en/deployment/onnxruntime_custom_ops.md index baaa576f6d789f0eb53b4005dec537de5e06e700..85df4e2a2ee31e1b1097ff270af5b710f3244a87 100644 --- a/docs/deployment/onnxruntime_custom_ops.md +++ b/docs/en/deployment/onnxruntime_custom_ops.md @@ -1,8 +1,8 @@ -## Onnxruntime Custom Ops +## ONNX Runtime Custom Ops -- [Onnxruntime Custom Ops](#onnxruntime-custom-ops) +- [ONNX Runtime Custom Ops](#onnx-runtime-custom-ops) - [SoftNMS](#softnms) - [Description](#description) - [Parameters](#parameters) @@ -143,10 +143,10 @@ Filter out boxes has high IoU overlap with previously selected boxes. #### Parameters -| Type | Parameter | Description | -| ------- | --------------- | ---------------------------------------------------------------------------------------------------------------- | -| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | -| `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | +| Type | Parameter | Description | +| ------- | --------------- | ------------------------------------------------------------------------------------------------------------------ | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | +| `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | #### Inputs @@ -338,13 +338,13 @@ Perform Modulated Deformable Convolution on input feature, read [Deformable Conv - T:tensor(float32, Linear) -## MMCVDeformConv2d +### MMCVDeformConv2d -### Description +#### Description Perform Deformable Convolution on input feature, read [Deformable Convolutional Network](https://arxiv.org/abs/1703.06211) for detail. -### Parameters +#### Parameters | Type | Parameter | Description | | -------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------- | @@ -355,7 +355,7 @@ Perform Deformable Convolution on input feature, read [Deformable Convolutional | `int` | `group` | Split input into groups. `input_channel` should be divisible by the number of groups. | | `int` | `im2col_step` | DeformableConv2d use im2col to compute convolution. im2col_step is used to split input and offset, reduce memory usage of column. | -### Inputs +#### Inputs
inputs[0]: T
@@ -366,13 +366,13 @@ Perform Deformable Convolution on input feature, read [Deformable Convolutional
Input weight; 4-D tensor of shape (output_channel, input_channel, kH, kW).
-### Outputs +#### Outputs
outputs[0]: T
Output feature; 4-D tensor of shape (N, output_channel, outH, outW).
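A minimal Python sketch of the `mmcv.ops` layer that is assumed to correspond to the `MMCVDeformConv2d` node described above (`DeformConv2dPack` predicts its own offsets; the shapes and argument values are illustrative only, and depending on the mmcv-full build the op may require a CUDA device):

```python
import torch
from mmcv.ops import DeformConv2dPack

# DeformConv2dPack predicts its own offsets from the input feature map.
# Depending on how mmcv-full was built, this op may need to run on a GPU.
conv = DeformConv2dPack(in_channels=3, out_channels=8, kernel_size=3, padding=1)
x = torch.rand(1, 3, 32, 32)
if torch.cuda.is_available():
    conv, x = conv.cuda(), x.cuda()
out = conv(x)  # 4-D output feature of shape (1, 8, 32, 32)
```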
-### Type Constraints +#### Type Constraints - T:tensor(float32, Linear) diff --git a/docs/deployment/onnxruntime_op.md b/docs/en/deployment/onnxruntime_op.md similarity index 65% rename from docs/deployment/onnxruntime_op.md rename to docs/en/deployment/onnxruntime_op.md index f17b32a0647e2f25b1736580f385e7ae1fcb8163..2778ba3448813ca1e63fa250c4fc99e170dea736 100644 --- a/docs/deployment/onnxruntime_op.md +++ b/docs/en/deployment/onnxruntime_op.md @@ -1,4 +1,9 @@ -## Custom operators for ONNX Runtime in MMCV +## ONNX Runtime Deployment + +### DeprecationWarning + +ONNX support will be deprecated in the future. +Welcome to use the unified model deployment toolbox MMDeploy: https://github.com/open-mmlab/mmdeploy ### Introduction of ONNX Runtime @@ -15,15 +20,15 @@ ### List of operators for ONNX Runtime supported in MMCV -| Operator | CPU | GPU | MMCV Releases | -| :----------------------------------------------------: | :---: | :---: | :-----------: | -| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | -| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | -| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | -| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | -| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | -| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | master | -| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | master | +| Operator | CPU | GPU | MMCV Releases | +| :----------------------------------------------------- | :-: | :-: | :-----------: | +| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | +| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | +| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | +| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | +| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | +| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | 1.3.4 | +| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | 1.3.4 | ### How to build custom operators for ONNX Runtime @@ -88,7 +93,10 @@ onnx_results = sess.run(None, {'input' : input_data}) #### Reminder +- *Please note that this feature is experimental and may change in the future. Strongly suggest users always try with the latest master branch.* + - The custom operator is not included in [supported operator list](https://github.com/microsoft/onnxruntime/blob/master/docs/OperatorKernels.md) in ONNX Runtime. + - The custom operator should be able to be exported to ONNX. #### Main procedures @@ -96,18 +104,20 @@ onnx_results = sess.run(None, {'input' : input_data}) Take custom operator `soft_nms` for example. 1. Add header `soft_nms.h` to ONNX Runtime include directory `mmcv/ops/csrc/onnxruntime/` + 2. Add source `soft_nms.cpp` to ONNX Runtime source directory `mmcv/ops/csrc/onnxruntime/cpu/` -3. Register `soft_nms` operator in [onnxruntime_register.cpp](../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp) - ```c++ - #include "soft_nms.h" +3. Register `soft_nms` operator in [onnxruntime_register.cpp](../../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp) + + ```c++ + #include "soft_nms.h" - SoftNmsOp c_SoftNmsOp; + SoftNmsOp c_SoftNmsOp; - if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { - return status; - } - ``` + if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { + return status; + } + ``` 4. Add unit test into `tests/test_ops/test_onnx.py` Check [here](../../tests/test_ops/test_onnx.py) for examples. 
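Since the hunks above only carry a fragment of the Python inference snippet (`onnx_results = sess.run(None, {'input' : input_data})`), here is a hedged sketch of the usual pattern for loading the compiled custom-op library at inference time; the model file `tmp.onnx`, the input name `input`, and the input shape are placeholders rather than artifacts shipped with MMCV:

```python
import os

import numpy as np
import onnxruntime as ort
from mmcv.ops import get_onnxruntime_op_path

# Locate the custom-op library built together with mmcv-full; the returned
# path is empty if mmcv-full was compiled without ONNX Runtime ops.
ort_custom_op_path = get_onnxruntime_op_path()
assert os.path.exists(ort_custom_op_path), 'custom ops for ONNX Runtime not built'

# Register the library before creating the inference session.
session_options = ort.SessionOptions()
session_options.register_custom_ops_library(ort_custom_op_path)

# 'tmp.onnx' and the input name 'input' are placeholders for a model that was
# exported with the MMCV custom operators.
sess = ort.InferenceSession('tmp.onnx', session_options)
input_data = np.random.randn(1, 3, 224, 224).astype(np.float32)
onnx_results = sess.run(None, {'input': input_data})
```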
@@ -117,10 +127,10 @@ Take custom operator `soft_nms` for example. ### Known Issues - "RuntimeError: tuple appears in op that does not forward tuples, unsupported kind: `prim::PythonOp`." - 1. Note generally `cummax` or `cummin` is exportable to ONNX as long as the torch version >= 1.5.0, since `torch.cummax` is only supported with torch >= 1.5.0. But when `cummax` or `cummin` serves as an intermediate component whose outputs is used as inputs for another modules, it's expected that torch version must be >= 1.7.0. Otherwise the above error might arise, when running exported ONNX model with onnxruntime. - 2. Solution: update the torch version to 1.7.0 or higher. + 1. Note generally `cummax` or `cummin` is exportable to ONNX as long as the torch version >= 1.5.0, since `torch.cummax` is only supported with torch >= 1.5.0. But when `cummax` or `cummin` serves as an intermediate component whose outputs is used as inputs for another modules, it's expected that torch version must be >= 1.7.0. Otherwise the above error might arise, when running exported ONNX model with onnxruntime. + 2. Solution: update the torch version to 1.7.0 or higher. ### References - [How to export Pytorch model with custom op to ONNX and run it in ONNX Runtime](https://github.com/onnx/tutorials/blob/master/PyTorchCustomOperator/README.md) -- [How to add a custom operator/kernel in ONNX Runtime](https://github.com/microsoft/onnxruntime/blob/master/docs/AddingCustomOp.md) +- [How to add a custom operator/kernel in ONNX Runtime](https://onnxruntime.ai/docs/reference/operators/add-custom-op.html) diff --git a/docs/deployment/tensorrt_custom_ops.md b/docs/en/deployment/tensorrt_custom_ops.md similarity index 96% rename from docs/deployment/tensorrt_custom_ops.md rename to docs/en/deployment/tensorrt_custom_ops.md index be47e355be6316295ca18f12450630e9fe6d3854..37ebb27bf20870b944fe9cca1e029f2499957245 100644 --- a/docs/deployment/tensorrt_custom_ops.md +++ b/docs/en/deployment/tensorrt_custom_ops.md @@ -102,7 +102,7 @@ detectors. #### Description -ScatterND takes three inputs `data` tensor of rank r >= 1, `indices` tensor of rank q >= 1, and `updates` tensor of rank q + r - indices.shape[-1] - 1. The output of the operation is produced by creating a copy of the input `data`, and then updating its value to values specified by updates at specific index positions specified by `indices`. Its output shape is the same as the shape of `data`. Note that `indices` should not have duplicate entries. That is, two or more updates for the same index-location is not supported. +ScatterND takes three inputs `data` tensor of rank r >= 1, `indices` tensor of rank q >= 1, and `updates` tensor of rank q + r - indices.shape\[-1\] - 1. The output of the operation is produced by creating a copy of the input `data`, and then updating its value to values specified by updates at specific index positions specified by `indices`. Its output shape is the same as the shape of `data`. Note that `indices` should not have duplicate entries. That is, two or more updates for the same index-location is not supported. 
The `output` is calculated via the following equation: @@ -151,9 +151,9 @@ Filter out boxes has high IoU overlap with previously selected boxes or low scor | Type | Parameter | Description | | ------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -| `int` | `center_point_box` | 0 - the box data is supplied as [y1, x1, y2, x2], 1-the box data is supplied as [x_center, y_center, width, height]. | +| `int` | `center_point_box` | 0 - the box data is supplied as \[y1, x1, y2, x2\], 1-the box data is supplied as \[x_center, y_center, width, height\]. | | `int` | `max_output_boxes_per_class` | The maximum number of boxes to be selected per batch per class. Default to 0, number of output boxes equal to number of input boxes. | -| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range [0, 1]. Default to 0. | +| `float` | `iou_threshold` | The threshold for deciding whether boxes overlap too much with respect to IoU. Value range \[0, 1\]. Default to 0. | | `float` | `score_threshold` | The threshold for deciding when to remove boxes based on score. | | `int` | `offset` | 0 or 1, boxes' width or height is (x2 - x1 + offset). | diff --git a/docs/deployment/tensorrt_plugin.md b/docs/en/deployment/tensorrt_plugin.md similarity index 72% rename from docs/deployment/tensorrt_plugin.md rename to docs/en/deployment/tensorrt_plugin.md index cd8924e33e5183516dcc86d5dc5b2fd786a54f87..de7809b6aac64c126ec8b8cfd3291d65053f60e5 100644 --- a/docs/deployment/tensorrt_plugin.md +++ b/docs/en/deployment/tensorrt_plugin.md @@ -1,8 +1,14 @@ -## TensorRT Plugins for custom operators in MMCV (Experimental) +## TensorRT Deployment + +### DeprecationWarning + +TensorRT support will be deprecated in the future. 
+Welcome to use the unified model deployment toolbox MMDeploy: https://github.com/open-mmlab/mmdeploy -- [TensorRT Plugins for custom operators in MMCV (Experimental)](#tensorrt-plugins-for-custom-operators-in-mmcv-experimental) +- [TensorRT Deployment](#tensorrt-deployment) + - [DeprecationWarning](#deprecationwarning) - [Introduction](#introduction) - [List of TensorRT plugins supported in MMCV](#list-of-tensorrt-plugins-supported-in-mmcv) - [How to build TensorRT plugins in MMCV](#how-to-build-tensorrt-plugins-in-mmcv) @@ -24,17 +30,17 @@ To ease the deployment of trained models with custom operators from `mmcv.ops` u ### List of TensorRT plugins supported in MMCV -| ONNX Operator | TensorRT Plugin | MMCV Releases | -| :-----------------------: | :-----------------------------------------------------------------------------: | :-----------: | -| MMCVRoiAlign | [MMCVRoiAlign](./tensorrt_custom_ops.md#mmcvroialign) | 1.2.6 | -| ScatterND | [ScatterND](./tensorrt_custom_ops.md#scatternd) | 1.2.6 | -| NonMaxSuppression | [NonMaxSuppression](./tensorrt_custom_ops.md#nonmaxsuppression) | 1.3.0 | -| MMCVDeformConv2d | [MMCVDeformConv2d](./tensorrt_custom_ops.md#mmcvdeformconv2d) | 1.3.0 | -| grid_sampler | [grid_sampler](./tensorrt_custom_ops.md#grid-sampler) | 1.3.1 | -| cummax | [cummax](./tensorrt_custom_ops.md#cummax) | 1.3.5 | -| cummin | [cummin](./tensorrt_custom_ops.md#cummin) | 1.3.5 | +| ONNX Operator | TensorRT Plugin | MMCV Releases | +| :------------------------ | :------------------------------------------------------------------------------ | :-----------: | +| MMCVRoiAlign | [MMCVRoiAlign](./tensorrt_custom_ops.md#mmcvroialign) | 1.2.6 | +| ScatterND | [ScatterND](./tensorrt_custom_ops.md#scatternd) | 1.2.6 | +| NonMaxSuppression | [NonMaxSuppression](./tensorrt_custom_ops.md#nonmaxsuppression) | 1.3.0 | +| MMCVDeformConv2d | [MMCVDeformConv2d](./tensorrt_custom_ops.md#mmcvdeformconv2d) | 1.3.0 | +| grid_sampler | [grid_sampler](./tensorrt_custom_ops.md#grid-sampler) | 1.3.1 | +| cummax | [cummax](./tensorrt_custom_ops.md#cummax) | 1.3.5 | +| cummin | [cummin](./tensorrt_custom_ops.md#cummin) | 1.3.5 | | MMCVInstanceNormalization | [MMCVInstanceNormalization](./tensorrt_custom_ops.md#mmcvinstancenormalization) | 1.3.5 | -| MMCVModulatedDeformConv2d | [MMCVModulatedDeformConv2d](./tensorrt_custom_ops.md#mmcvmodulateddeformconv2d) | master | +| MMCVModulatedDeformConv2d | [MMCVModulatedDeformConv2d](./tensorrt_custom_ops.md#mmcvmodulateddeformconv2d) | 1.3.8 | Notes @@ -75,6 +81,10 @@ pip install $TENSORRT_DIR/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl For more detailed information of installing TensorRT using tar, please refer to [Nvidia' website](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-721/install-guide/index.html#installing-tar). +- Install cuDNN + +Install cuDNN 8 following [Nvidia' website](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar). + #### Build on Linux ```bash @@ -142,27 +152,32 @@ Below are the main steps: **Take RoIAlign plugin `roi_align` for example.** 1. Add header `trt_roi_align.hpp` to TensorRT include directory `mmcv/ops/csrc/tensorrt/` + 2. Add source `trt_roi_align.cpp` to TensorRT source directory `mmcv/ops/csrc/tensorrt/plugins/` + 3. Add cuda kernel `trt_roi_align_kernel.cu` to TensorRT source directory `mmcv/ops/csrc/tensorrt/plugins/` + 4. 
Register `roi_align` plugin in [trt_plugin.cpp](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp) - ```c++ - #include "trt_plugin.hpp" + ```c++ + #include "trt_plugin.hpp" - #include "trt_roi_align.hpp" + #include "trt_roi_align.hpp" - REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); - extern "C" { - bool initLibMMCVInferPlugins() { return true; } - } // extern "C" - ``` + extern "C" { + bool initLibMMCVInferPlugins() { return true; } + } // extern "C" + ``` 5. Add unit test into `tests/test_ops/test_tensorrt.py` Check [here](https://github.com/open-mmlab/mmcv/blob/master/tests/test_ops/test_tensorrt.py) for examples. #### Reminders +- *Please note that this feature is experimental and may change in the future. Strongly suggest users always try with the latest master branch.* + - Some of the [custom ops](https://mmcv.readthedocs.io/en/latest/ops.html) in `mmcv` have their own CUDA implementations, which can be referred to. ### Known Issues diff --git a/docs/en/faq.md b/docs/en/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..02d31c233a9ff66d5e8f3f288b5d5f64e5c5298c --- /dev/null +++ b/docs/en/faq.md @@ -0,0 +1,93 @@ +## Frequently Asked Questions + +We list some common troubles faced by many users and their corresponding solutions here. +Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. + +### Installation + +- KeyError: "xxx: 'yyy is not in the zzz registry'" + + The registry mechanism will be triggered only when the file of the module is imported. + So you need to import that file somewhere. More details can be found at [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974). + +- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" + + 1. Uninstall existing mmcv in the environment using `pip uninstall mmcv` + 2. Install mmcv-full following the [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) or [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) + +- "invalid device function" or "no kernel image is available for execution" + + 1. Check the CUDA compute capability of your GPU + 2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built for the correct GPU architecture. You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. The compatibility issue could happen when using old GPUs, e.g., Tesla K80 (3.7) on colab. + 3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. For example, you may compile mmcv using CUDA 10.0 but run it in CUDA 9.0 environments + +- "undefined symbol" or "cannot open xxx.so" + + 1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check + whether the CUDA/GCC runtimes are the same as those used for compiling mmcv + 2. If those symbols are PyTorch symbols (e.g., symbols containing caffe, aten, and TH), check whether the PyTorch version is the same as that used for compiling mmcv + 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, and MMCV are built by and running on the same environment + +- "RuntimeError: CUDA error: invalid configuration argument" + + This error may be caused by the poor performance of GPU. 
Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) + and recompile mmcv. + +- "RuntimeError: nms is not compiled with GPU support" + + This error is because your CUDA environment is not installed correctly. + You may try to re-install your CUDA environment and then delete the build/ folder before re-compiling mmcv. + +- "Segmentation fault" + + 1. Check your GCC version and use GCC >= 5.4. This is usually caused by the incompatibility between PyTorch and the environment (e.g., GCC \< 4.9 for PyTorch). We also recommend users avoid using GCC 5.5 because many users report that GCC 5.5 will cause "segmentation fault" and simply changing it to GCC 5.4 could solve the problem + 2. Check whether PyTorch is correctly installed and can use CUDA ops, e.g. type the following command in your terminal and see whether it correctly outputs results + ```shell + python -c 'import torch; print(torch.cuda.is_available())' + ``` + 3. If PyTorch is correctly installed, check whether MMCV is correctly installed. If MMCV is correctly installed, then there will be no issue with the command + ```shell + python -c 'import mmcv; import mmcv.ops' + ``` + 4. If MMCV and PyTorch are correctly installed, you can use `ipdb` to set breakpoints or directly add `print` to debug and see which part leads to the `segmentation fault` + +- "libtorch_cuda_cu.so: cannot open shared object file" + + `mmcv-full` depends on the shared object but it cannot be found. We can check whether the object exists in `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` or try to re-install PyTorch. + +- "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" + + If you are building mmcv-full on Windows and the version of CUDA is 9.2, you will probably encounter the error `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`, in which case you can use a lower version of Microsoft Visual Studio like vs2017. + +- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" + + If your version of PyTorch is 1.5.0 and you are building mmcv-full on Windows, you will probably encounter the error `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`. The way to solve the error is to replace all the `static constexpr bool all_slots = false;` with `static bool all_slots = false;` at this file `https://github.com/pytorch/pytorch/blob/v1.5.0/torch/csrc/jit/api/module.h`. More details can be found at [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394). + +- "error: a member with an in-class initializer must be const" + + If your version of PyTorch is 1.6.0 and you are building mmcv-full on Windows, you will probably encounter the error `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. The way to solve the error is to replace all the `CONSTEXPR_EXCEPT_WIN_CUDA ` with `const` at `torch/include\torch/csrc/jit/api/module.h`. More details can be found at [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575). 
+ +- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" + + If your version of PyTorch is 1.7.0 and you are building mmcv-full on Windows, you will probably encounter the error `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. The way to solve the error is to modify several local files of PyTorch: + + - delete `static constexpr Symbol Kind = ::c10::prim::profile;` and `static constexpr Symbol Kind = ::c10::prim::profile_optional;` at `torch/include\torch/csrc/jit/ir/ir.h` + - replace `explicit operator type&() { return *(this->value); }` with `explicit operator type&() { return *((type*)this->value); }` at `torch\include\pybind11\cast.h` + - replace all the `CONSTEXPR_EXCEPT_WIN_CUDA` with `const` at `torch/include\torch/csrc/jit/api/module.h` + + More details can be found at [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956). + +- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer" + + Please install the correct version of MMCV for the version of your MMDetection following the [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation). + +### Usage + +- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" + + 1. This error indicates that your module has parameters that were not used in producing loss. This phenomenon may be caused by running different branches in your code in DDP mode. More details at [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582). + 2. You can set ` find_unused_parameters = True` in the config to solve the above problems or find those unused parameters manually + +- "RuntimeError: Trying to backward through the graph a second time" + + `GradientCumulativeOptimizerHook` and `OptimizerHook` are both set, which causes `loss.backward()` to be called twice, so a `RuntimeError` is raised. We can only use one of them. More details at [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379). diff --git a/docs/get_started/build.md b/docs/en/get_started/build.md similarity index 61% rename from docs/get_started/build.md rename to docs/en/get_started/build.md index 758a83a4fb84398c9e192df37f7778a736109813..d987c1e17e2e91f232cb733ac7bc1f425dba27a8 100644 --- a/docs/get_started/build.md +++ b/docs/en/get_started/build.md @@ -9,6 +9,12 @@ git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` +It is recommended to install `ninja` to speed up the compilation + +```bash +pip install -r requirements/optional.txt +``` + You can either - install the lite version @@ -40,6 +46,7 @@ If you would like to use `opencv-python-headless` instead of `opencv-python`, e.g., in a minimum container environment or servers without GUI, you can first install it before installing MMCV to skip the installation of `opencv-python`. ``` + ### Build on Windows Building MMCV on Windows is a bit more complicated than that on Linux. @@ -68,35 +75,41 @@ You should know how to set up environment variables, especially `Path`, on Windo 1. Launch Anaconda prompt from Windows Start menu - Do not use raw `cmd.exe` s instruction is based on PowerShell syntax. + Do not use raw `cmd.exe`, as the instructions are based on PowerShell syntax. -1. Create a new conda environment +2. 
Create a new conda environment - ```shell - conda create --name mmcv python=3.7 # 3.6, 3.7, 3.8 should work too as tested - conda activate mmcv # make sure to activate environment before any operation - ``` + ```shell + conda create --name mmcv python=3.7 # 3.6, 3.7, 3.8 should work too as tested + conda activate mmcv # make sure to activate environment before any operation + ``` -1. Install PyTorch. Choose a version based on your need. +3. Install PyTorch. Choose a version based on your need. - ```shell - conda install pytorch torchvision cudatoolkit=10.2 -c pytorch - ``` + ```shell + conda install pytorch torchvision cudatoolkit=10.2 -c pytorch + ``` - We only tested PyTorch version >= 1.6.0. + We only tested PyTorch version >= 1.6.0. -1. Prepare MMCV source code +4. Prepare MMCV source code - ```shell - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` + ```shell + git clone https://github.com/open-mmlab/mmcv.git + cd mmcv + ``` -1. Install required Python packages +5. Install required Python packages - ```shell - pip3 install -r requirements.txt - ``` + ```shell + pip3 install -r requirements/runtime.txt + ``` + +6. It is recommended to install `ninja` to speed up the compilation + + ```bash + pip install -r requirements/optional.txt + ``` #### Build and install MMCV @@ -106,11 +119,11 @@ MMCV can be built in three ways: In this way, no custom ops are compiled and mmcv is a pure python package. -1. Full version (CPU ops) +2. Full version (CPU ops) Module `ops` will be compiled as a pytorch extension, but only x86 code will be compiled. The compiled ops can be executed on CPU only. -1. Full version (CUDA ops) +3. Full version (CUDA ops) Both x86 and CUDA codes of `ops` module will be compiled. The compiled version can be run on both CPU and CUDA-enabled GPU (if implemented). @@ -118,19 +131,19 @@ MMCV can be built in three ways: 1. Set up MSVC compiler - Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below. + Set Environment variable, add `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` to `PATH`, so that `cl.exe` will be available in prompt, as shown below. - ```none - (base) PS C:\Users\xxx> cl - Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 - Copyright (C) Microsoft Corporation. All rights reserved. + ```none + (base) PS C:\Users\xxx> cl + Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 + Copyright (C) Microsoft Corporation. All rights reserved. - usage: cl [ option... ] filename... [ / link linkoption... ] - ``` + usage: cl [ option... ] filename... [ / link linkoption... ] + ``` - For compatibility, we use the x86-hosted and x64-targeted compiler. note `Hostx86\x64` in the path. + For compatibility, we use the x86-hosted and x64-targeted compiler. note `Hostx86\x64` in the path. - You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English. + You may want to change the system language to English because pytorch will parse text output from `cl.exe` to check its version. However only utf-8 is recognized. 
Navigate to Control Panel -> Region -> Administrative -> Language for Non-Unicode programs and change it to English. ##### Option 1: Build MMCV (lite version) @@ -150,32 +163,34 @@ pip list ##### Option 2: Build MMCV (full version with CPU) 1. Finish above common steps -1. Set up environment variables - - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # based on your available number of CPU cores and amount of memory - ``` - -1. Following build steps of the lite version - - ```shell - # activate environment - conda activate mmcv - # change directory - cd mmcv - # build - python setup.py build_ext # if success, cl will be launched to compile ops - # install - python setup.py develop - # check - pip list - ``` + +2. Set up environment variables + + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # based on your available number of CPU cores and amount of memory + ``` + +3. Following build steps of the lite version + + ```shell + # activate environment + conda activate mmcv + # change directory + cd mmcv + # build + python setup.py build_ext # if success, cl will be launched to compile ops + # install + python setup.py develop + # check + pip list + ``` ##### Option 3: Build MMCV (full version with CUDA) 1. Finish above common steps -1. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below: + +2. Make sure `CUDA_PATH` or `CUDA_HOME` is already set in `envs` via `ls env:`, desired output is shown as below: ```none (base) PS C:\Users\WRH> ls env: @@ -197,7 +212,7 @@ pip list $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # if CUDA_PATH_V10_2 is in envs: ``` -1. Set CUDA target arch +3. Set CUDA target arch ```shell # Suppose you are using GTX 1080, which is of capability 6.1 @@ -210,7 +225,7 @@ pip list Check your the compute capability of your GPU from [here](https://developer.nvidia.com/cuda-gpus). ``` -1. Launch compiling the same way as CPU +4. Launch compiling the same way as CPU ```shell $env:MMCV_WITH_OPS = 1 @@ -232,3 +247,23 @@ If you are compiling against PyTorch 1.6.0, you might meet some errors from PyTo ``` If you meet issues when running or compiling mmcv, we list some common issues in [Frequently Asked Question](../faq.html). + +## \[Optional\] Build MMCV on IPU machine + +Firstly, you need to apply for an IPU cloud machine, see [here](https://www.graphcore.ai/ipus-in-the-cloud). + +### Option 1: Docker + +1. Pull docker + +```shell + docker pull graphcore/pytorch +``` + +2. Build MMCV under same python environment + +### Option 2: Install from SDK + +1. Build MMCV + +2. Use pip to install sdk according to [IPU PyTorch document](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/installation.html). Also, you need to apply for machine and sdk to Graphcore. diff --git a/docs/get_started/installation.md b/docs/en/get_started/installation.md similarity index 75% rename from docs/get_started/installation.md rename to docs/en/get_started/installation.md index 0c64ea825cad548f21c2b41a9538f9447b7431b8..d9fd1b33607684b5c2c39fdc4d86635e0e41e263 100644 --- a/docs/get_started/installation.md +++ b/docs/en/get_started/installation.md @@ -3,7 +3,7 @@ There are two versions of MMCV: - **mmcv-full**: comprehensive, with full features and various CUDA ops out of box. It takes longer time to build. -- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv<1.0.0. It is useful when you do not need those CUDA ops. +- **mmcv**: lite, without CUDA ops but all other features, similar to mmcv\<1.0.0. 
It is useful when you do not need those CUDA ops. ```{warning} Do not install both versions in the same environment, otherwise you may encounter errors like `ModuleNotFound`. You need to uninstall one before installing the other. `Installing the full version is highly recommended if CUDA is avaliable`. @@ -13,36 +13,36 @@ a. Install the full version. Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/). -We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building. In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. +We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building for **Linux and Windows systems**. In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands. i. Install the latest version. -The rule for installing the latest ``mmcv-full`` is as follows: +The rule for installing the latest `mmcv-full` is as follows: ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -Please replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired one. For example, -to install the latest ``mmcv-full`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: +Please replace `{cu_version}` and `{torch_version}` in the url to your desired one. For example, +to install the latest `mmcv-full` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html ``` -For more details, please refer the the following tables and delete ``=={mmcv_version}``. +For more details, please refer the the following tables and delete `=={mmcv_version}`. ii. Install a specified version. -The rule for installing a specified ``mmcv-full`` is as follows: +The rule for installing a specified `mmcv-full` is as follows: ```shell pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -First of all, please refer to the Releases and replace ``{mmcv_version}`` a specified one. e.g. ``1.3.9``. -Then replace ``{cu_version}`` and ``{torch_version}`` in the url to your desired versions. For example, -to install ``mmcv-full==1.3.9`` with ``CUDA 11.1`` and ``PyTorch 1.9.0``, use the following command: +First of all, please refer to the Releases and replace `{mmcv_version}` a specified one. e.g. `1.3.9`. +Then replace `{cu_version}` and `{torch_version}` in the url to your desired versions. For example, +to install `mmcv-full==1.3.9` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command: ```shell pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -64,16 +64,28 @@ For more details, please refer the the following tables. CUDA - torch 1.10 - torch 1.9 - torch 1.8 - torch 1.7 - torch 1.6 - torch 1.5 + torch 1.11 + torch 1.10 + torch 1.9 + torch 1.8 + torch 1.7 + torch 1.6 + torch 1.5 + + + 11.5 +
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
+ + + + + + 11.3 -
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
@@ -82,6 +94,7 @@ For more details, please refer the the following tables. 11.1 +
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
@@ -94,13 +107,15 @@ For more details, please refer the the following tables. +
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2 -
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
+
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
install
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.7.0/index.html
@@ -111,6 +126,7 @@ For more details, please refer the the following tables. 10.1 +
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
@@ -121,12 +137,14 @@ For more details, please refer the the following tables. +
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu +
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
install
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
@@ -138,7 +156,11 @@ For more details, please refer the the following tables. ```{note} -The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, if you click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html), you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./docs/get_started/previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. +The pre-built packages provided above do not include all versions of mmcv-full, you can click on the corresponding links to see the supported versions. For example, if you click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html), you can see that `cu102-torch1.8.0` only provides 1.3.0 and above versions of mmcv-full. In addition, We no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4` since v1.3.17. You can find previous versions that compiled with PyTorch 1.3 & 1.4 [here](./previous_versions.md). The compatibility is still ensured in our CI, but we will discard the support of PyTorch 1.3 & 1.4 next year. +``` + +```{note} +mmcv-full does not provide pre-built packages for `cu102-torch1.11` and `cu92-torch*` on Windows. ``` Another way is to compile locally by running diff --git a/docs/get_started/introduction.md b/docs/en/get_started/introduction.md similarity index 62% rename from docs/get_started/introduction.md rename to docs/en/get_started/introduction.md index 4ffb59d2d57cd24c23dd5d9fb0558ab1d66a06a8..9ef6ee99dc400267b1fb465be689e7831a9ca858 100644 --- a/docs/get_started/introduction.md +++ b/docs/en/get_started/introduction.md @@ -3,16 +3,24 @@ MMCV is a foundational library for computer vision research and supports many research projects as below: +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. 
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition and understanding toolbox. - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework. It provides the following functionalities. @@ -24,6 +32,12 @@ It provides the following functionalities. - Various CNN architectures - High-quality implementation of common CUDA ops +It supports the following systems. + +- Linux +- Windows +- macOS + ```{note} MMCV requires Python 3.6+. ``` diff --git a/docs/get_started/previous_versions.md b/docs/en/get_started/previous_versions.md similarity index 93% rename from docs/get_started/previous_versions.md rename to docs/en/get_started/previous_versions.md index c91180d2203dc5cf21c4dccbc4b4e20891879795..a9c3717667fec3e8f338c319413aa6ad639dc6d3 100644 --- a/docs/get_started/previous_versions.md +++ b/docs/en/get_started/previous_versions.md @@ -4,7 +4,7 @@ We no longer provide `mmcv-full` packages compiled under lower versions of `PyTo ### PyTorch 1.4 -| 1.0.0 <= mmcv_version <= 1.2.1 +| 1.0.0 \<= mmcv_version \<= 1.2.1 #### CUDA 10.1 @@ -26,7 +26,7 @@ pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dis ### PyTorch v1.3 -| 1.0.0 <= mmcv_version <= 1.3.16 +| 1.0.0 \<= mmcv_version \<= 1.3.16 #### CUDA 10.1 diff --git a/docs/index.rst b/docs/en/index.rst similarity index 100% rename from docs/index.rst rename to docs/en/index.rst index 6019f107a842107f5e38989df313ca7cc7fe9f9c..bccbc372976a491dabbe90d8c519ec8d5f00850a 100644 --- a/docs/index.rst +++ b/docs/en/index.rst @@ -29,12 +29,12 @@ You can switch between Chinese and English documents in the lower-left corner of :maxdepth: 2 :caption: Deployment + deployment/mmcv_ops_definition.md deployment/onnx.md - deployment/onnxruntime_op.md deployment/onnxruntime_custom_ops.md - deployment/tensorrt_plugin.md + deployment/onnxruntime_op.md deployment/tensorrt_custom_ops.md - deployment/mmcv_ops_definition.md + deployment/tensorrt_plugin.md .. toctree:: :maxdepth: 2 diff --git a/docs/make.bat b/docs/en/make.bat similarity index 100% rename from docs/make.bat rename to docs/en/make.bat diff --git a/docs/mmcv-logo.png b/docs/en/mmcv-logo.png similarity index 100% rename from docs/mmcv-logo.png rename to docs/en/mmcv-logo.png diff --git a/docs/en/understand_mmcv/cnn.md b/docs/en/understand_mmcv/cnn.md new file mode 100644 index 0000000000000000000000000000000000000000..0c401c6b609f093e6bf854c9abdbe78a13b04ac1 --- /dev/null +++ b/docs/en/understand_mmcv/cnn.md @@ -0,0 +1,583 @@ +## CNN + +We provide some building bricks for CNNs, including layer building, module bundles and weight initialization. + +### Layer building + +We may need to try different layers of the same type when running experiments, +but do not want to modify the code from time to time. 
+Here we provide some layer building methods to construct layers from a dict, +which can be written in configs or specified via command line arguments. + +#### Usage + +A simplest example is + +```python +cfg = dict(type='Conv3d') +layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) +``` + +- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d). +- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d). +- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU. +- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle. +- `build_padding_layer`: Supported types are zero, reflect, replicate. + +#### Extension + +We also allow extending the building methods with custom layers and operators. + +1. Write and register your own module. + + ```python + from mmcv.cnn import UPSAMPLE_LAYERS + + @UPSAMPLE_LAYERS.register_module() + class MyUpsample: + + def __init__(self, scale_factor): + pass + + def forward(self, x): + pass + ``` + +2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it. + + ```python + cfg = dict(type='MyUpsample', scale_factor=2) + layer = build_upsample_layer(cfg) + ``` + +### Module bundles + +We also provide common module bundles to facilitate the network construction. +`ConvModule` is a bundle of convolution, normalization and activation layers, +please refer to the [api](api.html#mmcv.cnn.ConvModule) for details. + +```python +# conv + bn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) +# conv + gn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) +# conv + relu +conv = ConvModule(3, 8, 2) +# conv +conv = ConvModule(3, 8, 2, act_cfg=None) +# conv + leaky relu +conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) +# bn + conv + relu +conv = ConvModule( + 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) +``` + +### Weight initialization + +> Implementation details are available at [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py) + +During training, a proper initialization strategy is beneficial to speed up the +training or obtain a higher performance. In MMCV, we provide some commonly used +methods for initializing modules like `nn.Conv2d`. Of course, we also provide +high-level APIs for initializing models containing one or more +modules. + +#### Initialization functions + +Initialize a `nn.Module` such as `nn.Conv2d`, `nn.Linear` in a functional way. + +We provide the following initialization methods. + +- constant_init + + Initialize module parameters with constant values. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import constant_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # constant_init(module, val, bias=0) + >>> constant_init(conv1, 1, 0) + >>> conv1.weight + ``` + +- xavier_init + + Initialize module parameters with values according to the method + described in [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. 
(2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # xavier_init(module, gain=1, bias=0, distribution='normal') + >>> xavier_init(conv1, distribution='normal') + ``` + +- normal_init + + Initialize module parameters with the values drawn from a normal distribution. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import normal_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # normal_init(module, mean=0, std=1, bias=0) + >>> normal_init(conv1, std=0.01, bias=0) + ``` + +- uniform_init + + Initialize module parameters with values drawn from a uniform distribution. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import uniform_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # uniform_init(module, a=0, b=1, bias=0) + >>> uniform_init(conv1, a=0, b=1) + ``` + +- kaiming_init + + Initialize module parameters with the values according to the method + described in [Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. (2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import kaiming_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') + >>> kaiming_init(conv1) + ``` + +- caffe2_xavier_init + + The xavier initialization is implemented in caffe2, which corresponds to `kaiming_uniform_` in PyTorch. + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import caffe2_xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # caffe2_xavier_init(module, bias=0) + >>> caffe2_xavier_init(conv1) + ``` + +- bias_init_with_prob + + Initialize conv/fc bias value according to a given probability, as proposed in [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf). + + ```python + >>> from mmcv.cnn import bias_init_with_prob + >>> # bias_init_with_prob is proposed in Focal Loss + >>> bias = bias_init_with_prob(0.01) + >>> bias + -4.59511985013459 + ``` + +#### Initializers and configs + +On the basis of the initialization methods, we define the corresponding initialization classes and register them to `INITIALIZERS`, so we can +use the configuration to initialize the model. + +We provide the following initialization classes. + +- ConstantInit +- XavierInit +- NormalInit +- UniformInit +- KaimingInit +- Caffe2XavierInit +- PretrainedInit + +Let us introduce the usage of `initialize` in detail. + +1. Initialize model by `layer` key + + If we only define `layer`, it just initialize the layer in `layer` key. + + NOTE: Value of `layer` key is the class name with attributes weights and bias of Pytorch, so `MultiheadAttention layer` is not supported. + +- Define `layer` key for initializing module with same configuration. 
+ + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1, 2) + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # initialize whole module with same configuration + initialize(model, init_cfg) + # model.feat.weight + # Parameter containing: + # tensor([[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]], requires_grad=True) + ``` + +- Define `layer` key for initializing layer with different configurations. + + ```python + import torch.nn as nn + from mmcv.cnn.utils import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1,2) + + model = FooNet() + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d will be initialized with dict(type='Constant', val=1) + # nn.Conv2d will be initialized with dict(type='Constant', val=2) + # nn.Linear will be initialized with dict(type='Constant', val=3) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +2. Initialize model by `override` key + +- When initializing some specific part with its attribute name, we can use `override` key, and the value in `override` will ignore the value in init_cfg. + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + + # if we would like to initialize model's weights as 1 and bias as 2 + # but weight in `reg` as 3 and bias 4, we can use override key + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # self.feat and self.cls will be initialized with dict(type='Constant', val=1, bias=2) + # The module called 'reg' will be initialized with dict(type='Constant', val=3, bias=4) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]], + # ..., + # [[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]]]], requires_grad=True) + ``` + +- If `layer` is None in init_cfg, only sub-module with the name in override will be initialized, and type and other args in override can be omitted. + + ```python + model = FooNet() + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + # self.feat and self.cls will be initialized by Pytorch + # The module called 'reg' will be initialized with dict(type='Constant', val=1, bias=2) + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # ..., + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]]], requires_grad=True) + ``` + +- If we don't define `layer` key or `override` key, it will not initialize anything. 
+ +- Invalid usage + + ```python + # It is invalid that override don't have name key + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # It is also invalid that override has name and other args except type + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. Initialize model with the pretrained model + + ```python + import torch.nn as nn + import torchvision.models as models + from mmcv.cnn import initialize + + # initialize model with pretrained model + model = models.resnet50() + # model.conv1.weight + # Parameter containing: + # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, + # -1.8077e-03, 3.0338e-03], + # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, + # 1.1655e-02, 4.5889e-03], + # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, + # 1.7314e-02, -9.9998e-03], + # ..., + + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + initialize(model, init_cfg) + # model.conv1.weight + # Parameter containing: + # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, + # -4.3034e-02, -7.0755e-02], + # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, + # -2.0912e-02, -3.8517e-02], + # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, + # 6.2641e-02, 5.1977e-02], + # ..., + + # initialize weights of a sub-module with the specific part of a pretrained model by using 'prefix' + model = models.resnet50() + url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + 'retinanet_r50_fpn_1x_coco/'\ + 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + initialize(model, init_cfg) + ``` + +4. Initialize model inherited from BaseModule, Sequential, ModuleList, ModuleDict + + `BaseModule` is inherited from `torch.nn.Module`, and the only different between them is that `BaseModule` implements `init_weights()`. + + `Sequential` is inherited from `BaseModule` and `torch.nn.Sequential`. + + `ModuleList` is inherited from `BaseModule` and `torch.nn.ModuleList`. + + `ModuleDict` is inherited from `BaseModule` and `torch.nn.ModuleDict`. + + ```python + import torch.nn as nn + from mmcv.runner import BaseModule, Sequential, ModuleList, ModuleDict + + class FooConv1d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv1d = nn.Conv1d(4, 1, 4) + + def forward(self, x): + return self.conv1d(x) + + class FooConv2d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv2d = nn.Conv2d(3, 1, 3) + + def forward(self, x): + return self.conv2d(x) + + # BaseModule + init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + model = FooConv1d(init_cfg) + model.init_weights() + # model.conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + + # Sequential + init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) 
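+   # Each submodule keeps the init_cfg passed to it; calling init_weights() on the
+   # wrapping Sequential below applies them (Conv1d: weight 0, bias 1; Conv2d: weight 2, bias 3).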
+ model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + seq_model = Sequential(model1, model2) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + seq_model = Sequential(model1, model2, init_cfg=init_cfg) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleList + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modellist = ModuleList([model1, model2]) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + modellist = ModuleList([model1, model2], init_cfg=init_cfg) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleDict + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modeldict = ModuleDict(dict(model1=model1, model2=model2)) + modeldict.init_weights() + # modeldict['model1'].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modeldict['model2'].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
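+   # The outer init_cfg (val=4., bias=5.) is ignored here because the inner init_cfg
+   # passed to each submodule has higher priority, as the printed weights below show.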
+   modeldict = ModuleDict(dict(model1=model1, model2=model2), init_cfg=init_cfg)
+   modeldict.init_weights()
+   # modeldict['model1'].conv1d.weight
+   # Parameter containing:
+   # tensor([[[0., 0., 0., 0.],
+   #          [0., 0., 0., 0.],
+   #          [0., 0., 0., 0.],
+   #          [0., 0., 0., 0.]]], requires_grad=True)
+   # modeldict['model2'].conv2d.weight
+   # Parameter containing:
+   # tensor([[[[2., 2., 2.],
+   #           [2., 2., 2.],
+   #           [2., 2., 2.]],
+   #          ...,
+   #          [[2., 2., 2.],
+   #           [2., 2., 2.],
+   #           [2., 2., 2.]]]], requires_grad=True)
+   ```
+
+### Model Zoo
+
+Besides torchvision pre-trained models, we also provide pre-trained models of the following CNNs:
+
+- VGG Caffe
+- ResNet Caffe
+- ResNeXt
+- ResNet with Group Normalization
+- ResNet with Group Normalization and Weight Standardization
+- HRNetV2
+- Res2Net
+- RegNet
+
+#### Model URLs in JSON
+
+The model zoo links in MMCV are managed by JSON files.
+The JSON file consists of key-value pairs of model names and their URLs or paths.
+An example JSON file could look like:
+
+```json
+{
+    "model_a": "https://example.com/models/model_a_9e5bac.pth",
+    "model_b": "pretrain/model_b_ab3ef2c.pth"
+}
+```
+
+The default links of the pre-trained models hosted on OpenMMLab AWS can be found [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json).
+
+You may override the default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not found in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path.
+
+The external JSON files will be merged into the default one. If the same key is present in both the external and the default JSON files, the external one will be used.
+
+#### Load Checkpoint
+
+The following types are supported for the `filename` argument of `mmcv.load_checkpoint()`.
+
+- filepath: The filepath of the checkpoint.
+- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename.
+- `torchvision://xxx`: The model links in `torchvision.models`. Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details.
+- `open-mmlab://xxx`: The model links or filepaths provided in the default and additional JSON files.
diff --git a/docs/understand_mmcv/config.md b/docs/en/understand_mmcv/config.md
similarity index 95%
rename from docs/understand_mmcv/config.md
rename to docs/en/understand_mmcv/config.md
index d0b669b8516c0281000a88c1bd41aac731dc8326..9626dbe2c331273995e6e2fbf095461b171101bd 100644
--- a/docs/understand_mmcv/config.md
+++ b/docs/en/understand_mmcv/config.md
@@ -196,5 +196,5 @@ _deprecation_ = dict(
 ```python
 >>> cfg = Config.fromfile('./deprecated_cfg.py')
-UserWarning: The config file deprecated.py will be deprecated in the future. Please use expected_cfg.py instead. More information can be found at https://github.com/open-mmlab/mmcv/pull/1275
+UserWarning: The config file deprecated_cfg.py will be deprecated in the future. Please use expected_cfg.py instead.
More information can be found at https://github.com/open-mmlab/mmcv/pull/1275 ``` diff --git a/docs/understand_mmcv/data_process.md b/docs/en/understand_mmcv/data_process.md similarity index 99% rename from docs/understand_mmcv/data_process.md rename to docs/en/understand_mmcv/data_process.md index 79e9281b6c88c907e6edfc6d03f73930b2cd51ef..94a4c5431fe6237220cf2d99af1894dd06961d1e 100644 --- a/docs/understand_mmcv/data_process.md +++ b/docs/en/understand_mmcv/data_process.md @@ -232,7 +232,7 @@ mmcv.resize_video('test.mp4', 'resized2.mp4', ratio=2) - IO - Visualization -- Flow warpping +- Flow warping We provide two options to dump optical flow files: uncompressed and compressed. The uncompressed way just dumps the floating numbers to a binary file. It is diff --git a/docs/understand_mmcv/io.md b/docs/en/understand_mmcv/io.md similarity index 98% rename from docs/understand_mmcv/io.md rename to docs/en/understand_mmcv/io.md index f6c28dd425cb0bcc54ca5d92a3a3849103f47e2a..64fbc8b8e60841f8de74235e17a6b42566cf912d 100644 --- a/docs/understand_mmcv/io.md +++ b/docs/en/understand_mmcv/io.md @@ -195,8 +195,8 @@ disk_backend = HardDiskBackend() with io.BytesIO(disk_backend.get(filepath1)) as buffer: checkpoint = torch.load(buffer) with io.BytesIO() as buffer: - torch.save(checkpoint, f) - disk_backend.put(f.getvalue(), filepath2) + torch.save(checkpoint, buffer) + disk_backend.put(buffer.getvalue(), filepath2) ``` If we want to implement an interface which automatically select the corresponding diff --git a/docs/en/understand_mmcv/ops.md b/docs/en/understand_mmcv/ops.md new file mode 100644 index 0000000000000000000000000000000000000000..127f92bf959ab89725332a43ebe8630625c4c5ab --- /dev/null +++ b/docs/en/understand_mmcv/ops.md @@ -0,0 +1,60 @@ +## ops + +We implement common ops used in detection, segmentation, etc. 
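+
+As a quick usage sketch (illustrative only: the boxes and scores below are made-up values, and `mmcv-full` must be installed for `mmcv.ops` to be importable), an operator such as NMS can be called directly:
+
+```python
+import torch
+from mmcv.ops import nms
+
+boxes = torch.tensor([[10., 10., 50., 50.],
+                      [12., 12., 52., 52.],
+                      [100., 100., 140., 140.]])
+scores = torch.tensor([0.9, 0.8, 0.7])
+# suppress boxes whose overlap with a higher-scoring box exceeds the IoU threshold
+dets, keep_inds = nms(boxes, scores, iou_threshold=0.5)
+# dets holds the kept boxes with their scores appended; keep_inds are their indices
+```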
+ +| Device | CPU | CUDA | MLU | MPS | +| ---------------------------- | --- | ---- | --- | --- | +| ActiveRotatedFilter | √ | √ | | | +| AssignScoreWithK | | √ | | | +| BallQuery | | √ | | | +| BBoxOverlaps | | √ | √ | √ | +| BorderAlign | | √ | | | +| BoxIouRotated | √ | √ | | | +| CARAFE | | √ | | | +| ChamferDistance | | √ | | | +| CrissCrossAttention | | √ | | | +| ContourExpand | √ | | | | +| ConvexIoU | | √ | | | +| CornerPool | | √ | | | +| Correlation | | √ | | | +| Deformable Convolution v1/v2 | √ | √ | | | +| Deformable RoIPool | | √ | | | +| DiffIoURotated | | √ | | | +| DynamicScatter | | √ | | | +| FurthestPointSample | | √ | | | +| FurthestPointSampleWithDist | | √ | | | +| FusedBiasLeakyrelu | | √ | | | +| GatherPoints | | √ | | | +| GroupPoints | | √ | | | +| Iou3d | | √ | | | +| KNN | | √ | | | +| MaskedConv | | √ | | | +| MergeCells | | √ | | | +| MinAreaPolygon | | √ | | | +| ModulatedDeformConv2d | √ | √ | | | +| MultiScaleDeformableAttn | | √ | | | +| NMS | √ | √ | √ | | +| NMSRotated | √ | √ | | | +| PixelGroup | √ | | | | +| PointsInBoxes | √ | √ | | | +| PointsInPolygons | | √ | | | +| PSAMask | √ | √ | √ | | +| RotatedFeatureAlign | √ | √ | | | +| RoIPointPool3d | | √ | | | +| RoIPool | | √ | √ | | +| RoIAlignRotated | √ | √ | √ | | +| RiRoIAlignRotated | | √ | | | +| RoIAlign | √ | √ | √ | | +| RoIAwarePool3d | | √ | | | +| SAConv2d | | √ | | | +| SigmoidFocalLoss | | √ | √ | | +| SoftmaxFocalLoss | | √ | | | +| SoftNMS | | √ | | | +| Sparse Convolution | | √ | | | +| Synchronized BatchNorm | | √ | | | +| ThreeInterpolate | | √ | | | +| ThreeNN | | √ | | | +| TINShift | | √ | √ | | +| UpFirDn2d | | √ | | | +| Voxelization | √ | √ | | | +| PrRoIPool | | √ | | | diff --git a/docs/understand_mmcv/registry.md b/docs/en/understand_mmcv/registry.md similarity index 74% rename from docs/understand_mmcv/registry.md rename to docs/en/understand_mmcv/registry.md index 2cf10819fea6ac81645cc127c6b7aea54af19d5f..824e0295a4cd16870002ce9098ad46ddc76adbb9 100644 --- a/docs/understand_mmcv/registry.md +++ b/docs/en/understand_mmcv/registry.md @@ -3,11 +3,15 @@ MMCV implements [registry](https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/registry.py) to manage different modules that share similar functionalities, e.g., backbones, head, and necks, in detectors. Most projects in OpenMMLab use registry to manage modules of datasets and models, such as [MMDetection](https://github.com/open-mmlab/mmdetection), [MMDetection3D](https://github.com/open-mmlab/mmdetection3d), [MMClassification](https://github.com/open-mmlab/mmclassification), [MMEditing](https://github.com/open-mmlab/mmediting), etc. +```{note} +In v1.5.1 and later, the Registry supports registering functions and calling them. +``` + ### What is registry -In MMCV, registry can be regarded as a mapping that maps a class to a string. -These classes contained by a single registry usually have similar APIs but implement different algorithms or support different datasets. -With the registry, users can find and instantiate the class through its corresponding string, and use the instantiated module as they want. +In MMCV, registry can be regarded as a mapping that maps a class or function to a string. +These classes or functions contained by a single registry usually have similar APIs but implement different algorithms or support different datasets. 
+With the registry, users can find the class or function through its corresponding string, and instantiate the corresponding module or call the function to obtain the result as needed.
 One typical example is the config systems in most OpenMMLab projects, which use the registry to create hooks, runners, models, and datasets, through configs.
 The API reference could be found [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.Registry).
@@ -17,7 +21,7 @@ To manage your modules in the codebase by `Registry`, there are three steps as b
 2. Create a registry.
 3. Use this registry to manage the modules.
-`build_func` argument of `Registry` is to customize how to instantiate the class instance, the default one is `build_from_cfg` implemented [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg).
+The `build_func` argument of `Registry` customizes how the class instance is instantiated or how the function is called to obtain the result; the default one is `build_from_cfg`, implemented [here](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg).
 ### A Simple Example
@@ -31,10 +35,10 @@ In the package, we first create a file to implement builders, named `converters/
 ```python
 from mmcv.utils import Registry
 # create a registry for converters
-CONVERTERS = Registry('converter')
+CONVERTERS = Registry('converters')
 ```
-Then we can implement different converters in the package. For example, implement `Converter1` in `converters/converter1.py`
+Then we can implement different converters in the package, which can be either classes or functions. For example, implement `Converter1` in `converters/converter1.py` and `converter2` in `converters/converter2.py`.
 ```python
@@ -48,18 +52,38 @@ class Converter1(object):
         self.b = b
 ```
+```python
+# converter2.py
+from .builder import CONVERTERS
+from .converter1 import Converter1
+
+# use the registry to manage the module
+@CONVERTERS.register_module()
+def converter2(a, b):
+    return Converter1(a, b)
+```
+
 The key step to use registry for managing the modules is to register the implemented module into the registry `CONVERTERS` through
-`@CONVERTERS.register_module()` when you are creating the module. By this way, a mapping between a string and the class is built and maintained by `CONVERTERS` as below
+`@CONVERTERS.register_module()` when you are creating the module. In this way, a mapping between a string and the class (function) is built and maintained by `CONVERTERS` as below
 ```python
 'Converter1' -> <class 'Converter1'>
+'converter2' -> <function converter2>
+```
+
+```{note}
+The registry mechanism will be triggered only when the file where the module is located is imported.
+So you need to import that file somewhere. More details can be found at https://github.com/open-mmlab/mmdetection/issues/5974.
 ```
 If the module is successfully registered, you can use this converter through configs as
 ```python
-converter_cfg = dict(type='Converter1', a=a_value, b=b_value)
-converter = CONVERTERS.build(converter_cfg)
+converter1_cfg = dict(type='Converter1', a=a_value, b=b_value)
+converter2_cfg = dict(type='converter2', a=a_value, b=b_value)
+converter1 = CONVERTERS.build(converter1_cfg)
+# returns the calling result
+result = CONVERTERS.build(converter2_cfg)
 ```
 ### Customize Build Function
@@ -88,7 +112,7 @@ CONVERTERS = Registry('converter', build_func=build_converter)
 ```{note}
 In this example, we demonstrate how to use the `build_func` argument to customize the way to build a class instance.
The functionality is similar to the default `build_from_cfg`. In most cases, default one would be sufficient. -`build_model_from_cfg` is also implemented to build PyTorch module in `nn.Sequentail`, you may directly use them instead of implementing by yourself. +`build_model_from_cfg` is also implemented to build PyTorch module in `nn.Sequential`, you may directly use them instead of implementing by yourself. ``` ### Hierarchy Registry diff --git a/docs/understand_mmcv/runner.md b/docs/en/understand_mmcv/runner.md similarity index 88% rename from docs/understand_mmcv/runner.md rename to docs/en/understand_mmcv/runner.md index 2e6e3868335d92f94e98441a5c7ec6d0b92a960b..eeeb859ee82534632365c98b2e6e4370da2b955b 100644 --- a/docs/understand_mmcv/runner.md +++ b/docs/en/understand_mmcv/runner.md @@ -8,7 +8,7 @@ The runner class is designed to manage the training. It eases the training proce ### EpochBasedRunner -As its name indicates, workflow in `EpochBasedRunner` should be set based on epochs. For example, [('train', 2), ('val', 1)] means running 2 epochs for training and 1 epoch for validation, iteratively. And each epoch may contain multiple iterations. Currently, MMDetection uses `EpochBasedRunner` by default. +As its name indicates, workflow in `EpochBasedRunner` should be set based on epochs. For example, \[('train', 2), ('val', 1)\] means running 2 epochs for training and 1 epoch for validation, iteratively. And each epoch may contain multiple iterations. Currently, MMDetection uses `EpochBasedRunner` by default. Let's take a look at its core logic: @@ -44,7 +44,7 @@ def train(self, data_loader, **kwargs): ### IterBasedRunner -Different from `EpochBasedRunner`, workflow in `IterBasedRunner` should be set based on iterations. For example, [('train', 2), ('val', 1)] means running 2 iters for training and 1 iter for validation, iteratively. Currently, MMSegmentation uses `IterBasedRunner` by default. +Different from `EpochBasedRunner`, workflow in `IterBasedRunner` should be set based on iterations. For example, \[('train', 2), ('val', 1)\] means running 2 iters for training and 1 iter for validation, iteratively. Currently, MMSegmentation uses `IterBasedRunner` by default. Let's take a look at its core logic: @@ -156,8 +156,8 @@ runner.run(data_loaders, cfg.workflow) Let's take `EpochBasedRunner` for example and go a little bit into details about setting workflow: -- Say we only want to put train in the workflow, then we can set: workflow = [('train', 1)]. The runner will only execute train iteratively in this case. -- Say we want to put both train and val in the workflow, then we can set: workflow = [('train', 3), ('val',1)]. The runner will first execute train for 3 epochs and then switch to val mode and execute val for 1 epoch. The workflow will be repeated until the current epoch hit the max_epochs. -- Workflow is highly flexible. Therefore, you can set workflow = [('val', 1), ('train',1)] if you would like the runner to validate first and train after. +- Say we only want to put train in the workflow, then we can set: workflow = \[('train', 1)\]. The runner will only execute train iteratively in this case. +- Say we want to put both train and val in the workflow, then we can set: workflow = \[('train', 3), ('val',1)\]. The runner will first execute train for 3 epochs and then switch to val mode and execute val for 1 epoch. The workflow will be repeated until the current epoch hit the max_epochs. +- Workflow is highly flexible. 
Therefore, you can set workflow = \[('val', 1), ('train',1)\] if you would like the runner to validate first and train after. The code we demonstrated above is already in `train.py` in MM repositories. Simply modify the corresponding keys in the configuration files and the script will execute the expected workflow automatically. diff --git a/docs/understand_mmcv/utils.md b/docs/en/understand_mmcv/utils.md similarity index 100% rename from docs/understand_mmcv/utils.md rename to docs/en/understand_mmcv/utils.md diff --git a/docs/understand_mmcv/visualization.md b/docs/en/understand_mmcv/visualization.md similarity index 100% rename from docs/understand_mmcv/visualization.md rename to docs/en/understand_mmcv/visualization.md diff --git a/docs/faq.md b/docs/faq.md deleted file mode 100644 index ab0dd135f946c63f6dc3d08e2b6ca2f6837c7437..0000000000000000000000000000000000000000 --- a/docs/faq.md +++ /dev/null @@ -1,42 +0,0 @@ -## Frequently Asked Questions - -We list some common troubles faced by many users and their corresponding solutions here. -Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. - -- Compatibility issue between MMCV and MMDetection; "ConvWS is already registered in conv layer" - - Please install the correct version of MMCV for the version of your MMDetection following the instruction above. - -- "No module named 'mmcv.ops'"; "No module named 'mmcv._ext'". - - 1. Uninstall existing mmcv in the environment using `pip uninstall mmcv`. - 2. Install mmcv-full following the instruction above. - -- "invalid device function" or "no kernel image is available for execution". - - 1. Check the CUDA compute capability of you GPU. - 2. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, - and MMCV are built for the correct GPU architecture. - You may need to set `TORCH_CUDA_ARCH_LIST` to reinstall MMCV. - The compatibility issue could happen when using old GPUS, e.g., Tesla K80 (3.7) on colab. - 3. Check whether the running environment is the same as that when mmcv/mmdet is compiled. - For example, you may compile mmcv using CUDA 10.0 bug run it on CUDA9.0 environments. - -- "undefined symbol" or "cannot open xxx.so". - - 1. If those symbols are CUDA/C++ symbols (e.g., libcudart.so or GLIBCXX), check - whether the CUDA/GCC runtimes are the same as those used for compiling mmcv. - 2. If those symbols are Pytorch symbols (e.g., symbols containing caffe, aten, and TH), check whether - the Pytorch version is the same as that used for compiling mmcv. - 3. Run `python mmdet/utils/collect_env.py` to check whether PyTorch, torchvision, - and MMCV are built by and running on the same environment. - -- "RuntimeError: CUDA error: invalid configuration argument". - - This error may be due to your poor GPU. Try to decrease the value of [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) - and recompile mmcv. - -- "RuntimeError: nms is not compiled with GPU support". - - This error is because your CUDA environment is not installed correctly. - You may try to re-install your CUDA environment and then delete the build/ folder before re-compile mmcv. 
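
To make the workflow semantics described in the runner documentation above concrete, here is a small self-contained sketch. It is an illustration only, not the actual `EpochBasedRunner` implementation; the real runner dispatches each phase to its `train()`/`val()` methods with the corresponding data loader.

```python
# Illustration only: a simplified model of how an epoch-based workflow is walked.
workflow = [('train', 3), ('val', 1)]  # 3 training epochs, then 1 validation epoch, repeated
max_epochs = 12

epoch = 0
while epoch < max_epochs:
    for mode, n_epochs in workflow:
        for _ in range(n_epochs):
            if mode == 'train' and epoch >= max_epochs:
                break
            print(f'{mode} phase (train epochs completed: {epoch})')
            if mode == 'train':
                epoch += 1
```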
diff --git a/docs/understand_mmcv/cnn.md b/docs/understand_mmcv/cnn.md deleted file mode 100644 index 749cb951131efe5c9ec4c59ef05b90243913df68..0000000000000000000000000000000000000000 --- a/docs/understand_mmcv/cnn.md +++ /dev/null @@ -1,538 +0,0 @@ -## CNN - -We provide some building bricks for CNNs, including layer building, module bundles and weight initialization. - -### Layer building - -We may need to try different layers of the same type when running experiments, -but do not want to modify the code from time to time. -Here we provide some layer building methods to construct layers from a dict, -which can be written in configs or specified via command line arguments. - -#### Usage - -A simplest example is - -```python -cfg = dict(type='Conv3d') -layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) -``` - -- `build_conv_layer`: Supported types are Conv1d, Conv2d, Conv3d, Conv (alias for Conv2d). -- `build_norm_layer`: Supported types are BN1d, BN2d, BN3d, BN (alias for BN2d), SyncBN, GN, LN, IN1d, IN2d, IN3d, IN (alias for IN2d). -- `build_activation_layer`: Supported types are ReLU, LeakyReLU, PReLU, RReLU, ReLU6, ELU, Sigmoid, Tanh, GELU. -- `build_upsample_layer`: Supported types are nearest, bilinear, deconv, pixel_shuffle. -- `build_padding_layer`: Supported types are zero, reflect, replicate. - -#### Extension - -We also allow extending the building methods with custom layers and operators. - -1. Write and register your own module. - - ```python - from mmcv.cnn import UPSAMPLE_LAYERS - - @UPSAMPLE_LAYERS.register_module() - class MyUpsample: - - def __init__(self, scale_factor): - pass - - def forward(self, x): - pass - ``` - -2. Import `MyUpsample` somewhere (e.g., in `__init__.py`) and then use it. - - ```python - cfg = dict(type='MyUpsample', scale_factor=2) - layer = build_upsample_layer(cfg) - ``` - -### Module bundles - -We also provide common module bundles to facilitate the network construction. -`ConvModule` is a bundle of convolution, normalization and activation layers, -please refer to the [api](api.html#mmcv.cnn.ConvModule) for details. - -```python -# conv + bn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) -# conv + gn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) -# conv + relu -conv = ConvModule(3, 8, 2) -# conv -conv = ConvModule(3, 8, 2, act_cfg=None) -# conv + leaky relu -conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) -# bn + conv + relu -conv = ConvModule( - 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) -``` - -### Weight initialization - -> Implementation details are available at [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py) - -During training, a proper initialization strategy is beneficial to speed up the -training or obtain a higher performance. In MMCV, we provide some commonly used -methods for initializing modules like `nn.Conv2d`. Of course, we also provide -high-level APIs for initializing models containing one or more -modules. - -#### Initialization functions - -Initialize a `nn.Module` such as `nn.Conv2d`, `nn.Linear` in a functional way. - -We provide the following initialization methods. - -- constant_init - - Initialize module parameters with constant values. 
- - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import constant_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # constant_init(module, val, bias=0) - >>> constant_init(conv1, 1, 0) - >>> conv1.weight - ``` - -- xavier_init - - Initialize module parameters with values according to the method - described in [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # xavier_init(module, gain=1, bias=0, distribution='normal') - >>> xavier_init(conv1, distribution='normal') - ``` - -- normal_init - - Initialize module parameters with the values drawn from a normal distribution. - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import normal_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # normal_init(module, mean=0, std=1, bias=0) - >>> normal_init(conv1, std=0.01, bias=0) - ``` - -- uniform_init - - Initialize module parameters with values drawn from a uniform distribution. - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import uniform_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # uniform_init(module, a=0, b=1, bias=0) - >>> uniform_init(conv1, a=0, b=1) - ``` - -- kaiming_init - - Initialize module parameters with the values according to the method - described in [Delving deep into rectifiers: Surpassing human-level - performance on ImageNet classification - He, K. et al. (2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import kaiming_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') - >>> kaiming_init(conv1) - ``` - -- caffe2_xavier_init - - The xavier initialization is implemented in caffe2, which corresponds to `kaiming_uniform_` in PyTorch. - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import caffe2_xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # caffe2_xavier_init(module, bias=0) - >>> caffe2_xavier_init(conv1) - ``` - -- bias_init_with_prob - - Initialize conv/fc bias value according to a given probability, as proposed in [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf). - - ```python - >>> from mmcv.cnn import bias_init_with_prob - >>> # bias_init_with_prob is proposed in Focal Loss - >>> bias = bias_init_with_prob(0.01) - >>> bias - -4.59511985013459 - ``` - -#### Initializers and configs - -On the basis of the initialization methods, we define the corresponding initialization classes and register them to `INITIALIZERS`, so we can -use the configuration to initialize the model. - -We provide the following initialization classes. - -- ConstantInit -- XavierInit -- NormalInit -- UniformInit -- KaimingInit -- Caffe2XavierInit -- PretrainedInit - -Let us introduce the usage of `initialize` in detail. - -1. Initialize model by `layer` key - - If we only define `layer`, it just initialize the layer in `layer` key. - - NOTE: Value of `layer` key is the class name with attributes weights and bias of Pytorch, so `MultiheadAttention layer` is not supported. - -- Define `layer` key for initializing module with same configuration. 
- - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1, 2) - - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) - # initialize whole module with same configuration - initialize(model, init_cfg) - # model.feat.weight - # Parameter containing: - # tensor([[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]], requires_grad=True) - ``` - -- Define `layer` key for initializing layer with different configurations. - - ```python - import torch.nn as nn - from mmcv.cnn.utils import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1,2) - - model = FooNet() - init_cfg = [dict(type='Constant', layer='Conv1d', val=1), - dict(type='Constant', layer='Conv2d', val=2), - dict(type='Constant', layer='Linear', val=3)] - # nn.Conv1d will be initialized with dict(type='Constant', val=1) - # nn.Conv2d will be initialized with dict(type='Constant', val=2) - # nn.Linear will be initialized with dict(type='Constant', val=3) - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ``` - -2. Initialize model by `override` key - -- When initializing some specific part with its attribute name, we can use `override` key, and the value in `override` will ignore the value in init_cfg. - - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) - - # if we would like to initialize model's weights as 1 and bias as 2 - # but weight in `cls` as 3 and bias 4, we can use override key - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, - override=dict(type='Constant', name='reg', val=3, bias=4)) - # self.feat and self.cls will be initialized with dict(type='Constant', val=1, bias=2) - # The module called 'reg' will be initialized with dict(type='Constant', val=3, bias=4) - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]], - # ..., - # [[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]]]], requires_grad=True) - ``` - -- If `layer` is None in init_cfg, only sub-module with the name in override will be initialized, and type and other args in override can be omitted. - - ```python - model = FooNet() - init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) - # self.feat and self.cls will be initialized by Pytorch - # The module called 'reg' will be initialized with dict(type='Constant', val=1, bias=2) - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]], - # ..., - # [[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]]], requires_grad=True) - ``` - -- If we don't define `layer` key or `override` key, it will not initialize anything. 
- -- Invalid usage - - ```python - # It is invalid that override don't have name key - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(type='Constant', val=3, bias=4)) - - # It is also invalid that override has name and other args except type - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(name='reg', val=3, bias=4)) - ``` - -3. Initialize model with the pretrained model - - ```python - import torch.nn as nn - import torchvision.models as models - from mmcv.cnn import initialize - - # initialize model with pretrained model - model = models.resnet50() - # model.conv1.weight - # Parameter containing: - # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, - # -1.8077e-03, 3.0338e-03], - # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, - # 1.1655e-02, 4.5889e-03], - # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, - # 1.7314e-02, -9.9998e-03], - # ..., - - init_cfg = dict(type='Pretrained', - checkpoint='torchvision://resnet50') - initialize(model, init_cfg) - # model.conv1.weight - # Parameter containing: - # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, - # -4.3034e-02, -7.0755e-02], - # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, - # -2.0912e-02, -3.8517e-02], - # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, - # 6.2641e-02, 5.1977e-02], - # ..., - - # initialize weights of a sub-module with the specific part of a pretrained model by using 'prefix' - model = models.resnet50() - url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ - 'retinanet_r50_fpn_1x_coco/'\ - 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' - init_cfg = dict(type='Pretrained', - checkpoint=url, prefix='backbone.') - initialize(model, init_cfg) - ``` - -4. Initialize model inherited from BaseModule, Sequential, ModuleList - - `BaseModule` is inherited from `torch.nn.Module`, and the only different between them is that `BaseModule` implements `init_weight`. - - `Sequential` is inherited from `BaseModule` and `torch.nn.Sequential`. - - `ModuleList` is inherited from `BaseModule` and `torch.nn.ModuleList`. - - `````python - import torch.nn as nn - from mmcv.runner import BaseModule, Sequential, ModuleList - - class FooConv1d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv1d = nn.Conv1d(4, 1, 4) - - def forward(self, x): - return self.conv1d(x) - - class FooConv2d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv2d = nn.Conv2d(3, 1, 3) - - def forward(self, x): - return self.conv2d(x) - - # BaseModule - init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - model = FooConv1d(init_cfg) - model.init_weights() - # model.conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - - # Sequential - init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) 
- model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - seq_model = Sequential(model1, model2) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) - seq_model = Sequential(model1, model2, init_cfg=init_cfg) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # ModuleList - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - modellist = ModuleList([model1, model2]) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) - modellist = ModuleList([model1, model2], init_cfg=init_cfg) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ````` - -### Model Zoo - -Besides torchvision pre-trained models, we also provide pre-trained models of following CNN: - -- VGG Caffe -- ResNet Caffe -- ResNeXt -- ResNet with Group Normalization -- ResNet with Group Normalization and Weight Standardization -- HRNetV2 -- Res2Net -- RegNet - -#### Model URLs in JSON - -The model zoo links in MMCV are managed by JSON files. -The json file consists of key-value pair of model name and its url or path. -An example json file could be like: - -```json -{ - "model_a": "https://example.com/models/model_a_9e5bac.pth", - "model_b": "pretrain/model_b_ab3ef2c.pth" -} -``` - -The default links of the pre-trained models hosted on OpenMMLab AWS could be found [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json). - -You may override default links by putting `open-mmlab.json` under `MMCV_HOME`. If `MMCV_HOME` is not find in the environment, `~/.cache/mmcv` will be used by default. You may `export MMCV_HOME=/your/path` to use your own path. - -The external json files will be merged into default one. If the same key presents in both external json and default json, the external one will be used. 
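For example, a rough sketch of overriding a default link locally could look like the following (the `MMCV_HOME` path and the model key/checkpoint path are made-up placeholders; the merge-and-override behaviour is the one described above).

```python
import json
import os

# Use a custom MMCV_HOME (placeholder path) before mmcv resolves model links
os.environ['MMCV_HOME'] = os.path.expanduser('~/custom_mmcv_home')
os.makedirs(os.environ['MMCV_HOME'], exist_ok=True)

# Keys in this external file are merged into the default json;
# on a key collision, the value written here wins.
custom_links = {'model_a': '/data/checkpoints/model_a_local.pth'}
with open(os.path.join(os.environ['MMCV_HOME'], 'open-mmlab.json'), 'w') as f:
    json.dump(custom_links, f)
```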
- -#### Load Checkpoint - -The following types are supported for `filename` argument of `mmcv.load_checkpoint()`. - -- filepath: The filepath of the checkpoint. -- `http://xxx` and `https://xxx`: The link to download the checkpoint. The `SHA256` postfix should be contained in the filename. -- `torchvision://xxx`: The model links in `torchvision.models`.Please refer to [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) for details. -- `open-mmlab://xxx`: The model links or filepath provided in default and additional json files. diff --git a/docs/understand_mmcv/ops.md b/docs/understand_mmcv/ops.md deleted file mode 100644 index 2729e441c1318ca2850c21bf72df428910657f31..0000000000000000000000000000000000000000 --- a/docs/understand_mmcv/ops.md +++ /dev/null @@ -1,37 +0,0 @@ -## CUDA ops - -We implement common CUDA ops used in detection, segmentation, etc. - -- AssignScoreWithK -- BallQuery -- BBoxOverlaps -- CARAFE -- CrissCrossAttention -- ContextBlock -- CornerPool -- Deformable Convolution v1/v2 -- Deformable RoIPool -- DynamicScatter -- GatherPoints -- FurthestPointSample -- FurthestPointSampleWithDist -- GeneralizedAttention -- GroupPoints -- KNN -- MaskedConv -- NMS -- PSAMask -- RoIPointPool3d -- RoIPool -- RoIAlign -- RoIAwarePool3d -- SimpleRoIAlign -- SigmoidFocalLoss -- SoftmaxFocalLoss -- SoftNMS -- Synchronized BatchNorm -- Voxelization -- ThreeInterpolate -- ThreeNN -- Weight standardization -- Correlation diff --git a/docs_zh_CN/Makefile b/docs/zh_cn/Makefile similarity index 100% rename from docs_zh_CN/Makefile rename to docs/zh_cn/Makefile diff --git a/docs_zh_CN/_static/css/readthedocs.css b/docs/zh_cn/_static/css/readthedocs.css similarity index 100% rename from docs_zh_CN/_static/css/readthedocs.css rename to docs/zh_cn/_static/css/readthedocs.css diff --git a/docs_zh_CN/_static/image/mmcv-logo.png b/docs/zh_cn/_static/image/mmcv-logo.png similarity index 100% rename from docs_zh_CN/_static/image/mmcv-logo.png rename to docs/zh_cn/_static/image/mmcv-logo.png diff --git a/docs_zh_CN/api.rst b/docs/zh_cn/api.rst similarity index 90% rename from docs_zh_CN/api.rst rename to docs/zh_cn/api.rst index 8ca9118c3b033f1b7311ec3c1533ce9c93fa1aa2..5d3e623037e3fb102f8c927ff5909d478a46cab9 100644 --- a/docs_zh_CN/api.rst +++ b/docs/zh_cn/api.rst @@ -38,6 +38,11 @@ runner .. automodule:: mmcv.runner :members: +engine +------ +.. automodule:: mmcv.engine + :members: + ops ------ .. automodule:: mmcv.ops diff --git a/docs_zh_CN/community/contributing.md b/docs/zh_cn/community/contributing.md similarity index 68% rename from docs_zh_CN/community/contributing.md rename to docs/zh_cn/community/contributing.md index 30bac8738bee8db306287c6b245b3115464e64da..b7bc1d22d9bb52875b37a15ea1bb3eea1e61c027 100644 --- a/docs_zh_CN/community/contributing.md +++ b/docs/zh_cn/community/contributing.md @@ -7,7 +7,9 @@ - 添加新功能和新组件 ### 工作流 + | 详细工作流见 [拉取请求](pr.md) + 1. 复刻并拉取最新的 OpenMMLab 算法库 2. 创建新的分支(不建议使用主分支提拉取请求) 3. 
提交你的修改 @@ -16,16 +18,18 @@ ```{note} 如果你计划添加新功能并且该功能包含比较大的改动,建议先开 issue 讨论 ``` + ### 代码风格 #### Python [PEP8](https://www.python.org/dev/peps/pep-0008/) 作为 OpenMMLab 算法库首选的代码规范,我们使用以下工具检查和格式化代码 -- [flake8](http://flake8.pycqa.org/en/latest/): Python 官方发布的代码规范检查工具,是多个检查工具的封装 -- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具 +- [flake8](https://github.com/PyCQA/flake8): Python 官方发布的代码规范检查工具,是多个检查工具的封装 - [isort](https://github.com/timothycrosley/isort): 自动调整模块导入顺序的工具 -- [markdownlint](https://github.com/markdownlint/markdownlint): 检查 markdown 文件的工具 +- [yapf](https://github.com/google/yapf): Google 发布的代码规范检查工具 +- [codespell](https://github.com/codespell-project/codespell): 检查单词拼写是否有误 +- [mdformat](https://github.com/executablebooks/mdformat): 检查 markdown 文件的工具 - [docformatter](https://github.com/myint/docformatter): 格式化 docstring 的工具 yapf 和 isort 的配置可以在 [setup.cfg](./setup.cfg) 找到 @@ -46,23 +50,7 @@ pip install -U pre-commit pre-commit install ``` -如果安装 markdownlint 遇到了问题,可以尝试使用以下的步骤安装 ruby - -```shell -# install rvm -curl -L https://get.rvm.io | bash -s -- --autolibs=read-fail -[[ -s "$HOME/.rvm/scripts/rvm" ]] && source "$HOME/.rvm/scripts/rvm" -rvm autolibs disable - -# install ruby -rvm install 2.7.1 -``` - -或者参考 [这个代码库](https://github.com/innerlee/setup) 和 [`zzruby.sh`](https://github.com/innerlee/setup/blob/master/zzruby.sh)。 - -至此,每一次 commit 修改都会触发 pre-commit 检查代码格式。 - ->提交拉取请求前,请确保你的代码符合 yapf 的格式 +> 提交拉取请求前,请确保你的代码符合 yapf 的格式 #### C++ and CUDA diff --git a/docs/zh_cn/community/pr.md b/docs/zh_cn/community/pr.md new file mode 100644 index 0000000000000000000000000000000000000000..720f38986320bb94be67165ddb2dea2f04f659c9 --- /dev/null +++ b/docs/zh_cn/community/pr.md @@ -0,0 +1,114 @@ +## 拉取请求 + +### 什么是拉取请求? + +`拉取请求` (Pull Request), [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests)定义如下。 + +``` +拉取请求是一种通知机制。你修改了他人的代码,将你的修改通知原来作者,希望他合并你的修改。 +``` + +### 基本的工作流: + +1. 获取最新的代码库 +2. 从主分支创建最新的分支进行开发 +3. 提交修改 +4. 推送你的修改并创建一个 `拉取请求` +5. 讨论、审核代码 +6. 将开发分支合并到主分支 + +### 具体步骤 + +#### 1. 获取最新的代码库 + +- 当你第一次提 PR 时 + + 复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮即可 + ![avatar](../../en/_static/community/1.png) + + 克隆复刻的代码库到本地 + + ```bash + git clone git@github.com:XXX/mmcv.git + ``` + + 添加原代码库为上游代码库 + + ```bash + git remote add upstream git@github.com:open-mmlab/mmcv + ``` + +- 从第二个 PR 起 + + 检出本地代码库的主分支,然后从最新的原代码库的主分支拉取更新 + + ```bash + git checkout master + git pull upstream master + ``` + +#### 2. 从主分支创建一个新的开发分支 + +```bash +git checkout -b branchname +``` + +```{tip} +为了保证提交历史清晰可读,我们强烈推荐您先检出主分支 (master),再创建新的分支。 +``` + +#### 3. 提交你的修改 + +```bash +# coding +git add [files] +git commit -m 'messages' +``` + +#### 4. 推送你的修改到复刻的代码库,并创建一个`拉取请求` + +- 推送当前分支到远端复刻的代码库 + + ```bash + git push origin branchname + ``` + +- 创建一个`拉取请求` + ![avatar](../../en/_static/community/2.png) + +- 修改`拉取请求`信息模板,描述修改原因和修改内容。还可以在 PR 描述中,手动关联到相关的`议题` (issue),(更多细节,请参考[官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。 + +#### 5. 讨论并评审你的代码 + +- 创建`拉取请求`时,可以关联给相关人员进行评审 + ![avatar](../../en/_static/community/3.png) + +- 根据评审人员的意见修改代码,并推送修改 + +#### 6. `拉取请求`合并之后删除该分支 + +```bash +git branch -d branchname # delete local branch +git push origin --delete branchname # delete remote branch +``` + +### PR 规范 + +1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 + +2. 一个 PR 对应一个短期分支 + +3. 
粒度要细,一个PR只做一件事情,避免超大的PR + + - Bad:实现 Faster R-CNN + - Acceptable:给 Faster R-CNN 添加一个 box head + - Good:给 box head 增加一个参数来支持自定义的 conv 层数 + +4. 每次 Commit 时需要提供清晰且有意义 commit 信息 + +5. 提供清晰且有意义的`拉取请求`描述 + + - 标题写明白任务名称,一般格式:\[Prefix\] Short description of the pull request (Suffix) + - prefix: 新增功能 \[Feature\], 修 bug \[Fix\], 文档相关 \[Docs\], 开发中 \[WIP\] (暂时不会被review) + - 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 + - 关联相关的`议题` (issue) 和其他`拉取请求` diff --git a/docs_zh_CN/compatibility.md b/docs/zh_cn/compatibility.md similarity index 100% rename from docs_zh_CN/compatibility.md rename to docs/zh_cn/compatibility.md diff --git a/docs/conf.py b/docs/zh_cn/conf.py similarity index 62% rename from docs/conf.py rename to docs/zh_cn/conf.py index bea4706cf0430220087b77847f5a07cd24c9b31f..2c144917848c787ea1db602e482c09bcf8fae6af 100644 --- a/docs/conf.py +++ b/docs/zh_cn/conf.py @@ -15,21 +15,19 @@ import os import sys import pytorch_sphinx_theme -from m2r import MdInclude -from recommonmark.transform import AutoStructify from sphinx.builders.html import StandaloneHTMLBuilder -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath('../..')) -version_file = '../mmcv/version.py' -with open(version_file, 'r') as f: +version_file = '../../mmcv/version.py' +with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) __version__ = locals()['__version__'] # -- Project information ----------------------------------------------------- project = 'mmcv' -copyright = '2018-2021, OpenMMLab' +copyright = '2018-2022, OpenMMLab' author = 'MMCV Authors' # The short X.Y version @@ -57,6 +55,8 @@ extensions = [ 'sphinx_copybutton', ] # yapf: disable +myst_heading_anchors = 4 + autodoc_mock_imports = ['mmcv._ext', 'mmcv.utils.ext_loader', 'torchvision'] autosectionlabel_prefix_document = True @@ -79,7 +79,7 @@ master_doc = 'index' # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = 'zh_CN' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
@@ -108,92 +108,9 @@ html_theme_options = { 'name': 'GitHub', 'url': 'https://github.com/open-mmlab/mmcv' }, - { - 'name': - 'Docs', - 'children': [ - { - 'name': 'MMCV', - 'url': 'https://mmcv.readthedocs.io/en/latest/', - }, - { - 'name': 'MIM', - 'url': 'https://openmim.readthedocs.io/en/latest/' - }, - { - 'name': 'MMAction2', - 'url': 'https://mmaction2.readthedocs.io/en/latest/', - }, - { - 'name': 'MMClassification', - 'url': - 'https://mmclassification.readthedocs.io/en/latest/', - }, - { - 'name': 'MMDetection', - 'url': 'https://mmdetection.readthedocs.io/en/latest/', - }, - { - 'name': 'MMDetection3D', - 'url': 'https://mmdetection3d.readthedocs.io/en/latest/', - }, - { - 'name': 'MMEditing', - 'url': 'https://mmediting.readthedocs.io/en/latest/', - }, - { - 'name': 'MMGeneration', - 'url': 'https://mmgeneration.readthedocs.io/en/latest/', - }, - { - 'name': 'MMOCR', - 'url': 'https://mmocr.readthedocs.io/en/latest/', - }, - { - 'name': 'MMPose', - 'url': 'https://mmpose.readthedocs.io/en/latest/', - }, - { - 'name': 'MMSegmentation', - 'url': 'https://mmsegmentation.readthedocs.io/en/latest/', - }, - { - 'name': 'MMTracking', - 'url': 'https://mmtracking.readthedocs.io/en/latest/', - }, - { - 'name': 'MMFlow', - 'url': 'https://mmflow.readthedocs.io/en/latest/', - }, - { - 'name': 'MMFewShot', - 'url': 'https://mmfewshot.readthedocs.io/en/latest/', - }, - ] - }, - { - 'name': - 'OpenMMLab', - 'children': [ - { - 'name': 'Homepage', - 'url': 'https://openmmlab.com/' - }, - { - 'name': 'GitHub', - 'url': 'https://github.com/open-mmlab/' - }, - { - 'name': 'Twitter', - 'url': 'https://twitter.com/OpenMMLab' - }, - { - 'name': 'Zhihu', - 'url': 'https://zhihu.com/people/openmmlab' - }, - ] - }, - ] + ], + # Specify the language of shared menu + 'menu_lang': 'cn', } # Add any paths that contain custom static files (such as style sheets) here, @@ -286,16 +203,3 @@ StandaloneHTMLBuilder.supported_image_types = [ # Ignore >>> when copying code copybutton_prompt_text = r'>>> |\.\.\. 
' copybutton_prompt_is_regexp = True - - -def setup(app): - app.add_config_value('no_underscore_emphasis', False, 'env') - app.add_config_value('m2r_parse_relative_links', False, 'env') - app.add_config_value('m2r_anonymous_references', False, 'env') - app.add_config_value('m2r_disable_inline_math', False, 'env') - app.add_directive('mdinclude', MdInclude) - app.add_config_value('recommonmark_config', { - 'auto_toc_tree_section': 'Contents', - 'enable_eval_rst': True, - }, True) - app.add_transform(AutoStructify) diff --git a/docs_zh_CN/deployment/onnx.md b/docs/zh_cn/deployment/onnx.md similarity index 100% rename from docs_zh_CN/deployment/onnx.md rename to docs/zh_cn/deployment/onnx.md diff --git a/docs_zh_CN/deployment/onnxruntime_custom_ops.md b/docs/zh_cn/deployment/onnxruntime_custom_ops.md similarity index 98% rename from docs_zh_CN/deployment/onnxruntime_custom_ops.md rename to docs/zh_cn/deployment/onnxruntime_custom_ops.md index 594aefb4ba4566aeda990ee5f42512f5e2be1917..1150f919efb1df20e2d99d02747fe2c331554010 100644 --- a/docs_zh_CN/deployment/onnxruntime_custom_ops.md +++ b/docs/zh_cn/deployment/onnxruntime_custom_ops.md @@ -64,7 +64,7 @@ | 类型 | 参数名 | 描述 | | ------- | --------------- | ------------------------------------------------------- | -| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围[0, 1]。默认值为0 | +| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围\[0, 1\]。默认值为0 | | `float` | `sigma` | 高斯方法的超参数 | | `float` | `min_score` | NMS的score阈值 | | `int` | `method` | NMS的计算方式, (0: `naive`, 1: `linear`, 2: `gaussian`) | @@ -137,10 +137,10 @@ #### 模型参数 -| 类型 | 参数名 | 描述 | -| ------- | --------------- | ----------------------------------------------------- | -| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围[0, 1]。默认值为0 | -| `int` | `offset` | 用来计算候选框的宽高(x2 - x1 + offset)。可选值0或1 | +| 类型 | 参数名 | 描述 | +| ------- | --------------- | ------------------------------------------------------- | +| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围\[0, 1\]。默认值为0 | +| `int` | `offset` | 用来计算候选框的宽高(x2 - x1 + offset)。可选值0或1 | #### 输入 diff --git a/docs_zh_CN/deployment/onnxruntime_op.md b/docs/zh_cn/deployment/onnxruntime_op.md similarity index 78% rename from docs_zh_CN/deployment/onnxruntime_op.md rename to docs/zh_cn/deployment/onnxruntime_op.md index 3898aa164fd019b635890243d03de316d2f36127..e5599307294a87093110bdd5fa33966f275572cd 100644 --- a/docs_zh_CN/deployment/onnxruntime_op.md +++ b/docs/zh_cn/deployment/onnxruntime_op.md @@ -15,16 +15,16 @@ ### MMCV已支持的算子 -| 算子 | CPU | GPU | MMCV版本 | -| :------------------------------------------------------------------------------: | :---: | :---: | :------: | -| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | -| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | -| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | -| [grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | -| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | -| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | 1.3.4 | -| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | 1.3.4 | -| [MMCVModulatedDeformConv2d](onnxruntime_custom_ops.md#mmcvmodulateddeformconv2d) | Y | N | 1.3.12 | +| 算子 | CPU | GPU | MMCV版本 | +| :------------------------------------------------------------------------------: | :-: | :-: | :------: | +| [SoftNMS](onnxruntime_custom_ops.md#softnms) | Y | N | 1.2.3 | +| [RoIAlign](onnxruntime_custom_ops.md#roialign) | Y | N | 1.2.5 | +| [NMS](onnxruntime_custom_ops.md#nms) | Y | N | 1.2.7 | +| 
[grid_sampler](onnxruntime_custom_ops.md#grid_sampler) | Y | N | 1.3.1 | +| [CornerPool](onnxruntime_custom_ops.md#cornerpool) | Y | N | 1.3.4 | +| [cummax](onnxruntime_custom_ops.md#cummax) | Y | N | 1.3.4 | +| [cummin](onnxruntime_custom_ops.md#cummin) | Y | N | 1.3.4 | +| [MMCVModulatedDeformConv2d](onnxruntime_custom_ops.md#mmcvmodulateddeformconv2d) | Y | N | 1.3.12 | ### 如何编译ONNX Runtime自定义算子? @@ -97,18 +97,20 @@ onnx_results = sess.run(None, {'input' : input_data}) 以`soft_nms`为例: 1. 在ONNX Runtime头文件目录`mmcv/ops/csrc/onnxruntime/`下添加头文件`soft_nms.h` + 2. 在ONNX Runtime源码目录`mmcv/ops/csrc/onnxruntime/cpu/`下添加算子实现`soft_nms.cpp` -3. 在[onnxruntime_register.cpp](../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp)中注册实现的算子`soft_nms` - ```c++ - #include "soft_nms.h" +3. 在[onnxruntime_register.cpp](../../../mmcv/ops/csrc/onnxruntime/cpu/onnxruntime_register.cpp)中注册实现的算子`soft_nms` + + ```c++ + #include "soft_nms.h" - SoftNmsOp c_SoftNmsOp; + SoftNmsOp c_SoftNmsOp; - if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { - return status; - } - ``` + if (auto status = ortApi->CustomOpDomain_Add(domain, &c_SoftNmsOp)) { + return status; + } + ``` 4. 在`tests/test_ops/test_onnx.py`添加单元测试, 可以参考[here](../../tests/test_ops/test_onnx.py)。 @@ -118,10 +120,10 @@ onnx_results = sess.run(None, {'input' : input_data}) ### 已知问题 - "RuntimeError: tuple appears in op that does not forward tuples, unsupported kind: `prim::PythonOp`." - 1. 请注意`cummax`和`cummin`算子是在torch >= 1.5.0被添加的。但他们需要在torch version >= 1.7.0才能正确导出。否则会在导出时发生上面的错误。 - 2. 解决方法:升级PyTorch到1.7.0以上版本 + 1. 请注意`cummax`和`cummin`算子是在torch >= 1.5.0被添加的。但他们需要在torch version >= 1.7.0才能正确导出。否则会在导出时发生上面的错误。 + 2. 解决方法:升级PyTorch到1.7.0以上版本 ### 引用 - [How to export Pytorch model with custom op to ONNX and run it in ONNX Runtime](https://github.com/onnx/tutorials/blob/master/PyTorchCustomOperator/README.md) -- [How to add a custom operator/kernel in ONNX Runtime](https://github.com/microsoft/onnxruntime/blob/master/docs/AddingCustomOp.md) +- [How to add a custom operator/kernel in ONNX Runtime](https://onnxruntime.ai/docs/reference/operators/add-custom-op.html) diff --git a/docs_zh_CN/deployment/tensorrt_custom_ops.md b/docs/zh_cn/deployment/tensorrt_custom_ops.md similarity index 95% rename from docs_zh_CN/deployment/tensorrt_custom_ops.md rename to docs/zh_cn/deployment/tensorrt_custom_ops.md index 123f2889bf18aa549c327ea70f3ba974b45e48f5..d7731548303a03bd089950d5a2c87bed1c8e2fd7 100644 --- a/docs_zh_CN/deployment/tensorrt_custom_ops.md +++ b/docs/zh_cn/deployment/tensorrt_custom_ops.md @@ -100,7 +100,7 @@ #### 描述 -ScatterND接收三个输入,分别为秩为r >= 1的`data`,秩为q >= 1的`indices`以及秩为 q + r - indices.shape[-1] -1 的`update`。输出的计算方式为:首先创建一个`data`的拷贝,然后根据`indces`的值使用`update`对拷贝的`data`进行更新。注意`indices`中不应该存在相同的条目,也就是说对同一个位置进行一次以上的更新是不允许的。 +ScatterND接收三个输入,分别为秩为r >= 1的`data`,秩为q >= 1的`indices`以及秩为 q + r - indices.shape\[-1\] -1 的`update`。输出的计算方式为:首先创建一个`data`的拷贝,然后根据`indces`的值使用`update`对拷贝的`data`进行更新。注意`indices`中不应该存在相同的条目,也就是说对同一个位置进行一次以上的更新是不允许的。 输出的计算方式可以参考如下代码: @@ -147,13 +147,13 @@ ScatterND接收三个输入,分别为秩为r >= 1的`data`,秩为q >= 1的`i #### 模型参数 -| 类型 | 参数名 | 描述 | -| ------- | ---------------------------- | ---------------------------------------------------------------------------------------- | -| `int` | `center_point_box` | 0 - 候选框的格式为[y1, x1, y2, x2], 1-候选框的格式为[x_center, y_center, width, height] | -| `int` | `max_output_boxes_per_class` | 每一类最大的输出检测框个数。默认为0,输出检测框个数等于输入候选框数 | -| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围[0, 1]。默认值为0 | -| `float` | 
`score_threshold` | 用来判断候选框是否合法的阈值 | -| `int` | `offset` | 检测框长宽计算方式为(x2 - x1 + offset),可选值0或1 | +| 类型 | 参数名 | 描述 | +| ------- | ---------------------------- | -------------------------------------------------------------------------------------------- | +| `int` | `center_point_box` | 0 - 候选框的格式为\[y1, x1, y2, x2\], 1-候选框的格式为\[x_center, y_center, width, height\] | +| `int` | `max_output_boxes_per_class` | 每一类最大的输出检测框个数。默认为0,输出检测框个数等于输入候选框数 | +| `float` | `iou_threshold` | 用来判断候选框重合度的阈值,取值范围\[0, 1\]。默认值为0 | +| `float` | `score_threshold` | 用来判断候选框是否合法的阈值 | +| `int` | `offset` | 检测框长宽计算方式为(x2 - x1 + offset),可选值0或1 | #### 输入 diff --git a/docs_zh_CN/deployment/tensorrt_plugin.md b/docs/zh_cn/deployment/tensorrt_plugin.md similarity index 79% rename from docs_zh_CN/deployment/tensorrt_plugin.md rename to docs/zh_cn/deployment/tensorrt_plugin.md index 0f385b8e032fac3267a838367b53d26880a693c9..0c29f14b1eb93450b606c41e831e9c6b511efe96 100644 --- a/docs_zh_CN/deployment/tensorrt_plugin.md +++ b/docs/zh_cn/deployment/tensorrt_plugin.md @@ -2,18 +2,18 @@ -- [MMCV中的TensorRT自定义算子 (实验性)](#mmcv中的tensorrt自定义算子-实验性) - - [介绍](#介绍) - - [MMCV中的TensorRT插件列表](#mmcv中的tensorrt插件列表) - - [如何编译MMCV中的TensorRT插件](#如何编译mmcv中的tensorrt插件) - - [准备](#准备) - - [在Linux上编译](#在linux上编译) - - [创建TensorRT推理引擎并在python下进行推理](#创建tensorrt推理引擎并在python下进行推理) - - [如何在MMCV中添加新的TensorRT自定义算子](#如何在mmcv中添加新的tensorrt自定义算子) - - [主要流程](#主要流程) - - [注意](#注意) - - [已知问题](#已知问题) - - [引用](#引用) +- [MMCV中的TensorRT自定义算子 (实验性)](#mmcv%E4%B8%AD%E7%9A%84tensorrt%E8%87%AA%E5%AE%9A%E4%B9%89%E7%AE%97%E5%AD%90-%E5%AE%9E%E9%AA%8C%E6%80%A7) + - [介绍](#%E4%BB%8B%E7%BB%8D) + - [MMCV中的TensorRT插件列表](#mmcv%E4%B8%AD%E7%9A%84tensorrt%E6%8F%92%E4%BB%B6%E5%88%97%E8%A1%A8) + - [如何编译MMCV中的TensorRT插件](#%E5%A6%82%E4%BD%95%E7%BC%96%E8%AF%91mmcv%E4%B8%AD%E7%9A%84tensorrt%E6%8F%92%E4%BB%B6) + - [准备](#%E5%87%86%E5%A4%87) + - [在Linux上编译](#%E5%9C%A8linux%E4%B8%8A%E7%BC%96%E8%AF%91) + - [创建TensorRT推理引擎并在python下进行推理](#%E5%88%9B%E5%BB%BAtensorrt%E6%8E%A8%E7%90%86%E5%BC%95%E6%93%8E%E5%B9%B6%E5%9C%A8python%E4%B8%8B%E8%BF%9B%E8%A1%8C%E6%8E%A8%E7%90%86) + - [如何在MMCV中添加新的TensorRT自定义算子](#%E5%A6%82%E4%BD%95%E5%9C%A8mmcv%E4%B8%AD%E6%B7%BB%E5%8A%A0%E6%96%B0%E7%9A%84tensorrt%E8%87%AA%E5%AE%9A%E4%B9%89%E7%AE%97%E5%AD%90) + - [主要流程](#%E4%B8%BB%E8%A6%81%E6%B5%81%E7%A8%8B) + - [注意](#%E6%B3%A8%E6%84%8F) + - [已知问题](#%E5%B7%B2%E7%9F%A5%E9%97%AE%E9%A2%98) + - [引用](#%E5%BC%95%E7%94%A8) @@ -75,6 +75,10 @@ pip install $TENSORRT_DIR/graphsurgeon/graphsurgeon-0.4.5-py2.py3-none-any.whl 想了解更多通过tar包安装TensorRT,请访问[Nvidia' website](https://docs.nvidia.com/deeplearning/tensorrt/archives/tensorrt-721/install-guide/index.html#installing-tar). +- 安装 cuDNN + +参考[Nvidia' website](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html#installlinux-tar)安装 cuDNN 8。 + #### 在Linux上编译 ```bash @@ -142,21 +146,24 @@ with torch.no_grad(): **以RoIAlign算子插件`roi_align`举例。** 1. 在TensorRT包含目录`mmcv/ops/csrc/tensorrt/`中添加头文件`trt_roi_align.hpp` + 2. 在TensorRT源码目录`mmcv/ops/csrc/tensorrt/plugins/`中添加头文件`trt_roi_align.cpp` + 3. 在TensorRT源码目录`mmcv/ops/csrc/tensorrt/plugins/`中添加cuda kernel文件`trt_roi_align_kernel.cu` + 4. 
在[trt_plugin.cpp](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/tensorrt/plugins/trt_plugin.cpp)中注册`roi_align`插件 - ```c++ - #include "trt_plugin.hpp" + ```c++ + #include "trt_plugin.hpp" - #include "trt_roi_align.hpp" + #include "trt_roi_align.hpp" - REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); + REGISTER_TENSORRT_PLUGIN(RoIAlignPluginDynamicCreator); - extern "C" { - bool initLibMMCVInferPlugins() { return true; } - } // extern "C" - ``` + extern "C" { + bool initLibMMCVInferPlugins() { return true; } + } // extern "C" + ``` 5. 在`tests/test_ops/test_tensorrt.py`中添加单元测试 diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md new file mode 100644 index 0000000000000000000000000000000000000000..6cfb100c631b101fa0cff0650105a3cc7d735e7b --- /dev/null +++ b/docs/zh_cn/faq.md @@ -0,0 +1,91 @@ +## 常见问题 + +在这里我们列出了用户经常遇到的问题以及对应的解决方法。如果您遇到了其他常见的问题,并且知道可以帮到大家的解决办法, +欢迎随时丰富这个列表。 + +### 安装问题 + +- KeyError: "xxx: 'yyy is not in the zzz registry'" + + 只有模块所在的文件被导入时,注册机制才会被触发,所以您需要在某处导入该文件,更多详情请查看 [KeyError: "MaskRCNN: 'RefineRoIHead is not in the models registry'"](https://github.com/open-mmlab/mmdetection/issues/5974)。 + +- "No module named 'mmcv.ops'"; "No module named 'mmcv.\_ext'" + + 1. 使用 `pip uninstall mmcv` 卸载您环境中的 mmcv + 2. 参考 [installation instruction](https://mmcv.readthedocs.io/en/latest/get_started/installation.html) 或者 [Build MMCV from source](https://mmcv.readthedocs.io/en/latest/get_started/build.html) 安装 mmcv-full + +- "invalid device function" 或者 "no kernel image is available for execution" + + 1. 检查 GPU 的 CUDA 计算能力 + 2. 运行 `python mmdet/utils/collect_env.py` 来检查 PyTorch、torchvision 和 MMCV 是否是针对正确的 GPU 架构构建的,您可能需要去设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV。兼容性问题可能会出现在使用旧版的 GPUs,如:colab 上的 Tesla K80 (3.7) + 3. 检查运行环境是否和 mmcv/mmdet 编译时的环境相同。例如,您可能使用 CUDA 10.0 编译 mmcv,但在 CUDA 9.0 的环境中运行它 + +- "undefined symbol" 或者 "cannot open xxx.so" + + 1. 如果符号和 CUDA/C++ 相关(例如:libcudart.so 或者 GLIBCXX),请检查 CUDA/GCC 运行时的版本是否和编译 mmcv 的一致 + 2. 如果符号和 PyTorch 相关(例如:符号包含 caffe、aten 和 TH),请检查 PyTorch 运行时的版本是否和编译 mmcv 的一致 + 3. 运行 `python mmdet/utils/collect_env.py` 以检查 PyTorch、torchvision 和 MMCV 构建和运行的环境是否相同 + +- "RuntimeError: CUDA error: invalid configuration argument" + + 这个错误可能是由于您的 GPU 性能不佳造成的。尝试降低 [THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) + 的值并重新编译 mmcv。 + +- "RuntimeError: nms is not compiled with GPU support" + + 这个错误是由于您的 CUDA 环境没有正确安装。 + 您可以尝试重新安装您的 CUDA 环境,然后删除 mmcv/build 文件夹并重新编译 mmcv。 + +- "Segmentation fault" + + 1. 检查 GCC 的版本,通常是因为 PyTorch 版本与 GCC 版本不匹配 (例如 GCC \< 4.9 ),我们推荐用户使用 GCC 5.4,我们也不推荐使用 GCC 5.5, 因为有反馈 GCC 5.5 会导致 "segmentation fault" 并且切换到 GCC 5.4 就可以解决问题 + 2. 检查是否正确安装 CUDA 版本的 PyTorc。输入以下命令并检查是否返回 True + ```shell + python -c 'import torch; print(torch.cuda.is_available())' + ``` + 3. 如果 `torch` 安装成功,那么检查 MMCV 是否安装成功。输入以下命令,如果没有报错说明 mmcv-full 安装成。 + ```shell + python -c 'import mmcv; import mmcv.ops' + ``` + 4. 如果 MMCV 与 PyTorch 都安装成功了,则可以使用 `ipdb` 设置断点或者使用 `print` 函数,分析是哪一部分的代码导致了 `segmentation fault` + +- "libtorch_cuda_cu.so: cannot open shared object file" + + `mmcv-full` 依赖 `libtorch_cuda_cu.so` 文件,但程序运行时没能找到该文件。我们可以检查该文件是否存在 `~/miniconda3/envs/{environment-name}/lib/python3.7/site-packages/torch/lib` 也可以尝试重装 PyTorch。 + +- "fatal error C1189: #error: -- unsupported Microsoft Visual Studio version!" 
+ + 如果您在 Windows 上编译 mmcv-full 并且 CUDA 的版本是 9.2,您很可能会遇到这个问题 `"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.2\include\crt/host_config.h(133): fatal error C1189: #error: -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!"`,您可以尝试使用低版本的 Microsoft Visual Studio,例如 vs2017。 + +- "error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized" + + 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.5.0,您很可能会遇到这个问题 `- torch/csrc/jit/api/module.h(474): error: member "torch::jit::detail::ModulePolicy::all_slots" may not be initialized`。解决这个问题的方法是将 `torch/csrc/jit/api/module.h` 文件中所有 `static constexpr bool all_slots = false;` 替换为 `static bool all_slots = false;`。更多细节可以查看 [member "torch::jit::detail::AttributePolicy::all_slots" may not be initialized](https://github.com/pytorch/pytorch/issues/39394)。 + +- "error: a member with an in-class initializer must be const" + + 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.6.0,您很可能会遇到这个问题 `"- torch/include\torch/csrc/jit/api/module.h(483): error: a member with an in-class initializer must be const"`. 解决这个问题的方法是将 `torch/include\torch/csrc/jit/api/module.h` 文件中的所有 `CONSTEXPR_EXCEPT_WIN_CUDA ` 替换为 `const`。更多细节可以查看 [Ninja: build stopped: subcommand failed](https://github.com/open-mmlab/mmcv/issues/575)。 + +- "error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized" + + 如果您在 Windows 上编译 mmcv-full 并且 PyTorch 的版本是 1.7.0,您很可能会遇到这个问题 `torch/include\torch/csrc/jit/ir/ir.h(1347): error: member "torch::jit::ProfileOptionalOp::Kind" may not be initialized`. 解决这个问题的方法是修改 PyTorch 中的几个文件: + + - 删除 `torch/include\torch/csrc/jit/ir/ir.h` 文件中的 `static constexpr Symbol Kind = ::c10::prim::profile;` 和 `tatic constexpr Symbol Kind = ::c10::prim::profile_optional;` + - 将 `torch\include\pybind11\cast.h` 文件中的 `explicit operator type&() { return *(this->value); }` 替换为 `explicit operator type&() { return *((type*)this->value); }` + - 将 `torch/include\torch/csrc/jit/api/module.h` 文件中的 所有 `CONSTEXPR_EXCEPT_WIN_CUDA` 替换为 `const` + + 更多细节可以查看 [Ensure default extra_compile_args](https://github.com/pytorch/pytorch/pull/45956)。 + +- MMCV 和 MMDetection 的兼容性问题;"ConvWS is already registered in conv layer" + + 请参考 [installation instruction](https://mmdetection.readthedocs.io/en/latest/get_started.html#installation) 为您的 MMDetection 版本安装正确版本的 MMCV。 + +### 使用问题 + +- "RuntimeError: Expected to have finished reduction in the prior iteration before starting a new one" + + 1. 这个错误是因为有些参数没有参与 loss 的计算,可能是代码中存在多个分支,导致有些分支没有参与 loss 的计算。更多细节见 [Expected to have finished reduction in the prior iteration before starting a new one](https://github.com/pytorch/pytorch/issues/55582)。 + 2. 
你可以设置 DDP 中的 `find_unused_parameters` 为 `True`,或者手动查找哪些参数没有用到。 + +- "RuntimeError: Trying to backward through the graph a second time" + + 不能同时设置 `GradientCumulativeOptimizerHook` 和 `OptimizerHook`,这会导致 `loss.backward()` 被调用两次,于是程序抛出 `RuntimeError`。我们只需设置其中的一个。更多细节见 [Trying to backward through the graph a second time](https://github.com/open-mmlab/mmcv/issues/1379)。 diff --git a/docs_zh_CN/get_started/build.md b/docs/zh_cn/get_started/build.md similarity index 50% rename from docs_zh_CN/get_started/build.md rename to docs/zh_cn/get_started/build.md index 77fb86e9cf5c805bdca5fdaff6f22768cbfe8d3e..ec6ebb887946f115a7a7ac06e43da6b261e36d28 100644 --- a/docs_zh_CN/get_started/build.md +++ b/docs/zh_cn/get_started/build.md @@ -9,6 +9,12 @@ git clone https://github.com/open-mmlab/mmcv.git cd mmcv ``` +建议安装 `ninja` 以加快编译速度 + +```bash +pip install -r requirements/optional.txt +``` + 你可以安装 lite 版本 ```bash @@ -36,6 +42,7 @@ CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' MMCV_WITH_OPS=1 pip install -e . ```{note} 如果你打算使用 `opencv-python-headless` 而不是 `opencv-python`,例如在一个很小的容器环境或者没有图形用户界面的服务器中,你可以先安装 `opencv-python-headless`,这样在安装 mmcv 依赖的过程中会跳过 `opencv-python` ``` + ### 在 Windows 上编译 MMCV 在 Windows 上编译 MMCV 比 Linux 复杂,本节将一步步介绍如何在 Windows 上编译 MMCV。 @@ -63,32 +70,38 @@ CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' MMCV_WITH_OPS=1 pip install -e . 2. 创建一个新的 Conda 环境 - ```shell - conda create --name mmcv python=3.7 # 经测试,3.6, 3.7, 3.8 也能通过 - conda activate mmcv # 确保做任何操作前先激活环境 - ``` + ```shell + conda create --name mmcv python=3.7 # 经测试,3.6, 3.7, 3.8 也能通过 + conda activate mmcv # 确保做任何操作前先激活环境 + ``` 3. 安装 PyTorch 时,可以根据需要安装支持 CUDA 或不支持 CUDA 的版本 - ```shell - # CUDA version - conda install pytorch torchvision cudatoolkit=10.2 -c pytorch - # CPU version - conda install pytorch torchvision cpuonly -c pytorch - ``` + ```shell + # CUDA version + conda install pytorch torchvision cudatoolkit=10.2 -c pytorch + # CPU version + conda install pytorch torchvision cpuonly -c pytorch + ``` 4. 准备 MMCV 源代码 - ```shell - git clone https://github.com/open-mmlab/mmcv.git - cd mmcv - ``` + ```shell + git clone https://github.com/open-mmlab/mmcv.git + cd mmcv + ``` 5. 安装所需 Python 依赖包 - ```shell - pip3 install -r requirements.txt - ``` + ```shell + pip3 install -r requirements/runtime.txt + ``` + +6. 建议安装 `ninja` 以加快编译速度 + + ```bash + pip install -r requirements/optional.txt + ``` #### 编译与安装 MMCV @@ -96,33 +109,33 @@ MMCV 有三种安装的模式: 1. Lite 版本(不包含算子) - 这种方式下,没有算子被编译,这种模式的 mmcv 是原生的 python 包 + 这种方式下,没有算子被编译,这种模式的 mmcv 是原生的 python 包 2. Full 版本(只包含 CPU 算子) - 编译 CPU 算子,但只有 x86 将会被编译,并且编译版本只能在 CPU only 情况下运行 + 编译 CPU 算子,但只有 x86 将会被编译,并且编译版本只能在 CPU only 情况下运行 3. Full 版本(既包含 CPU 算子,又包含 CUDA 算子) - 同时编译 CPU 和 CUDA 算子,`ops` 模块的 x86 与 CUDA 的代码都可以被编译。同时编译的版本可以在 CUDA 上调用 GPU + 同时编译 CPU 和 CUDA 算子,`ops` 模块的 x86 与 CUDA 的代码都可以被编译。同时编译的版本可以在 CUDA 上调用 GPU ##### 通用步骤 1. 设置 MSVC 编译器 - 设置环境变量。添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`,则 `cl.exe` 可以在命令行中运行,如下所示。 + 设置环境变量。添加 `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.27.29110\bin\Hostx86\x64` 到 `PATH`,则 `cl.exe` 可以在命令行中运行,如下所示。 - ```none - (base) PS C:\Users\xxx> cl - Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 - Copyright (C) Microsoft Corporation. All rights reserved. + ```none + (base) PS C:\Users\xxx> cl + Microsoft (R) C/C++ Optimizing Compiler Version 19.27.29111 for x64 + Copyright (C) Microsoft Corporation. All rights reserved. - usage: cl [ option... ] filename... 
[ / link linkoption... ] - ``` + usage: cl [ option... ] filename... [ / link linkoption... ] + ``` - 为了兼容性,我们使用 x86-hosted 以及 x64-targeted 版本,即路径中的 `Hostx86\x64` 。 + 为了兼容性,我们使用 x86-hosted 以及 x64-targeted 版本,即路径中的 `Hostx86\x64` 。 - 因为 PyTorch 将解析 `cl.exe` 的输出以检查其版本,只有 utf-8 将会被识别,你可能需要将系统语言更改为英语。控制面板 -> 地区-> 管理-> 非 Unicode 来进行语言转换。 + 因为 PyTorch 将解析 `cl.exe` 的输出以检查其版本,只有 utf-8 将会被识别,你可能需要将系统语言更改为英语。控制面板 -> 地区-> 管理-> 非 Unicode 来进行语言转换。 ##### 安装方式一:Lite version(不包含算子) @@ -145,20 +158,20 @@ pip list 2. 设置环境变量 - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 - ``` + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 + ``` 3. 编译安装 - ```shell - conda activate mmcv # 激活环境 - cd mmcv # 改变路径 - python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 - python setup.py develop # 安装 - pip list # 检查是否安装成功 - ``` + ```shell + conda activate mmcv # 激活环境 + cd mmcv # 改变路径 + python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 + python setup.py develop # 安装 + pip list # 检查是否安装成功 + ``` ##### 安装方式三:Full version(既编译 CPU 算子又编译 CUDA 算子) @@ -166,38 +179,38 @@ pip list 2. 设置环境变量 - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 - ``` + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 + ``` -3. 检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境变量已经存在在 `envs` 之中 +3. 检查 `CUDA_PATH` 或者 `CUDA_HOME` 环境变量已经存在在 `envs` 之中 - ```none - (base) PS C:\Users\WRH> ls env: + ```none + (base) PS C:\Users\WRH> ls env: - Name Value - ---- ----- - CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 - CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 - ``` + Name Value + ---- ----- + CUDA_PATH C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + CUDA_PATH_V10_1 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1 + CUDA_PATH_V10_2 C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 + ``` - 如果没有,你可以按照下面的步骤设置 + 如果没有,你可以按照下面的步骤设置 - ```shell - $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" - # 或者 - $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # CUDA_PATH_V10_2 已经在环境变量中 - ``` + ```shell + $env:CUDA_HOME = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2" + # 或者 + $env:CUDA_HOME = $env:CUDA_PATH_V10_2 # CUDA_PATH_V10_2 已经在环境变量中 + ``` 4. 设置 CUDA 的目标架构 - ```shell - $env:TORCH_CUDA_ARCH_LIST="6.1" # 支持 GTX 1080 - # 或者用所有支持的版本,但可能会变得很慢 - $env:TORCH_CUDA_ARCH_LIST="3.5 3.7 5.0 5.2 6.0 6.1 7.0 7.5" - ``` + ```shell + $env:TORCH_CUDA_ARCH_LIST="6.1" # 支持 GTX 1080 + # 或者用所有支持的版本,但可能会变得很慢 + $env:TORCH_CUDA_ARCH_LIST="3.5 3.7 5.0 5.2 6.0 6.1 7.0 7.5" + ``` ```{note} 我们可以在 [here](https://developer.nvidia.com/cuda-gpus) 查看 GPU 的计算能力 @@ -205,15 +218,15 @@ pip list 5. 
编译安装 - ```shell - $env:MMCV_WITH_OPS = 1 - $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 - conda activate mmcv # 激活环境 - cd mmcv # 改变路径 - python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 - python setup.py develop # 安装 - pip list # 检查是否安装成功 - ``` + ```shell + $env:MMCV_WITH_OPS = 1 + $env:MAX_JOBS = 8 # 根据你可用CPU以及内存量进行设置 + conda activate mmcv # 激活环境 + cd mmcv # 改变路径 + python setup.py build_ext # 如果成功, cl 将被启动用于编译算子 + python setup.py develop # 安装 + pip list # 检查是否安装成功 + ``` ```{note} 如果你的 PyTorch 版本是 1.6.0,你可能会遇到一些这个 [issue](https://github.com/pytorch/pytorch/issues/42467) 提到的错误,则可以参考这个 [pull request](https://github.com/pytorch/pytorch/pull/43380/files) 修改 本地环境的 PyTorch 源代码 diff --git a/docs_zh_CN/get_started/installation.md b/docs/zh_cn/get_started/installation.md similarity index 74% rename from docs_zh_CN/get_started/installation.md rename to docs/zh_cn/get_started/installation.md index 20e8cd59545fefb833b35195c1df7b4d3736b281..a6a20b054184623eea17a0852d37121d3fccea58 100644 --- a/docs_zh_CN/get_started/installation.md +++ b/docs/zh_cn/get_started/installation.md @@ -13,17 +13,17 @@ a. 安装完整版 在安装 mmcv-full 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 PyTorch 官方[文档](https://pytorch.org/)。 -我们提供了不同 PyTorch 和 CUDA 版本的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 +我们提供了 **Linux 和 Windows 平台** PyTorch 和 CUDA 版本组合的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](https://github.com/open-mmlab/mmcv/.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。 i. 安装最新版本 -如下是安装最新版 ``mmcv-full`` 的命令 +如下是安装最新版 `mmcv-full` 的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -请将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的最新版 ``mmcv-full``,使用如下替换过的命令 +请将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的最新版 `mmcv-full`,使用如下替换过的命令 ```shell pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -37,18 +37,18 @@ PyTorch 版本是 1.8.1、CUDA 版本是 11.1,你可以使用以下命令安 `pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html` ``` -如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 ``=={mmcv_version}`` 删去即可。 +如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 `=={mmcv_version}` 删去即可。 ii. 安装特定的版本 -如下是安装特定版本 ``mmcv-full`` 的命令 +如下是安装特定版本 `mmcv-full` 的命令 ```shell pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html ``` -首先请参考版本发布信息找到想要安装的版本号,将 ``{mmcv_version}`` 替换成该版本号,例如 ``1.3.9``。 -然后将链接中的 ``{cu_version}`` 和 ``{torch_version}`` 根据自身需求替换成实际的版本号,例如想安装和 ``CUDA 11.1``、``PyTorch 1.9.0`` 兼容的 ``mmcv-full`` 1.3.9 版本,使用如下替换过的命令 +首先请参考版本发布信息找到想要安装的版本号,将 `{mmcv_version}` 替换成该版本号,例如 `1.3.9`。 +然后将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的 `mmcv-full` 1.3.9 版本,使用如下替换过的命令 ```shell pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html @@ -60,15 +60,27 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t CUDA - torch 1.10 - torch 1.9 - torch 1.8 - torch 1.7 - torch 1.6 - torch 1.5 + torch 1.11 + torch 1.10 + torch 1.9 + torch 1.8 + torch 1.7 + torch 1.6 + torch 1.5 + + + 11.5 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu115/torch1.11.0/index.html
+ + + + + + 11.3 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10.0/index.html
@@ -78,6 +90,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 11.1 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
@@ -90,12 +103,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu110/torch1.7.0/index.html
10.2 +
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.11.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.10.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.9.0/index.html
安装
pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html
@@ -107,6 +122,7 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t 10.1 +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.8.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html
@@ -117,12 +133,14 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.7.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.6.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cu92/torch1.5.0/index.html
cpu +
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.11.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.10.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.9.0/index.html
安装
 pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8.0/index.html
@@ -134,7 +152,11 @@ pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/t ```{note} -以上提供的预编译包并不囊括所有的 mmcv-full 版本,我们可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./docs_zh_CN/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 +以上提供的预编译包并不囊括所有的 mmcv-full 版本,我们可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。 +``` + +```{note} +mmcv-full 没有提供 Windows 平台 `cu102-torch1.8.0` 和 `cu92-torch*` 的预编译包。 ``` 除了使用预编译包之外,另一种方式是在本地进行编译,直接运行下述命令 diff --git a/docs_zh_CN/get_started/introduction.md b/docs/zh_cn/get_started/introduction.md similarity index 62% rename from docs_zh_CN/get_started/introduction.md rename to docs/zh_cn/get_started/introduction.md index 0082ae88a6a94fb09c76d9a821121ceb58b901a5..990713254928616f53240ca6f8926d9d1e5a8aec 100644 --- a/docs_zh_CN/get_started/introduction.md +++ b/docs/zh_cn/get_started/introduction.md @@ -2,16 +2,24 @@ MMCV 是一个面向计算机视觉的基础库,它支持了很多开源项目,例如: +- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 - [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 +- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 +- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱 +- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 +- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 - [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 -- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 +- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 - [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 -- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 - [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 +- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 MMCV 提供了如下众多功能: @@ -23,6 +31,12 @@ MMCV 提供了如下众多功能: - 多种 CNN 网络结构 - 高质量实现的常见 CUDA 算子 +MMCV 支持以下的系统: + +- Linux +- Windows +- macOS + 如想了解更多特性和使用,请参考[文档](https://mmcv.readthedocs.io/zh_CN/latest)。 ```{note} diff --git a/docs_zh_CN/get_started/previous_versions.md b/docs/zh_cn/get_started/previous_versions.md similarity index 93% rename from docs_zh_CN/get_started/previous_versions.md rename to docs/zh_cn/get_started/previous_versions.md index 
56679d48181290768f33d0da866b7399ca63e710..d543818752b51985169d4489bd46708725ce422d 100644 --- a/docs_zh_CN/get_started/previous_versions.md +++ b/docs/zh_cn/get_started/previous_versions.md @@ -1,11 +1,10 @@ - ## 其他版本的 PyTorch 我们不再提供在较低的 `PyTorch` 版本下编译的 `mmcv-full` 包,但为了您的方便,您可以在下面找到它们。 ### PyTorch 1.4 -| 1.0.0 <= mmcv_version <= 1.2.1 +| 1.0.0 \<= mmcv_version \<= 1.2.1 #### CUDA 10.1 @@ -27,7 +26,7 @@ pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dis ### PyTorch v1.3 -| 1.0.0 <= mmcv_version <= 1.3.16 +| 1.0.0 \<= mmcv_version \<= 1.3.16 #### CUDA 10.1 diff --git a/docs_zh_CN/index.rst b/docs/zh_cn/index.rst similarity index 100% rename from docs_zh_CN/index.rst rename to docs/zh_cn/index.rst diff --git a/docs_zh_CN/make.bat b/docs/zh_cn/make.bat similarity index 100% rename from docs_zh_CN/make.bat rename to docs/zh_cn/make.bat diff --git a/docs_zh_CN/mmcv-logo.png b/docs/zh_cn/mmcv-logo.png similarity index 100% rename from docs_zh_CN/mmcv-logo.png rename to docs/zh_cn/mmcv-logo.png diff --git a/docs/zh_cn/understand_mmcv/cnn.md b/docs/zh_cn/understand_mmcv/cnn.md new file mode 100644 index 0000000000000000000000000000000000000000..aa8584f72f3825080c8620dadaf947a591bed22a --- /dev/null +++ b/docs/zh_cn/understand_mmcv/cnn.md @@ -0,0 +1,570 @@ +## 卷积神经网络 + +我们为卷积神经网络提供了一些构建模块,包括层构建、模块组件和权重初始化。 + +### 网络层的构建 + +在运行实验时,我们可能需要尝试同属一种类型但不同配置的层,但又不希望每次都修改代码。于是我们提供一些层构建方法,可以从字典构建层,字典可以在配置文件中配置,也可以通过命令行参数指定。 + +#### 用法 + +一个简单的例子: + +```python +cfg = dict(type='Conv3d') +layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) +``` + +- `build_conv_layer`: 支持的类型包括 Conv1d、Conv2d、Conv3d、Conv (Conv是Conv2d的别名) +- `build_norm_layer`: 支持的类型包括 BN1d、BN2d、BN3d、BN (alias for BN2d)、SyncBN、GN、LN、IN1d、IN2d、IN3d、IN(IN是IN2d的别名) +- `build_activation_layer`:支持的类型包括 ReLU、LeakyReLU、PReLU、RReLU、ReLU6、ELU、Sigmoid、Tanh、GELU +- `build_upsample_layer`: 支持的类型包括 nearest、bilinear、deconv、pixel_shuffle +- `build_padding_layer`: 支持的类型包括 zero、reflect、replicate + +#### 拓展 + +我们还允许自定义层和算子来扩展构建方法。 + +1. 编写和注册自己的模块: + + ```python + from mmcv.cnn import UPSAMPLE_LAYERS + + @UPSAMPLE_LAYERS.register_module() + class MyUpsample: + + def __init__(self, scale_factor): + pass + + def forward(self, x): + pass + ``` + +2. 
在某处导入 `MyUpsample` (例如 `__init__.py` )然后使用它: + + ```python + cfg = dict(type='MyUpsample', scale_factor=2) + layer = build_upsample_layer(cfg) + ``` + +### 模块组件 + +我们还提供了常用的模块组件,以方便网络构建。 +卷积组件 `ConvModule` 由 convolution、normalization以及activation layers 组成,更多细节请参考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。 + +```python +# conv + bn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) +# conv + gn + relu +conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) +# conv + relu +conv = ConvModule(3, 8, 2) +# conv +conv = ConvModule(3, 8, 2, act_cfg=None) +# conv + leaky relu +conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) +# bn + conv + relu +conv = ConvModule( + 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) +``` + +### Weight initialization + +> 实现细节可以在 [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py)中找到 + +在训练过程中,适当的初始化策略有利于加快训练速度或者获得更高的性能。 在MMCV中,我们提供了一些常用的方法来初始化模块,比如 `nn.Conv2d` 模块。当然,我们也提供了一些高级API,可用于初始化包含一个或多个模块的模型。 + +#### Initialization functions + +以函数的方式初始化 `nn.Module` ,例如 `nn.Conv2d` 、 `nn.Linear` 等。 + +我们提供以下初始化方法, + +- constant_init + + 使用给定常量值初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import constant_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # constant_init(module, val, bias=0) + >>> constant_init(conv1, 1, 0) + >>> conv1.weight + ``` + +- xavier_init + + 按照 [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) 描述的方法初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # xavier_init(module, gain=1, bias=0, distribution='normal') + >>> xavier_init(conv1, distribution='normal') + ``` + +- normal_init + + 使用正态分布(高斯分布)初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import normal_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # normal_init(module, mean=0, std=1, bias=0) + >>> normal_init(conv1, std=0.01, bias=0) + ``` + +- uniform_init + + 使用均匀分布初始化模型参数 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import uniform_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # uniform_init(module, a=0, b=1, bias=0) + >>> uniform_init(conv1, a=0, b=1) + ``` + +- kaiming_init + + 按照 [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification - He, K. et al. 
(2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) 描述的方法来初始化模型参数。 + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import kaiming_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') + >>> kaiming_init(conv1) + ``` + +- caffe2_xavier_init + + caffe2中实现的 `xavier initialization`,对应于 PyTorch中的 `kaiming_uniform_` + + ```python + >>> import torch.nn as nn + >>> from mmcv.cnn import caffe2_xavier_init + >>> conv1 = nn.Conv2d(3, 3, 1) + >>> # caffe2_xavier_init(module, bias=0) + >>> caffe2_xavier_init(conv1) + ``` + +- bias_init_with_prob + + 根据给定的概率初始化 `conv/fc`, 这在 [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf) 提出。 + + ```python + >>> from mmcv.cnn import bias_init_with_prob + >>> # bias_init_with_prob is proposed in Focal Loss + >>> bias = bias_init_with_prob(0.01) + >>> bias + -4.59511985013459 + ``` + +#### Initializers and configs + +在初始化方法的基础上,我们定义了相应的初始化类,并将它们注册到 `INITIALIZERS` 中,这样我们就可以使用 `config` 配置来初始化模型了。 + +我们提供以下初始化类: + +- ConstantInit +- XavierInit +- NormalInit +- UniformInit +- KaimingInit +- Caffe2XavierInit +- PretrainedInit + +接下来详细介绍 `initialize` 的使用方法 + +1. 通过关键字 `layer` 来初始化模型 + + 如果我们只定义了关键字 `layer` ,那么只初始化 `layer` 中包含的层。 + + 注意: 关键字 `layer` 支持的模块是带有 weights 和 bias 属性的 PyTorch 模块,所以不支持 `MultiheadAttention layer` + +- 定义关键字 `layer` 列表并使用相同相同配置初始化模块 + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1, 2) + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) + # 使用相同的配置初始化整个模块 + initialize(model, init_cfg) + # model.feat.weight + # Parameter containing: + # tensor([[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]], requires_grad=True) + ``` + +- 定义关键字 `layer` 用于初始化不同配置的层 + + ```python + import torch.nn as nn + from mmcv.cnn.utils import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Linear(1,2) + + model = FooNet() + init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Conv2d', val=2), + dict(type='Constant', layer='Linear', val=3)] + # nn.Conv1d 使用 dict(type='Constant', val=1) 初始化 + # nn.Conv2d 使用 dict(type='Constant', val=2) 初始化 + # nn.Linear 使用 dict(type='Constant', val=3) 初始化 + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +2. 
定义关键字`override`初始化模型 + +- 当用属性名初始化某个特定部分时, 我们可以使用关键字 `override`, 关键字 `override` 对应的Value会替代init_cfg中相应的值 + + ```python + import torch.nn as nn + from mmcv.cnn import initialize + + class FooNet(nn.Module): + def __init__(self): + super().__init__() + self.feat = nn.Conv1d(3, 1, 3) + self.reg = nn.Conv2d(3, 3, 3) + self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + + # 如果我们想将模型的权重初始化为 1,将偏差初始化为 2 + # 但希望 `reg` 中的权重为 3,偏差为 4,则我们可以使用关键字override + + model = FooNet() + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, + override=dict(type='Constant', name='reg', val=3, bias=4)) + # 使用 dict(type='Constant', val=1, bias=2)来初始化 self.feat and self.cls + # 使用dict(type='Constant', val=3, bias=4)来初始化‘reg’模块。 + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]], + # ..., + # [[3., 3., 3.], + # [3., 3., 3.], + # [3., 3., 3.]]]], requires_grad=True) + ``` + +- 如果 init_cfg 中的关键字`layer`为None,则只初始化在关键字override中的子模块,并且省略override中的 type 和其他参数 + + ```python + model = FooNet() + init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) + # self.feat 和 self.cls 使用pyTorch默认的初始化 + # 将使用 dict(type='Constant', val=1, bias=2) 初始化名为 'reg' 的模块 + initialize(model, init_cfg) + # model.reg.weight + # Parameter containing: + # tensor([[[[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]], + # ..., + # [[1., 1., 1.], + # [1., 1., 1.], + # [1., 1., 1.]]]], requires_grad=True) + ``` + +- 如果我们没有定义关键字`layer`或`override` , 将不会初始化任何东西 + +- 关键字`override`的无效用法 + + ```python + # 没有重写任何子模块 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(type='Constant', val=3, bias=4)) + + # 没有指定type,即便有其他参数,也是无效的。 + init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], + val=1, bias=2, + override=dict(name='reg', val=3, bias=4)) + ``` + +3. 用预训练模型初始化 + + ```python + import torch.nn as nn + import torchvision.models as models + from mmcv.cnn import initialize + + # 使用预训练模型来初始化 + model = models.resnet50() + # model.conv1.weight + # Parameter containing: + # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, + # -1.8077e-03, 3.0338e-03], + # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, + # 1.1655e-02, 4.5889e-03], + # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, + # 1.7314e-02, -9.9998e-03], + # ..., + + init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + initialize(model, init_cfg) + # model.conv1.weight + # Parameter containing: + # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, + # -4.3034e-02, -7.0755e-02], + # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, + # -2.0912e-02, -3.8517e-02], + # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, + # 6.2641e-02, 5.1977e-02], + # ..., + + # 使用关键字'prefix'用预训练模型的特定部分来初始化子模块权重 + model = models.resnet50() + url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + 'retinanet_r50_fpn_1x_coco/'\ + 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + initialize(model, init_cfg) + ``` + +4. 
初始化继承自BaseModule、Sequential、ModuleList、ModuleDict的模型 + + `BaseModule` 继承自 `torch.nn.Module`, 它们之间唯一的不同是 `BaseModule` 实现了 `init_weight` + + `Sequential` 继承自 `BaseModule` 和 `torch.nn.Sequential` + + `ModuleList` 继承自 `BaseModule` 和 `torch.nn.ModuleList` + + `ModuleDict` 继承自 `BaseModule` 和 `torch.nn.ModuleDict` + + ```python + import torch.nn as nn + from mmcv.runner import BaseModule, Sequential, ModuleList, ModuleDict + + class FooConv1d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv1d = nn.Conv1d(4, 1, 4) + + def forward(self, x): + return self.conv1d(x) + + class FooConv2d(BaseModule): + + def __init__(self, init_cfg=None): + super().__init__(init_cfg) + self.conv2d = nn.Conv2d(3, 1, 3) + + def forward(self, x): + return self.conv2d(x) + + # BaseModule + init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + model = FooConv1d(init_cfg) + model.init_weights() + # model.conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + + # Sequential + init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) + init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + seq_model = Sequential(model1, model2) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + seq_model = Sequential(model1, model2, init_cfg=init_cfg) + seq_model.init_weights() + # seq_model[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # seq_model[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleList + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modellist = ModuleList([model1, model2]) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
+ modellist = ModuleList([model1, model2], init_cfg=init_cfg) + modellist.init_weights() + # modellist[0].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modellist[1].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # ModuleDict + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + modeldict = ModuleDict(dict(model1=model1, model2=model2)) + modeldict.init_weights() + # modeldict['model1'].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modeldict['model2'].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + + # inner init_cfg has higher priority + model1 = FooConv1d(init_cfg1) + model2 = FooConv2d(init_cfg2) + init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) + modeldict = ModuleDict(dict(model1=model1, model2=model2), init_cfg=init_cfg) + modeldict.init_weights() + # modeldict['model1'].conv1d.weight + # Parameter containing: + # tensor([[[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]], requires_grad=True) + # modeldict['model2'].conv2d.weight + # Parameter containing: + # tensor([[[[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]], + # ..., + # [[2., 2., 2.], + # [2., 2., 2.], + # [2., 2., 2.]]]], requires_grad=True) + ``` + +### Model Zoo + +除了`torchvision`的预训练模型,我们还提供以下 CNN 的预训练模型: + +- VGG Caffe +- ResNet Caffe +- ResNeXt +- ResNet with Group Normalization +- ResNet with Group Normalization and Weight Standardization +- HRNetV2 +- Res2Net +- RegNet + +#### Model URLs in JSON + +MMCV中的Model Zoo Link 由 JSON 文件管理。 json 文件由模型名称及其url或path的键值对组成,一个json文件可能类似于: + +```json +{ + "model_a": "https://example.com/models/model_a_9e5bac.pth", + "model_b": "pretrain/model_b_ab3ef2c.pth" +} +``` + +可以在[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json)找到托管在 OpenMMLab AWS 上的预训练模型的默认链接。 + +你可以通过将 `open-mmlab.json` 放在 `MMCV_HOME`下来覆盖默认链接,如果在环境中找不到`MMCV_HOME`,则默认使用 `~/.cache/mmcv`。当然你也可以使用命令 `export MMCV_HOME=/your/path`来设置自己的路径。 + +外部的json文件将被合并为默认文件,如果相同的键出现在外部`json`和默认`json`中,则将使用外部`json`。 + +#### Load Checkpoint + +`mmcv.load_checkpoint()`的参数`filename`支持以下类型: + +- filepath: `checkpoint`路径 +- `http://xxx` and `https://xxx`: 下载checkpoint的链接,文件名中必需包含`SHA256`后缀 +- `torchvision://xxx`: `torchvision.models`中的模型链接,更多细节参考 [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) +- `open-mmlab://xxx`: 默认和其他 json 文件中提供的模型链接或文件路径 diff --git a/docs_zh_CN/understand_mmcv/config.md b/docs/zh_cn/understand_mmcv/config.md similarity index 99% rename from docs_zh_CN/understand_mmcv/config.md rename to docs/zh_cn/understand_mmcv/config.md index c6da308833ebb3e1588d7dfb5ba66cc90fb5ee42..52d7ab37b4a375cf67a08fdb1ae7add4672c2d44 100644 --- a/docs_zh_CN/understand_mmcv/config.md +++ b/docs/zh_cn/understand_mmcv/config.md @@ -40,6 +40,7 @@ d = 'string' 这里是一个带有预定义变量的配置文件的例子。 `config_a.py` + ```python a = 1 b = './work_dir/{{ fileBasenameNoExtension }}' @@ -65,6 +66,7 @@ c = '{{ fileExtname }}' a = 1 b = dict(b1=[0, 1, 2], b2=None) ``` + ### 不含重复键值对从基类配置文件继承 `config_b.py` @@ -83,6 +85,7 @@ d = 
'string' ... c=(1, 2), ... d='string') ``` + 在`config_b.py`里的新字段与在`config_a.py`里的旧字段拼接 ### 含重复键值对从基类配置文件继承 diff --git a/docs_zh_CN/understand_mmcv/data_process.md b/docs/zh_cn/understand_mmcv/data_process.md similarity index 96% rename from docs_zh_CN/understand_mmcv/data_process.md rename to docs/zh_cn/understand_mmcv/data_process.md index 0885fe03353738d42b4503c9dddf4ec70883c5bb..98f00f1ed6a33f3dcbdb662008621474bb45b7ef 100644 --- a/docs_zh_CN/understand_mmcv/data_process.md +++ b/docs/zh_cn/understand_mmcv/data_process.md @@ -252,9 +252,9 @@ flow = mmcv.flowread('compressed.jpg', quantize=True, concat_axis=1) mmcv.flowshow(flow) ``` -![progress](../../docs/_static/flow_visualization.png) +![progress](../../en/_static/flow_visualization.png) -3. 流变换 +1. 流变换 ```python img1 = mmcv.imread('img1.jpg') @@ -264,12 +264,12 @@ warpped_img2 = mmcv.flow_warp(img1, flow) img1 (左) and img2 (右) -![raw images](../../docs/_static/flow_raw_images.png) +![raw images](../../en/_static/flow_raw_images.png) 光流 (img2 -> img1) -![optical flow](../../docs/_static/flow_img2toimg1.png) +![optical flow](../../en/_static/flow_img2toimg1.png) 变换后的图像和真实图像的差异 -![warpped image](../../docs/_static/flow_warp_diff.png) +![warpped image](../../en/_static/flow_warp_diff.png) diff --git a/docs_zh_CN/understand_mmcv/io.md b/docs/zh_cn/understand_mmcv/io.md similarity index 99% rename from docs_zh_CN/understand_mmcv/io.md rename to docs/zh_cn/understand_mmcv/io.md index 0e5002f828f5489ee0447d65501de78e20d3f093..eb4fe14ba1102effa43acb906e23ffdd95ecf5c6 100644 --- a/docs_zh_CN/understand_mmcv/io.md +++ b/docs/zh_cn/understand_mmcv/io.md @@ -107,6 +107,7 @@ c d e ``` + #### 从硬盘读取 使用 `list_from_file` 读取 `a.txt` diff --git a/docs/zh_cn/understand_mmcv/ops.md b/docs/zh_cn/understand_mmcv/ops.md new file mode 100644 index 0000000000000000000000000000000000000000..82c9eb4fcabc42f2506f22cfc4b5cc5881ae939a --- /dev/null +++ b/docs/zh_cn/understand_mmcv/ops.md @@ -0,0 +1,60 @@ +## 算子 + +MMCV 提供了检测、分割等任务中常用的算子 + +| Device | CPU | CUDA | MLU | MPS | +| ---------------------------- | --- | ---- | --- | --- | +| ActiveRotatedFilter | √ | √ | | | +| AssignScoreWithK | | √ | | | +| BallQuery | | √ | | | +| BBoxOverlaps | | √ | √ | √ | +| BorderAlign | | √ | | | +| BoxIouRotated | √ | √ | | | +| CARAFE | | √ | | | +| ChamferDistance | | √ | | | +| CrissCrossAttention | | √ | | | +| ContourExpand | √ | | | | +| ConvexIoU | | √ | | | +| CornerPool | | √ | | | +| Correlation | | √ | | | +| Deformable Convolution v1/v2 | √ | √ | | | +| Deformable RoIPool | | √ | | | +| DiffIoURotated | | √ | | | +| DynamicScatter | | √ | | | +| FurthestPointSample | | √ | | | +| FurthestPointSampleWithDist | | √ | | | +| FusedBiasLeakyrelu | | √ | | | +| GatherPoints | | √ | | | +| GroupPoints | | √ | | | +| Iou3d | | √ | | | +| KNN | | √ | | | +| MaskedConv | | √ | | | +| MergeCells | | √ | | | +| MinAreaPolygon | | √ | | | +| ModulatedDeformConv2d | √ | √ | | | +| MultiScaleDeformableAttn | | √ | | | +| NMS | √ | √ | √ | | +| NMSRotated | √ | √ | | | +| PixelGroup | √ | | | | +| PointsInBoxes | √ | √ | | | +| PointsInPolygons | | √ | | | +| PSAMask | √ | √ | √ | | +| RotatedFeatureAlign | √ | √ | | | +| RoIPointPool3d | | √ | | | +| RoIPool | | √ | √ | | +| RoIAlignRotated | √ | √ | √ | | +| RiRoIAlignRotated | | √ | | | +| RoIAlign | √ | √ | √ | | +| RoIAwarePool3d | | √ | | | +| SAConv2d | | √ | | | +| SigmoidFocalLoss | | √ | √ | | +| SoftmaxFocalLoss | | √ | | | +| SoftNMS | | √ | | | +| Sparse Convolution | | √ | | | +| Synchronized BatchNorm | | √ | | | +| 
ThreeInterpolate | | √ | | |
+| ThreeNN | | √ | | |
+| TINShift | | √ | √ | |
+| UpFirDn2d | | √ | | |
+| Voxelization | √ | √ | | |
+| PrRoIPool | | √ | | |
diff --git a/docs_zh_CN/understand_mmcv/registry.md b/docs/zh_cn/understand_mmcv/registry.md
similarity index 76%
rename from docs_zh_CN/understand_mmcv/registry.md
rename to docs/zh_cn/understand_mmcv/registry.md
index 3afd0ab66e8e9787280ce54cdfb807e2acf60827..325baa41db36f13fc627ccb57759fb5210e696f9 100644
--- a/docs_zh_CN/understand_mmcv/registry.md
+++ b/docs/zh_cn/understand_mmcv/registry.md
@@ -1,11 +1,17 @@
## 注册器
+
MMCV 使用 [注册器](https://github.com/open-mmlab/mmcv/blob/master/mmcv/utils/registry.py) 来管理具有相似功能的不同模块, 例如, 检测器中的主干网络、头部、和模型颈部。
在 OpenMMLab 家族中的绝大部分开源项目使用注册器去管理数据集和模型的模块,例如 [MMDetection](https://github.com/open-mmlab/mmdetection), [MMDetection3D](https://github.com/open-mmlab/mmdetection3d), [MMClassification](https://github.com/open-mmlab/mmclassification), [MMEditing](https://github.com/open-mmlab/mmediting) 等。

+```{note}
+从 v1.5.1 版本开始支持注册函数的功能。
+```
+
### 什么是注册器
-在MMCV中,注册器可以看作类到字符串的映射。
-一个注册器中的类通常有相似的接口,但是可以实现不同的算法或支持不同的数据集。
-借助注册器,用户可以通过使用相应的字符串查找并实例化该类,并根据他们的需要实例化对应模块。
+
+在MMCV中,注册器可以看作类或函数到字符串的映射。
+一个注册器中的类或函数通常有相似的接口,但是可以实现不同的算法或支持不同的数据集。
+借助注册器,用户可以通过使用相应的字符串查找类或函数,并根据他们的需要实例化对应模块或调用函数获取结果。
一个典型的案例是,OpenMMLab 中的大部分开源项目的配置系统,这些系统通过配置文件来使用注册器创建钩子、执行器、模型和数据集。
可以在[这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.Registry)找到注册器接口使用文档。
@@ -15,7 +21,7 @@ MMCV 使用 [注册器](https://github.com/open-mmlab/mmcv/blob/master/mmcv/util
2. 创建注册器
3. 使用此注册器来管理模块

-`Registry`(注册器)的参数 `build_func`(构建函数) 用来自定以如何实例化类的实例,默认使用 [这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg)实现的`build_from_cfg`。
+`Registry`(注册器)的参数 `build_func`(构建函数) 用来自定义如何实例化类的实例或如何调用函数获取结果,默认使用 [这里](https://mmcv.readthedocs.io/en/latest/api.html?highlight=registry#mmcv.utils.build_from_cfg) 实现的`build_from_cfg`。

### 一个简单的例子
@@ -29,9 +35,10 @@ from mmcv.utils import Registry
CONVERTERS = Registry('converter')
```

-然后我们在包中可以实现不同的转换器(converter)。例如,在 `converters/converter1.py` 中实现 `Converter1`。
+然后我们在包中可以实现不同的转换器(converter),其可以为类或函数。例如,在 `converters/converter1.py` 中实现 `Converter1`,在 `converters/converter2.py` 中实现 `converter2`。

```python
+# converter1.py
from .builder import CONVERTERS

# 使用注册器管理模块
@@ -41,19 +48,39 @@ class Converter1(object):
        self.a = a
        self.b = b
```
-使用注册器管理模块的关键步骤是,将实现的模块注册到注册表 `CONVERTERS` 中。通过 `@CONVERTERS.register_module()` 装饰所实现的模块,字符串和类之间的映射就可以由 `CONVERTERS` 构建和维护,如下所示:
-通过这种方式,就可以通过 `CONVERTERS` 建立字符串与类之间的映射,如下所示:
+```python
+# converter2.py
+from .builder import CONVERTERS
+from .converter1 import Converter1
+
+# 使用注册器管理模块
+@CONVERTERS.register_module()
+def converter2(a, b):
+    return Converter1(a, b)
+```
+
+使用注册器管理模块的关键步骤是,将实现的模块注册到注册表 `CONVERTERS` 中。通过 `@CONVERTERS.register_module()` 装饰所实现的模块,字符串到类或函数之间的映射就可以由 `CONVERTERS` 构建和维护,如下所示:
+
+通过这种方式,就可以通过 `CONVERTERS` 建立字符串与类或函数之间的映射,如下所示:

```python
'Converter1' -> <class 'Converter1'>
+'converter2' -> <function converter2>
+```
+
+```{note}
+只有模块所在的文件被导入时,注册机制才会被触发,所以您需要在某处导入该文件。更多详情请查看 https://github.com/open-mmlab/mmdetection/issues/5974。
```

如果模块被成功注册了,你可以通过配置文件使用这个转换器(converter),如下所示:

```python
-converter_cfg = dict(type='Converter1', a=a_value, b=b_value)
-converter = CONVERTERS.build(converter_cfg)
+converter1_cfg = dict(type='Converter1', a=a_value, b=b_value)
+converter2_cfg = dict(type='converter2', a=a_value, b=b_value)
+converter1 = CONVERTERS.build(converter1_cfg)
+# returns the calling result
+result = CONVERTERS.build(converter2_cfg)
```

### 自定义构建函数
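下面给出一个自定义构建函数的简化示意(仅作示例草稿,假设沿用上文的 `CONVERTERS` 注册器,并假设配置中的类型键仍为 `type`;文档中省略的原始 `build_converter` 实现可能与此不同):

```python
# 一个简化的自定义构建函数示意(示例假设,非 MMCV 源码的原始实现)
from mmcv.utils import Registry


def build_converter(cfg, registry, *args, **kwargs):
    # 复制配置,取出 type 字段,并从注册器中查找对应的类或函数
    cfg_ = cfg.copy()
    converter_type = cfg_.pop('type')
    converter_cls = registry.get(converter_type)
    # 用剩余的配置字段实例化类(或调用函数)并返回结果
    return converter_cls(*args, **kwargs, **cfg_)


# 创建注册器时通过 build_func 传入自定义的构建逻辑
CONVERTERS = Registry('converter', build_func=build_converter)
```

之后仍然可以像前面的例子一样,通过 `CONVERTERS.build(dict(type='Converter1', a=a_value, b=b_value))` 构建对应的模块。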
@@ -84,7 +111,7 @@ CONVERTERS = Registry('converter', build_func=build_converter) 该功能类似于默认的`build_from_cfg`。在大多数情况下,默认就足够了。 ``` -`build_model_from_cfg`也实现了在`nn.Sequentail`中构建PyTorch模块,你可以直接使用它们。 +`build_model_from_cfg`也实现了在`nn.Sequential`中构建PyTorch模块,你可以直接使用它们。 ### 注册器层结构 diff --git a/docs_zh_CN/understand_mmcv/runner.md b/docs/zh_cn/understand_mmcv/runner.md similarity index 88% rename from docs_zh_CN/understand_mmcv/runner.md rename to docs/zh_cn/understand_mmcv/runner.md index 203a5dcacfd709772dce8c411a25bb8a623e0dd7..7098eb977f998ed67fc2a6fc66b0d436c47f3d75 100644 --- a/docs_zh_CN/understand_mmcv/runner.md +++ b/docs/zh_cn/understand_mmcv/runner.md @@ -8,7 +8,7 @@ ### EpochBasedRunner -顾名思义,`EpochBasedRunner` 是指以 epoch 为周期的工作流,例如设置 workflow = [('train', 2), ('val', 1)] 表示循环迭代地训练 2 个 epoch,然后验证 1 个 epoch。MMDetection 目标检测框架默认采用的是 `EpochBasedRunner`。 +顾名思义,`EpochBasedRunner` 是指以 epoch 为周期的工作流,例如设置 workflow = \[('train', 2), ('val', 1)\] 表示循环迭代地训练 2 个 epoch,然后验证 1 个 epoch。MMDetection 目标检测框架默认采用的是 `EpochBasedRunner`。 其抽象逻辑如下所示: @@ -25,6 +25,7 @@ while curr_epoch < max_epochs: for _ in range(epochs): epoch_runner(data_loaders[i], **kwargs) ``` + 目前支持训练和验证两个工作流,以训练函数为例,其抽象逻辑是: ```python @@ -40,7 +41,8 @@ def train(self, data_loader, **kwargs): ``` ### IterBasedRunner -不同于 `EpochBasedRunner`,`IterBasedRunner` 是指以 iter 为周期的工作流,例如设置 workflow = [('train', 2), ('val', 1)] 表示循环迭代的训练 2 个 iter,然后验证 1 个 iter,MMSegmentation 语义分割框架默认采用的是 `EpochBasedRunner`。 + +不同于 `EpochBasedRunner`,`IterBasedRunner` 是指以 iter 为周期的工作流,例如设置 workflow = \[('train', 2), ('val', 1)\] 表示循环迭代的训练 2 个 iter,然后验证 1 个 iter,MMSegmentation 语义分割框架默认采用的是 `IterBasedRunner`。 其抽象逻辑如下所示: @@ -59,6 +61,7 @@ while curr_iter < max_iters: for _ in range(iters): iter_runner(iter_loaders[i], **kwargs) ``` + 目前支持训练和验证两个工作流,以验证函数为例,其抽象逻辑是: ```python @@ -75,6 +78,7 @@ def val(self, data_loader, **kwargs): 除了上述基础功能外,`EpochBasedRunner` 和 `IterBasedRunner` 还提供了 resume 、 save_checkpoint 和注册 hook 功能。 ### 一个简单例子 + 以最常用的分类任务为例详细说明 `runner` 的使用方法。 开启任何一个训练任务,都需要包括如下步骤: **(1) dataloader、model 和优化器等类初始化** @@ -148,8 +152,8 @@ runner.run(data_loaders, cfg.workflow) 关于 workflow 设置,以 `EpochBasedRunner` 为例,详情如下: -- 假设只想运行训练工作流,则可以设置 workflow = [('train', 1)],表示只进行迭代训练 -- 假设想运行训练和验证工作流,则可以设置 workflow = [('train', 3), ('val', 1)],表示先训练 3 个 epoch ,然后切换到 val 工作流,运行 1 个 epoch,然后循环,直到训练 epoch 次数达到指定值 -- 工作流设置还自由定制,例如你可以先验证再训练 workflow = [('val', 1), ('train', 1)] +- 假设只想运行训练工作流,则可以设置 workflow = \[('train', 1)\],表示只进行迭代训练 +- 假设想运行训练和验证工作流,则可以设置 workflow = \[('train', 3), ('val', 1)\],表示先训练 3 个 epoch ,然后切换到 val 工作流,运行 1 个 epoch,然后循环,直到训练 epoch 次数达到指定值 +- 工作流设置还自由定制,例如你可以先验证再训练 workflow = \[('val', 1), ('train', 1)\] 上述代码都已经封装到了各个代码库的 train.py 中,用户只需要设置相应的配置即可,上述流程会自动运行。 diff --git a/docs_zh_CN/understand_mmcv/utils.md b/docs/zh_cn/understand_mmcv/utils.md similarity index 93% rename from docs_zh_CN/understand_mmcv/utils.md rename to docs/zh_cn/understand_mmcv/utils.md index 746c560039759df3e6f76ae665e63812ed3c9ed6..c02e5203a4cde69e9f9f332b047bfea25c151bb4 100644 --- a/docs_zh_CN/understand_mmcv/utils.md +++ b/docs/zh_cn/understand_mmcv/utils.md @@ -17,7 +17,7 @@ mmcv.track_progress(func, tasks) ``` 效果如下 -![progress](../../docs/_static/progress.*) +![progress](../../en/_static/progress.*) 如果你想可视化多进程任务的进度,你可以使用 `track_parallel_progress` 。 @@ -25,7 +25,7 @@ mmcv.track_progress(func, tasks) mmcv.track_parallel_progress(func, tasks, 8) # 8 workers ``` -![progress](../../docs/_static/parallel_progress.*) +![progress](../../_static/parallel_progress.*) 如果你想要迭代或枚举数据列表并可视化进度,你可以使用 
`track_iter_progress` 。 @@ -58,7 +58,6 @@ with mmcv.Timer(): 你也可以使用 `since_start()` 和 `since_last_check()` 。前者返回计时器启动后的运行时长,后者返回最近一次查看计时器后的运行时长。 - ```python timer = mmcv.Timer() # code block 1 here diff --git a/docs_zh_CN/understand_mmcv/visualization.md b/docs/zh_cn/understand_mmcv/visualization.md similarity index 100% rename from docs_zh_CN/understand_mmcv/visualization.md rename to docs/zh_cn/understand_mmcv/visualization.md diff --git a/docs_zh_CN/community/pr.md b/docs_zh_CN/community/pr.md deleted file mode 100644 index 219e01dd747827adedddd922310624f97ff10672..0000000000000000000000000000000000000000 --- a/docs_zh_CN/community/pr.md +++ /dev/null @@ -1,90 +0,0 @@ -## 拉取请求 - -### 什么是拉取请求? - -`拉取请求` (Pull Request), [GitHub 官方文档](https://docs.github.com/en/github/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/about-pull-requests)定义如下。 - ->拉取请求是一种通知机制。你修改了他人的代码,将你的修改通知原来作者,希望他合并你的修改。 - -### 基本的工作流: - -1. 获取最新的代码库 -2. 从主分支创建最新的分支进行开发 -3. 提交修改 -4. 推送你的修改并创建一个`拉取请求` -5. 讨论、审核代码 -6. 将开发分支合并到主分支 - -### 具体步骤 - -1. 获取最新的代码库 - + 当你第一次提 PR 时 - - 复刻 OpenMMLab 原代码库,点击 GitHub 页面右上角的 **Fork** 按钮即可 - ![avatar](../../docs/_static/community/1.png) - - - 克隆复刻的代码库到本地 - ```bash - git clone git@github.com:XXX/mmcv.git - ``` - - - 添加原代码库为上游代码库 - ```bash - git remote add upstream git@github.com:open-mmlab/mmcv - ``` - + 从第二个 PR 起 - - 检出本地代码库的主分支,然后从最新的原代码库的主分支拉取更新 - ```bash - git checkout master - git pull upstream master - ``` - -2. 从主分支创建一个新的开发分支 - ```bash - git checkout -b branchname - ``` - 注意:为了保证提交历史清晰可读,我们强烈推荐您先检出主分支 (master),再创建新的分支。 - -3. 提交你的修改 - ```bash - # coding - git add [files] - git commit -m 'messages' - ``` - -4. 推送你的修改到复刻的代码库,并创建一个`拉取请求` - + 推送当前分支到远端复刻的代码库 - ```bash - git push origin branchname - ``` - - + 创建一个`拉取请求` - ![avatar](../../docs/_static/community/2.png) - - + 修改`拉取请求`信息模板,描述修改原因和修改内容。还可以在 PR 描述中,手动关联到相关的`议题` (issue),(更多细节,请参考[官方文档](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue))。 - -5. 讨论并评审你的代码 - + 创建`拉取请求`时,可以关联给相关人员进行评审 - ![avatar](../../docs/_static/community/3.png) - - + 根据评审人员的意见修改代码,并推送修改 - -6. `拉取请求`合并之后删除该分支 -```bash -git branch -d branchname # delete local branch -git push origin --delete branchname # delete remote branch -``` - -### PR 规范 - -1. 使用 [pre-commit hook](https://pre-commit.com),尽量减少代码风格相关问题 -2. 一个PR对应一个短期分支 -3. 粒度要细,一个PR只做一件事情,避免超大的PR - >- Bad:实现Faster R-CNN - >- Acceptable:给 Faster R-CNN 添加一个 box head - >- Good:给 box head 增加一个参数来支持自定义的 conv 层数 -4. 每次 Commit 时需要提供清晰且有意义 commit 信息 -5. 提供清晰且有意义的`拉取请求`描述 - >- 标题写明白任务名称,一般格式:[Prefix] Short description of the pull request (Suffix) - >- prefix: 新增功能 [Feature], 修 bug [Fix], 文档相关 [Docs], 开发中 [WIP] (暂时不会被review) - >- 描述里介绍`拉取请求`的主要修改内容,结果,以及对其他部分的影响, 参考`拉取请求`模板 - >- 关联相关的`议题` (issue) 和其他`拉取请求` diff --git a/docs_zh_CN/faq.md b/docs_zh_CN/faq.md deleted file mode 100644 index e5d6395720e9e210771e10256efb926a0da5f4fa..0000000000000000000000000000000000000000 --- a/docs_zh_CN/faq.md +++ /dev/null @@ -1,37 +0,0 @@ -## 常见问题 - -在这里我们列出了用户经常遇到的问题以及对应的解决方法。如果您遇到了其他常见的问题,并且知道可以帮到大家的解决办法, -欢迎随时丰富这个列表。 - -- MMCV 和 MMDetection 的兼容性问题;"ConvWS is already registered in conv layer" - - 请按照上述说明为您的 MMDetection 版本安装正确版本的 MMCV。 - -- "No module named 'mmcv.ops'"; "No module named 'mmcv._ext'" - - 1. 使用 `pip uninstall mmcv` 卸载您环境中的 mmcv - 2. 按照上述说明安装 mmcv-full - -- "invalid device function" 或者 "no kernel image is available for execution" - - 1. 检查 GPU 的 CUDA 计算能力 - 2. 
运行 `python mmdet/utils/collect_env.py` 来检查 PyTorch、torchvision 和 MMCV 是否是针对正确的 GPU 架构构建的 - 您可能需要去设置 `TORCH_CUDA_ARCH_LIST` 来重新安装 MMCV - 兼容性问题的可能会出现在使用旧版的 GPUs,如:colab 上的 Tesla K80 (3.7) - 3. 检查运行环境是否和 mmcv/mmdet 编译时的环境相同。例如,您可能使用 CUDA 10.0 编译 mmcv,但在 CUDA 9.0 的环境中运行它 - -- "undefined symbol" 或者 "cannot open xxx.so"。 - - 1. 如果符号和 CUDA/C++ 相关(例如:libcudart.so 或者 GLIBCXX),请检查 CUDA/GCC 运行时的版本是否和编译 mmcv 的一致 - 2. 如果符号和 PyTorch 相关(例如:符号包含 caffe、aten 和 TH),请检查 PyTorch 运行时的版本是否和编译 mmcv 的一致 - 3. 运行 `python mmdet/utils/collect_env.py` 以检查 PyTorch、torchvision 和 MMCV 构建和运行的环境是否相同 - -- "RuntimeError: CUDA error: invalid configuration argument"。 - - 这个错误可能是由于您的 GPU 性能不佳造成的。尝试降低[THREADS_PER_BLOCK](https://github.com/open-mmlab/mmcv/blob/cac22f8cf5a904477e3b5461b1cc36856c2793da/mmcv/ops/csrc/common_cuda_helper.hpp#L10) - 的值并重新编译 mmcv。 - -- "RuntimeError: nms is not compiled with GPU support"。 - - 这个错误是由于您的 CUDA 环境没有正确安装。 - 您可以尝试重新安装您的 CUDA 环境,然后删除 mmcv/build 文件夹并重新编译 mmcv。 diff --git a/docs_zh_CN/understand_mmcv/cnn.md b/docs_zh_CN/understand_mmcv/cnn.md deleted file mode 100644 index 9027cf38dc48cbe342a48c3f4e658d629d2e0974..0000000000000000000000000000000000000000 --- a/docs_zh_CN/understand_mmcv/cnn.md +++ /dev/null @@ -1,525 +0,0 @@ -## 卷积神经网络 - -我们为卷积神经网络提供了一些构建模块,包括层构建、模块组件和权重初始化。 - -### 网络层的构建 - -在运行实验时,我们可能需要尝试同属一种类型但不同配置的层,但又不希望每次都修改代码。于是我们提供一些层构建方法,可以从字典构建层,字典可以在配置文件中配置,也可以通过命令行参数指定。 - -#### 用法 - -一个简单的例子: - -```python -cfg = dict(type='Conv3d') -layer = build_conv_layer(cfg, in_channels=3, out_channels=8, kernel_size=3) -``` - -- `build_conv_layer`: 支持的类型包括 Conv1d、Conv2d、Conv3d、Conv (Conv是Conv2d的别名) -- `build_norm_layer`: 支持的类型包括 BN1d、BN2d、BN3d、BN (alias for BN2d)、SyncBN、GN、LN、IN1d、IN2d、IN3d、IN(IN是IN2d的别名) -- `build_activation_layer`:支持的类型包括 ReLU、LeakyReLU、PReLU、RReLU、ReLU6、ELU、Sigmoid、Tanh、GELU -- `build_upsample_layer`: 支持的类型包括 nearest、bilinear、deconv、pixel_shuffle -- `build_padding_layer`: 支持的类型包括 zero、reflect、replicate - -#### 拓展 - -我们还允许自定义层和算子来扩展构建方法。 - -1. 编写和注册自己的模块: - - ```python - from mmcv.cnn import UPSAMPLE_LAYERS - - @UPSAMPLE_LAYERS.register_module() - class MyUpsample: - - def __init__(self, scale_factor): - pass - - def forward(self, x): - pass - ``` - -2. 
在某处导入 `MyUpsample` (例如 `__init__.py` )然后使用它: - - ```python - cfg = dict(type='MyUpsample', scale_factor=2) - layer = build_upsample_layer(cfg) - ``` - -### 模块组件 - -我们还提供了常用的模块组件,以方便网络构建。 -卷积组件 `ConvModule` 由 convolution、normalization以及activation layers 组成,更多细节请参考 [ConvModule api](api.html#mmcv.cnn.ConvModule)。 - -```python -# conv + bn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='BN')) -# conv + gn + relu -conv = ConvModule(3, 8, 2, norm_cfg=dict(type='GN', num_groups=2)) -# conv + relu -conv = ConvModule(3, 8, 2) -# conv -conv = ConvModule(3, 8, 2, act_cfg=None) -# conv + leaky relu -conv = ConvModule(3, 8, 3, padding=1, act_cfg=dict(type='LeakyReLU')) -# bn + conv + relu -conv = ConvModule( - 3, 8, 2, norm_cfg=dict(type='BN'), order=('norm', 'conv', 'act')) -``` - -### Weight initialization - -> 实现细节可以在 [mmcv/cnn/utils/weight_init.py](../../mmcv/cnn/utils/weight_init.py)中找到 - -在训练过程中,适当的初始化策略有利于加快训练速度或者获得更高的性能。 在MMCV中,我们提供了一些常用的方法来初始化模块,比如 `nn.Conv2d` 模块。当然,我们也提供了一些高级API,可用于初始化包含一个或多个模块的模型。 - -#### Initialization functions - -以函数的方式初始化 `nn.Module` ,例如 `nn.Conv2d` 、 `nn.Linear` 等。 - -我们提供以下初始化方法, - -- constant_init - - 使用给定常量值初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import constant_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # constant_init(module, val, bias=0) - >>> constant_init(conv1, 1, 0) - >>> conv1.weight - ``` - -- xavier_init - - 按照 [Understanding the difficulty of training deep feedforward neural networks - Glorot, X. & Bengio, Y. (2010)](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf) 描述的方法初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # xavier_init(module, gain=1, bias=0, distribution='normal') - >>> xavier_init(conv1, distribution='normal') - ``` - -- normal_init - - 使用正态分布(高斯分布)初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import normal_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # normal_init(module, mean=0, std=1, bias=0) - >>> normal_init(conv1, std=0.01, bias=0) - ``` - -- uniform_init - - 使用均匀分布初始化模型参数 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import uniform_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # uniform_init(module, a=0, b=1, bias=0) - >>> uniform_init(conv1, a=0, b=1) - ``` - -- kaiming_init - - 按照 [Delving deep into rectifiers: Surpassing human-level performance on ImageNet classification - He, K. et al. 
(2015)](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf) 描述的方法来初始化模型参数。 - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import kaiming_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # kaiming_init(module, a=0, mode='fan_out', nonlinearity='relu', bias=0, distribution='normal') - >>> kaiming_init(conv1) - ``` - -- caffe2_xavier_init - - caffe2中实现的 `xavier initialization`,对应于 PyTorch中的 `kaiming_uniform_` - - ```python - >>> import torch.nn as nn - >>> from mmcv.cnn import caffe2_xavier_init - >>> conv1 = nn.Conv2d(3, 3, 1) - >>> # caffe2_xavier_init(module, bias=0) - >>> caffe2_xavier_init(conv1) - ``` - -- bias_init_with_prob - - 根据给定的概率初始化 `conv/fc`, 这在 [Focal Loss for Dense Object Detection](https://arxiv.org/pdf/1708.02002.pdf) 提出。 - - ```python - >>> from mmcv.cnn import bias_init_with_prob - >>> # bias_init_with_prob is proposed in Focal Loss - >>> bias = bias_init_with_prob(0.01) - >>> bias - -4.59511985013459 - ``` - -#### Initializers and configs - -在初始化方法的基础上,我们定义了相应的初始化类,并将它们注册到 `INITIALIZERS` 中,这样我们就可以使用 `config` 配置来初始化模型了。 - -我们提供以下初始化类: - -- ConstantInit -- XavierInit -- NormalInit -- UniformInit -- KaimingInit -- Caffe2XavierInit -- PretrainedInit - -接下来详细介绍 `initialize` 的使用方法 - -1. 通过关键字 `layer` 来初始化模型 - - 如果我们只定义了关键字 `layer` ,那么只初始化 `layer` 中包含的层。 - - 注意: 关键字 `layer` 支持的模块是带有 weights 和 bias 属性的 PyTorch 模块,所以不支持 `MultiheadAttention layer` - -- 定义关键字 `layer` 列表并使用相同相同配置初始化模块 - - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1, 2) - - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d', 'Linear'], val=1) - # 使用相同的配置初始化整个模块 - initialize(model, init_cfg) - # model.feat.weight - # Parameter containing: - # tensor([[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]], requires_grad=True) - ``` - -- 定义关键字 `layer` 用于初始化不同配置的层 - - ```python - import torch.nn as nn - from mmcv.cnn.utils import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Linear(1,2) - - model = FooNet() - init_cfg = [dict(type='Constant', layer='Conv1d', val=1), - dict(type='Constant', layer='Conv2d', val=2), - dict(type='Constant', layer='Linear', val=3)] - # nn.Conv1d 使用 dict(type='Constant', val=1) 初始化 - # nn.Conv2d 使用 dict(type='Constant', val=2) 初始化 - # nn.Linear 使用 dict(type='Constant', val=3) 初始化 - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ``` - -2. 
定义关键字`override`初始化模型 - -- 当用属性名初始化某个特定部分时, 我们可以使用关键字 `override`, 关键字 `override` 对应的Value会替代init_cfg中相应的值 - - ```python - import torch.nn as nn - from mmcv.cnn import initialize - - class FooNet(nn.Module): - def __init__(self): - super().__init__() - self.feat = nn.Conv1d(3, 1, 3) - self.reg = nn.Conv2d(3, 3, 3) - self.cls = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) - - # 如果我们想将模型的权重初始化为 1,将偏差初始化为 2 - # 但希望 `cls` 中的权重为 3,偏差为 4,则我们可以使用关键字override - - model = FooNet() - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], val=1, bias=2, - override=dict(type='Constant', name='reg', val=3, bias=4)) - # 使用 dict(type='Constant', val=1, bias=2)来初始化 self.feat and self.cls - # 使用dict(type='Constant', val=3, bias=4)来初始化‘reg’模块。 - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]], - # ..., - # [[3., 3., 3.], - # [3., 3., 3.], - # [3., 3., 3.]]]], requires_grad=True) - ``` - -- 如果 init_cfg 中的关键字`layer`为None,则只初始化在关键字override中的子模块,并且省略override中的 type 和其他参数 - - ```python - model = FooNet() - init_cfg = dict(type='Constant', val=1, bias=2, override=dict(name='reg')) - # self.feat 和 self.cls 使用pyTorch默认的初始化 - # 将使用 dict(type='Constant', val=1, bias=2) 初始化名为 'reg' 的模块 - initialize(model, init_cfg) - # model.reg.weight - # Parameter containing: - # tensor([[[[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]], - # ..., - # [[1., 1., 1.], - # [1., 1., 1.], - # [1., 1., 1.]]]], requires_grad=True) - ``` - -- 如果我们没有定义关键字`layer`或`override` , 将不会初始化任何东西 - -- 关键字`override`的无效用法 - - ```python - # 没有重写任何子模块 - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(type='Constant', val=3, bias=4)) - - # 没有指定type,即便有其他参数,也是无效的。 - init_cfg = dict(type='Constant', layer=['Conv1d','Conv2d'], - val=1, bias=2, - override=dict(name='reg', val=3, bias=4)) - ``` - -3. 用预训练模型初始化 - - ```python - import torch.nn as nn - import torchvision.models as models - from mmcv.cnn import initialize - - # 使用预训练模型来初始化 - model = models.resnet50() - # model.conv1.weight - # Parameter containing: - # tensor([[[[-6.7435e-03, -2.3531e-02, -9.0143e-03, ..., -2.1245e-03, - # -1.8077e-03, 3.0338e-03], - # [-1.2603e-02, -2.7831e-02, 2.3187e-02, ..., -1.5793e-02, - # 1.1655e-02, 4.5889e-03], - # [-3.7916e-02, 1.2014e-02, 1.3815e-02, ..., -4.2651e-03, - # 1.7314e-02, -9.9998e-03], - # ..., - - init_cfg = dict(type='Pretrained', - checkpoint='torchvision://resnet50') - initialize(model, init_cfg) - # model.conv1.weight - # Parameter containing: - # tensor([[[[ 1.3335e-02, 1.4664e-02, -1.5351e-02, ..., -4.0896e-02, - # -4.3034e-02, -7.0755e-02], - # [ 4.1205e-03, 5.8477e-03, 1.4948e-02, ..., 2.2060e-03, - # -2.0912e-02, -3.8517e-02], - # [ 2.2331e-02, 2.3595e-02, 1.6120e-02, ..., 1.0281e-01, - # 6.2641e-02, 5.1977e-02], - # ..., - - # 使用关键字'prefix'用预训练模型的特定部分来初始化子模块权重 - model = models.resnet50() - url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ - 'retinanet_r50_fpn_1x_coco/'\ - 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' - init_cfg = dict(type='Pretrained', - checkpoint=url, prefix='backbone.') - initialize(model, init_cfg) - ``` - -4. 
初始化继承自BaseModule、Sequential、ModuleList的模型 - - `BaseModule` 继承自 `torch.nn.Module`, 它们之间唯一的不同是 `BaseModule` 实现了 `init_weight` - - `Sequential` 继承自 `BaseModule` 和 `torch.nn.Sequential` - - `ModuleList` 继承自 `BaseModule` 和 `torch.nn.ModuleList` - - `````python - import torch.nn as nn - from mmcv.runner import BaseModule, Sequential, ModuleList - - class FooConv1d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv1d = nn.Conv1d(4, 1, 4) - - def forward(self, x): - return self.conv1d(x) - - class FooConv2d(BaseModule): - - def __init__(self, init_cfg=None): - super().__init__(init_cfg) - self.conv2d = nn.Conv2d(3, 1, 3) - - def forward(self, x): - return self.conv2d(x) - - # BaseModule - init_cfg = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - model = FooConv1d(init_cfg) - model.init_weights() - # model.conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - - # Sequential - init_cfg1 = dict(type='Constant', layer='Conv1d', val=0., bias=1.) - init_cfg2 = dict(type='Constant', layer='Conv2d', val=2., bias=3.) - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - seq_model = Sequential(model1, model2) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) - seq_model = Sequential(model1, model2, init_cfg=init_cfg) - seq_model.init_weights() - # seq_model[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # seq_model[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # ModuleList - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - modellist = ModuleList([model1, model2]) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - - # inner init_cfg has higher priority - model1 = FooConv1d(init_cfg1) - model2 = FooConv2d(init_cfg2) - init_cfg = dict(type='Constant', layer=['Conv1d', 'Conv2d'], val=4., bias=5.) 
- modellist = ModuleList([model1, model2], init_cfg=init_cfg) - modellist.init_weights() - # modellist[0].conv1d.weight - # Parameter containing: - # tensor([[[0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.], - # [0., 0., 0., 0.]]], requires_grad=True) - # modellist[1].conv2d.weight - # Parameter containing: - # tensor([[[[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]], - # ..., - # [[2., 2., 2.], - # [2., 2., 2.], - # [2., 2., 2.]]]], requires_grad=True) - ````` - -### Model Zoo - -除了`torchvision`的预训练模型,我们还提供以下 CNN 的预训练模型: - -- VGG Caffe -- ResNet Caffe -- ResNeXt -- ResNet with Group Normalization -- ResNet with Group Normalization and Weight Standardization -- HRNetV2 -- Res2Net -- RegNet - -#### Model URLs in JSON - -MMCV中的Model Zoo Link 由 JSON 文件管理。 json 文件由模型名称及其url或path的键值对组成,一个json文件可能类似于: - -```json -{ - "model_a": "https://example.com/models/model_a_9e5bac.pth", - "model_b": "pretrain/model_b_ab3ef2c.pth" -} -``` - -可以在[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/model_zoo/open_mmlab.json)找到托管在 OpenMMLab AWS 上的预训练模型的默认链接。 - -你可以通过将 `open-mmlab.json` 放在 `MMCV_HOME`下来覆盖默认链接,如果在环境中找不到`MMCV_HOME`,则默认使用 `~/.cache/mmcv`。当然你也可以使用命令 `export MMCV_HOME=/your/path`来设置自己的路径。 - -外部的json文件将被合并为默认文件,如果相同的键出现在外部`json`和默认`json`中,则将使用外部`json`。 - -#### Load Checkpoint - -`mmcv.load_checkpoint()`的参数`filename`支持以下类型: - -- filepath: `checkpoint`路径 -- `http://xxx` and `https://xxx`: 下载checkpoint的链接,文件名中必需包含`SHA256`后缀 -- `torchvision://xxx`: `torchvision.models`中的模型链接,更多细节参考 [torchvision](https://pytorch.org/docs/stable/torchvision/models.html) -- `open-mmlab://xxx`: 默认和其他 json 文件中提供的模型链接或文件路径 diff --git a/docs_zh_CN/understand_mmcv/ops.md b/docs_zh_CN/understand_mmcv/ops.md deleted file mode 100644 index a45bb14862ad0ec05d5fa4d66954ac1465bb668c..0000000000000000000000000000000000000000 --- a/docs_zh_CN/understand_mmcv/ops.md +++ /dev/null @@ -1,36 +0,0 @@ -## CUDA 算子 - -MMCV 提供了检测、分割等任务中常用的 CUDA 算子 - -- AssignScoreWithK -- BallQuery -- BBoxOverlaps -- CARAFE -- CrissCrossAttention -- ContextBlock -- CornerPool -- Deformable Convolution v1/v2 -- Deformable RoIPool -- DynamicScatter -- GatherPoints -- FurthestPointSample -- FurthestPointSampleWithDist -- GeneralizedAttention -- KNN -- MaskedConv -- NMS -- PSAMask -- RoIPointPool3d -- RoIPool -- RoIAlign -- RoIAwarePool3d -- SimpleRoIAlign -- SigmoidFocalLoss -- SoftmaxFocalLoss -- SoftNMS -- Synchronized BatchNorm -- Voxelization -- ThreeInterpolate -- ThreeNN -- Weight standardization -- Correlation diff --git a/examples/train.py b/examples/train.py index 2dbdfee40f049f55e07d7be1427fdd2da784a9f4..b08d36bf621747354d0df30bd6d787fd2c12faf1 100644 --- a/examples/train.py +++ b/examples/train.py @@ -14,7 +14,7 @@ from mmcv.utils import get_logger class Model(nn.Module): def __init__(self): - super(Model, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) diff --git a/mmcv/__init__.py b/mmcv/__init__.py index 210a2989138380559f23045b568d0fbbeb918c03..14c556acdf5832a1da569da6819a428f17adc328 100644 --- a/mmcv/__init__.py +++ b/mmcv/__init__.py @@ -13,3 +13,4 @@ from .visualization import * # - runner # - parallel # - op +# - device diff --git a/mmcv/arraymisc/quantization.py b/mmcv/arraymisc/quantization.py index 8e47a3545780cf071a1ef8195efb0b7b662c8186..6182710d51787061304cfc7304ec97d565822536 100644 --- a/mmcv/arraymisc/quantization.py +++ b/mmcv/arraymisc/quantization.py @@ -1,14 +1,20 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Union + import numpy as np -def quantize(arr, min_val, max_val, levels, dtype=np.int64): +def quantize(arr: np.ndarray, + min_val: Union[int, float], + max_val: Union[int, float], + levels: int, + dtype=np.int64) -> tuple: """Quantize an array of (-inf, inf) to [0, levels-1]. Args: arr (ndarray): Input array. - min_val (scalar): Minimum value to be clipped. - max_val (scalar): Maximum value to be clipped. + min_val (int or float): Minimum value to be clipped. + max_val (int or float): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the quantized array. @@ -29,13 +35,17 @@ def quantize(arr, min_val, max_val, levels, dtype=np.int64): return quantized_arr -def dequantize(arr, min_val, max_val, levels, dtype=np.float64): +def dequantize(arr: np.ndarray, + min_val: Union[int, float], + max_val: Union[int, float], + levels: int, + dtype=np.float64) -> tuple: """Dequantize an array. Args: arr (ndarray): Input array. - min_val (scalar): Minimum value to be clipped. - max_val (scalar): Maximum value to be clipped. + min_val (int or float): Minimum value to be clipped. + max_val (int or float): Maximum value to be clipped. levels (int): Quantization levels. dtype (np.type): The type of the dequantized array. diff --git a/mmcv/cnn/alexnet.py b/mmcv/cnn/alexnet.py index 89e36b8c7851f895d9ae7f07149f0e707456aab0..4d45d96d86bdcb52a51f095c4571b21c8421cbfa 100644 --- a/mmcv/cnn/alexnet.py +++ b/mmcv/cnn/alexnet.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import Optional +import torch import torch.nn as nn @@ -11,8 +13,8 @@ class AlexNet(nn.Module): num_classes (int): number of classes for classification. """ - def __init__(self, num_classes=-1): - super(AlexNet, self).__init__() + def __init__(self, num_classes: int = -1): + super().__init__() self.num_classes = num_classes self.features = nn.Sequential( nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), @@ -40,7 +42,7 @@ class AlexNet(nn.Module): nn.Linear(4096, num_classes), ) - def init_weights(self, pretrained=None): + def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint @@ -51,7 +53,7 @@ class AlexNet(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.features(x) if self.num_classes > 0: diff --git a/mmcv/cnn/bricks/activation.py b/mmcv/cnn/bricks/activation.py index 79f1988386cbf09a4a13e2c5a72222e22bcc6f7f..23e62722776d18b764cffe4a76e646e3103f8fb7 100644 --- a/mmcv/cnn/bricks/activation.py +++ b/mmcv/cnn/bricks/activation.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + import torch import torch.nn as nn import torch.nn.functional as F @@ -28,12 +30,12 @@ class Clamp(nn.Module): Default to 1. """ - def __init__(self, min=-1., max=1.): - super(Clamp, self).__init__() + def __init__(self, min: float = -1., max: float = 1.): + super().__init__() self.min = min self.max = max - def forward(self, x): + def forward(self, x) -> torch.Tensor: """Forward function. 
Args: @@ -67,7 +69,7 @@ class GELU(nn.Module): >>> output = m(input) """ - def forward(self, input): + def forward(self, input: torch.Tensor) -> torch.Tensor: return F.gelu(input) @@ -78,11 +80,12 @@ else: ACTIVATION_LAYERS.register_module(module=nn.GELU) -def build_activation_layer(cfg): +def build_activation_layer(cfg: Dict) -> nn.Module: """Build activation layer. Args: cfg (dict): The activation layer config, which should contain: + - type (str): Layer type. - layer args: Args needed to instantiate an activation layer. diff --git a/mmcv/cnn/bricks/context_block.py b/mmcv/cnn/bricks/context_block.py index d60fdb904c749ce3b251510dff3cc63cea70d42e..15669cab35dcdc98a95df006788f78f84b88dc44 100644 --- a/mmcv/cnn/bricks/context_block.py +++ b/mmcv/cnn/bricks/context_block.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Union + import torch from torch import nn @@ -6,7 +8,7 @@ from ..utils import constant_init, kaiming_init from .registry import PLUGIN_LAYERS -def last_zero_init(m): +def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None: if isinstance(m, nn.Sequential): constant_init(m[-1], val=0) else: @@ -34,11 +36,11 @@ class ContextBlock(nn.Module): _abbr_ = 'context_block' def __init__(self, - in_channels, - ratio, - pooling_type='att', - fusion_types=('channel_add', )): - super(ContextBlock, self).__init__() + in_channels: int, + ratio: float, + pooling_type: str = 'att', + fusion_types: tuple = ('channel_add', )): + super().__init__() assert pooling_type in ['avg', 'att'] assert isinstance(fusion_types, (list, tuple)) valid_fusion_types = ['channel_add', 'channel_mul'] @@ -82,7 +84,7 @@ class ContextBlock(nn.Module): if self.channel_mul_conv is not None: last_zero_init(self.channel_mul_conv) - def spatial_pool(self, x): + def spatial_pool(self, x: torch.Tensor) -> torch.Tensor: batch, channel, height, width = x.size() if self.pooling_type == 'att': input_x = x @@ -108,7 +110,7 @@ class ContextBlock(nn.Module): return context - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # [N, C, 1, 1] context = self.spatial_pool(x) diff --git a/mmcv/cnn/bricks/conv.py b/mmcv/cnn/bricks/conv.py index cf54491997a48ac3e7fadc4183ab7bf3e831024c..147517ef4ecdee16d26b535fa49c26a2fcbdd48e 100644 --- a/mmcv/cnn/bricks/conv.py +++ b/mmcv/cnn/bricks/conv.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional + from torch import nn from .registry import CONV_LAYERS @@ -9,7 +11,7 @@ CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d) CONV_LAYERS.register_module('Conv', module=nn.Conv2d) -def build_conv_layer(cfg, *args, **kwargs): +def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module: """Build convolution layer. Args: @@ -35,7 +37,7 @@ def build_conv_layer(cfg, *args, **kwargs): layer_type = cfg_.pop('type') if layer_type not in CONV_LAYERS: - raise KeyError(f'Unrecognized norm type {layer_type}') + raise KeyError(f'Unrecognized layer type {layer_type}') else: conv_layer = CONV_LAYERS.get(layer_type) diff --git a/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/mmcv/cnn/bricks/conv2d_adaptive_padding.py index b45e758ac6cf8dfb0382d072fe09125bc7e9b888..6a7a1d2844db097c21e5ecc55a579e0b9b95c816 100644 --- a/mmcv/cnn/bricks/conv2d_adaptive_padding.py +++ b/mmcv/cnn/bricks/conv2d_adaptive_padding.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import math +from typing import Tuple, Union +import torch from torch import nn from torch.nn import functional as F @@ -31,18 +33,18 @@ class Conv2dAdaptivePadding(nn.Conv2d): """ def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: img_h, img_w = x.size()[-2:] kernel_h, kernel_w = self.weight.size()[-2:] stride_h, stride_w = self.stride diff --git a/mmcv/cnn/bricks/conv_module.py b/mmcv/cnn/bricks/conv_module.py index 4f19f1d0cf4448179272ac53536e7ccf5fd860a3..b5d4a8c2760ea81656d3eefdad86e8dd43488447 100644 --- a/mmcv/cnn/bricks/conv_module.py +++ b/mmcv/cnn/bricks/conv_module.py @@ -1,6 +1,8 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import Dict, Optional, Tuple, Union +import torch import torch.nn as nn from mmcv.utils import _BatchNorm, _InstanceNorm @@ -68,22 +70,22 @@ class ConvModule(nn.Module): _abbr_ = 'conv_block' def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias='auto', - conv_cfg=None, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - inplace=True, - with_spectral_norm=False, - padding_mode='zeros', - order=('conv', 'norm', 'act')): - super(ConvModule, self).__init__() + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: Union[bool, str] = 'auto', + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + act_cfg: Optional[Dict] = dict(type='ReLU'), + inplace: bool = True, + with_spectral_norm: bool = False, + padding_mode: str = 'zeros', + order: tuple = ('conv', 'norm', 'act')): + super().__init__() assert conv_cfg is None or isinstance(conv_cfg, dict) assert norm_cfg is None or isinstance(norm_cfg, dict) assert act_cfg is None or isinstance(act_cfg, dict) @@ -96,7 +98,7 @@ class ConvModule(nn.Module): self.with_explicit_padding = padding_mode not in official_padding_mode self.order = order assert isinstance(self.order, tuple) and len(self.order) == 3 - assert set(order) == set(['conv', 'norm', 'act']) + assert set(order) == {'conv', 'norm', 'act'} self.with_norm = norm_cfg is not None self.with_activation = act_cfg is not None @@ -143,21 +145,22 @@ class ConvModule(nn.Module): norm_channels = out_channels else: norm_channels = in_channels - self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + self.norm_name, norm = build_norm_layer( + norm_cfg, norm_channels) # type: ignore self.add_module(self.norm_name, norm) if self.with_bias: if isinstance(norm, (_BatchNorm, _InstanceNorm)): warnings.warn( 'Unnecessary conv bias before batch/instance norm') else: - self.norm_name = None + self.norm_name = None # type: ignore # build activation layer if self.with_activation: - act_cfg_ = act_cfg.copy() + act_cfg_ = act_cfg.copy() # type: ignore # nn.Tanh has no 'inplace' argument if act_cfg_['type'] not in [ - 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' + 'Tanh', 
'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU' ]: act_cfg_.setdefault('inplace', inplace) self.activate = build_activation_layer(act_cfg_) @@ -193,7 +196,10 @@ class ConvModule(nn.Module): if self.with_norm: constant_init(self.norm, 1, bias=0) - def forward(self, x, activate=True, norm=True): + def forward(self, + x: torch.Tensor, + activate: bool = True, + norm: bool = True) -> torch.Tensor: for layer in self.order: if layer == 'conv': if self.with_explicit_padding: diff --git a/mmcv/cnn/bricks/conv_ws.py b/mmcv/cnn/bricks/conv_ws.py index a3941e27874993418b3b5708d5a7485f175ff9c8..6569f920fea942a9345ff509c7dbdb6ace1f3741 100644 --- a/mmcv/cnn/bricks/conv_ws.py +++ b/mmcv/cnn/bricks/conv_ws.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple, Union + import torch import torch.nn as nn import torch.nn.functional as F @@ -6,14 +9,14 @@ import torch.nn.functional as F from .registry import CONV_LAYERS -def conv_ws_2d(input, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - eps=1e-5): +def conv_ws_2d(input: torch.Tensor, + weight: torch.Tensor, + bias: Optional[torch.Tensor] = None, + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + eps: float = 1e-5) -> torch.Tensor: c_in = weight.size(0) weight_flat = weight.view(c_in, -1) mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) @@ -26,16 +29,16 @@ def conv_ws_2d(input, class ConvWS2d(nn.Conv2d): def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - eps=1e-5): - super(ConvWS2d, self).__init__( + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True, + eps: float = 1e-5): + super().__init__( in_channels, out_channels, kernel_size, @@ -46,7 +49,7 @@ class ConvWS2d(nn.Conv2d): bias=bias) self.eps = eps - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups, self.eps) @@ -76,14 +79,14 @@ class ConvAWS2d(nn.Conv2d): """ def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True): + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + groups: int = 1, + bias: bool = True): super().__init__( in_channels, out_channels, @@ -98,7 +101,7 @@ class ConvAWS2d(nn.Conv2d): self.register_buffer('weight_beta', torch.zeros(self.out_channels, 1, 1, 1)) - def _get_weight(self, weight): + def _get_weight(self, weight: torch.Tensor) -> torch.Tensor: weight_flat = weight.view(weight.size(0), -1) mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) @@ -106,13 +109,16 @@ class ConvAWS2d(nn.Conv2d): weight = self.weight_gamma * weight + self.weight_beta return weight - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: weight = self._get_weight(self.weight) return F.conv2d(x, weight, self.bias, self.stride, self.padding, 
self.dilation, self.groups) - def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, - missing_keys, unexpected_keys, error_msgs): + def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str, + local_metadata: Dict, strict: bool, + missing_keys: List[str], + unexpected_keys: List[str], + error_msgs: List[str]) -> None: """Override default load function. AWS overrides the function _load_from_state_dict to recover @@ -124,7 +130,7 @@ class ConvAWS2d(nn.Conv2d): """ self.weight_gamma.data.fill_(-1) - local_missing_keys = [] + local_missing_keys: List = [] super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, local_missing_keys, unexpected_keys, error_msgs) diff --git a/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/mmcv/cnn/bricks/depthwise_separable_conv_module.py index 722d5d8d71f75486e2db3008907c4eadfca41d63..cf1fe4cad3812007573211fa2bede28b23822122 100644 --- a/mmcv/cnn/bricks/depthwise_separable_conv_module.py +++ b/mmcv/cnn/bricks/depthwise_separable_conv_module.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, Optional, Tuple, Union + +import torch import torch.nn as nn from .conv_module import ConvModule @@ -46,27 +49,27 @@ class DepthwiseSeparableConvModule(nn.Module): """ def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - norm_cfg=None, - act_cfg=dict(type='ReLU'), - dw_norm_cfg='default', - dw_act_cfg='default', - pw_norm_cfg='default', - pw_act_cfg='default', + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + stride: Union[int, Tuple[int, int]] = 1, + padding: Union[int, Tuple[int, int]] = 0, + dilation: Union[int, Tuple[int, int]] = 1, + norm_cfg: Optional[Dict] = None, + act_cfg: Dict = dict(type='ReLU'), + dw_norm_cfg: Union[Dict, str] = 'default', + dw_act_cfg: Union[Dict, str] = 'default', + pw_norm_cfg: Union[Dict, str] = 'default', + pw_act_cfg: Union[Dict, str] = 'default', **kwargs): - super(DepthwiseSeparableConvModule, self).__init__() + super().__init__() assert 'groups' not in kwargs, 'groups should not be specified' # if norm/activation config of depthwise/pointwise ConvModule is not # specified, use default config. 
- dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg + dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg - pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg + pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501 pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg # depthwise convolution @@ -78,19 +81,19 @@ class DepthwiseSeparableConvModule(nn.Module): padding=padding, dilation=dilation, groups=in_channels, - norm_cfg=dw_norm_cfg, - act_cfg=dw_act_cfg, + norm_cfg=dw_norm_cfg, # type: ignore + act_cfg=dw_act_cfg, # type: ignore **kwargs) self.pointwise_conv = ConvModule( in_channels, out_channels, 1, - norm_cfg=pw_norm_cfg, - act_cfg=pw_act_cfg, + norm_cfg=pw_norm_cfg, # type: ignore + act_cfg=pw_act_cfg, # type: ignore **kwargs) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.depthwise_conv(x) x = self.pointwise_conv(x) return x diff --git a/mmcv/cnn/bricks/drop.py b/mmcv/cnn/bricks/drop.py index b0a026654ac2e3b994eb7a5248ca9faa277f8985..ea05221d854592a5d885efbef002cb673c65f778 100644 --- a/mmcv/cnn/bricks/drop.py +++ b/mmcv/cnn/bricks/drop.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, Optional + import torch import torch.nn as nn @@ -6,7 +8,9 @@ from mmcv import build_from_cfg from .registry import DROPOUT_LAYERS -def drop_path(x, drop_prob=0., training=False): +def drop_path(x: torch.Tensor, + drop_prob: float = 0., + training: bool = False) -> torch.Tensor: """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). @@ -36,11 +40,11 @@ class DropPath(nn.Module): drop_prob (float): Probability of the path to be zeroed. Default: 0.1 """ - def __init__(self, drop_prob=0.1): - super(DropPath, self).__init__() + def __init__(self, drop_prob: float = 0.1): + super().__init__() self.drop_prob = drop_prob - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return drop_path(x, self.drop_prob, self.training) @@ -56,10 +60,10 @@ class Dropout(nn.Dropout): inplace (bool): Do the operation inplace or not. Default: False. 
""" - def __init__(self, drop_prob=0.5, inplace=False): + def __init__(self, drop_prob: float = 0.5, inplace: bool = False): super().__init__(p=drop_prob, inplace=inplace) -def build_dropout(cfg, default_args=None): +def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any: """Builder for drop out layers.""" return build_from_cfg(cfg, DROPOUT_LAYERS, default_args) diff --git a/mmcv/cnn/bricks/generalized_attention.py b/mmcv/cnn/bricks/generalized_attention.py index 988d9adf2f289ef223bd1c680a5ae1d3387f0269..118e39c7ea2d9f24a97f22878dfbe753c4afef0b 100644 --- a/mmcv/cnn/bricks/generalized_attention.py +++ b/mmcv/cnn/bricks/generalized_attention.py @@ -45,16 +45,16 @@ class GeneralizedAttention(nn.Module): _abbr_ = 'gen_attention_block' def __init__(self, - in_channels, - spatial_range=-1, - num_heads=9, - position_embedding_dim=-1, - position_magnitude=1, - kv_stride=2, - q_stride=1, - attention_type='1111'): + in_channels: int, + spatial_range: int = -1, + num_heads: int = 9, + position_embedding_dim: int = -1, + position_magnitude: int = 1, + kv_stride: int = 2, + q_stride: int = 1, + attention_type: str = '1111'): - super(GeneralizedAttention, self).__init__() + super().__init__() # hard range means local range for non-local operation self.position_embedding_dim = ( @@ -131,7 +131,7 @@ class GeneralizedAttention(nn.Module): max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) local_constraint_map = np.ones( - (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int) + (max_len, max_len, max_len_kv, max_len_kv), dtype=int) for iy in range(max_len): for ix in range(max_len): local_constraint_map[ @@ -213,7 +213,7 @@ class GeneralizedAttention(nn.Module): return embedding_x, embedding_y - def forward(self, x_input): + def forward(self, x_input: torch.Tensor) -> torch.Tensor: num_heads = self.num_heads # use empirical_attention @@ -351,7 +351,7 @@ class GeneralizedAttention(nn.Module): repeat(n, 1, 1, 1) position_feat_x_reshape = position_feat_x.\ - view(n, num_heads, w*w_kv, self.qk_embed_dim) + view(n, num_heads, w * w_kv, self.qk_embed_dim) position_feat_y_reshape = position_feat_y.\ view(n, num_heads, h * h_kv, self.qk_embed_dim) diff --git a/mmcv/cnn/bricks/hsigmoid.py b/mmcv/cnn/bricks/hsigmoid.py index 30b1a3d6580cf0360710426fbea1f05acdf07b4b..5eb97e8ab13e76c6916a7ebba15cb50f8b846897 100644 --- a/mmcv/cnn/bricks/hsigmoid.py +++ b/mmcv/cnn/bricks/hsigmoid.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch import torch.nn as nn from .registry import ACTIVATION_LAYERS @@ -8,11 +11,15 @@ from .registry import ACTIVATION_LAYERS class HSigmoid(nn.Module): """Hard Sigmoid Module. Apply the hard sigmoid function: Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) - Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) + Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1) + + Note: + In MMCV v1.4.4, we modified the default value of args to align with + PyTorch official. Args: - bias (float): Bias of the input feature map. Default: 1.0. - divisor (float): Divisor of the input feature map. Default: 2.0. + bias (float): Bias of the input feature map. Default: 3.0. + divisor (float): Divisor of the input feature map. Default: 6.0. min_value (float): Lower bound value. Default: 0.0. max_value (float): Upper bound value. Default: 1.0. @@ -20,15 +27,25 @@ class HSigmoid(nn.Module): Tensor: The output tensor. 
""" - def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): - super(HSigmoid, self).__init__() + def __init__(self, + bias: float = 3.0, + divisor: float = 6.0, + min_value: float = 0.0, + max_value: float = 1.0): + super().__init__() + warnings.warn( + 'In MMCV v1.4.4, we modified the default value of args to align ' + 'with PyTorch official. Previous Implementation: ' + 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). ' + 'Current Implementation: ' + 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).') self.bias = bias self.divisor = divisor assert self.divisor != 0 self.min_value = min_value self.max_value = max_value - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = (x + self.bias) / self.divisor return x.clamp_(self.min_value, self.max_value) diff --git a/mmcv/cnn/bricks/hswish.py b/mmcv/cnn/bricks/hswish.py index 7e0c090ff037c99ee6c5c84c4592e87beae02208..6f6cc276c10a5c49bd9c0e30a1ffad4a1b6018d4 100644 --- a/mmcv/cnn/bricks/hswish.py +++ b/mmcv/cnn/bricks/hswish.py @@ -1,10 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +import torch import torch.nn as nn +from mmcv.utils import TORCH_VERSION, digit_version from .registry import ACTIVATION_LAYERS -@ACTIVATION_LAYERS.register_module() class HSwish(nn.Module): """Hard Swish Module. @@ -21,9 +22,18 @@ class HSwish(nn.Module): Tensor: The output tensor. """ - def __init__(self, inplace=False): - super(HSwish, self).__init__() + def __init__(self, inplace: bool = False): + super().__init__() self.act = nn.ReLU6(inplace) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x * self.act(x + 3) / 6 + + +if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.7')): + # Hardswish is not supported when PyTorch version < 1.6. + # And Hardswish in PyTorch 1.6 does not support inplace. + ACTIVATION_LAYERS.register_module(module=HSwish) +else: + ACTIVATION_LAYERS.register_module(module=nn.Hardswish, name='HSwish') diff --git a/mmcv/cnn/bricks/non_local.py b/mmcv/cnn/bricks/non_local.py index 92d00155ef275c1201ea66bba30470a1785cc5d7..159db245e80950d9b94e2744361bca2a09e67c13 100644 --- a/mmcv/cnn/bricks/non_local.py +++ b/mmcv/cnn/bricks/non_local.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta +from typing import Dict, Optional import torch import torch.nn as nn @@ -33,14 +34,14 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): """ def __init__(self, - in_channels, - reduction=2, - use_scale=True, - conv_cfg=None, - norm_cfg=None, - mode='embedded_gaussian', + in_channels: int, + reduction: int = 2, + use_scale: bool = True, + conv_cfg: Optional[Dict] = None, + norm_cfg: Optional[Dict] = None, + mode: str = 'embedded_gaussian', **kwargs): - super(_NonLocalNd, self).__init__() + super().__init__() self.in_channels = in_channels self.reduction = reduction self.use_scale = use_scale @@ -61,7 +62,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): self.inter_channels, kernel_size=1, conv_cfg=conv_cfg, - act_cfg=None) + act_cfg=None) # type: ignore self.conv_out = ConvModule( self.inter_channels, self.in_channels, @@ -96,7 +97,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): self.init_weights(**kwargs) - def init_weights(self, std=0.01, zeros_init=True): + def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None: if self.mode != 'gaussian': for m in [self.g, self.theta, self.phi]: normal_init(m.conv, std=std) @@ -113,7 +114,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): else: normal_init(self.conv_out.norm, std=std) - def gaussian(self, theta_x, phi_x): + def gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -121,7 +123,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight - def embedded_gaussian(self, theta_x, phi_x): + def embedded_gaussian(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -132,7 +135,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight = pairwise_weight.softmax(dim=-1) return pairwise_weight - def dot_product(self, theta_x, phi_x): + def dot_product(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -140,7 +144,8 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): pairwise_weight /= pairwise_weight.shape[-1] return pairwise_weight - def concatenation(self, theta_x, phi_x): + def concatenation(self, theta_x: torch.Tensor, + phi_x: torch.Tensor) -> torch.Tensor: # NonLocal1d pairwise_weight: [N, H, H] # NonLocal2d pairwise_weight: [N, HxW, HxW] # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] @@ -157,7 +162,7 @@ class _NonLocalNd(nn.Module, metaclass=ABCMeta): return pairwise_weight - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # Assume `reduction = 1`, then `inter_channels = C` # or `inter_channels = C` when `mode="gaussian"` @@ -224,12 +229,11 @@ class NonLocal1d(_NonLocalNd): """ def __init__(self, - in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv1d'), + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv1d'), **kwargs): - super(NonLocal1d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample @@ -258,12 +262,11 @@ class NonLocal2d(_NonLocalNd): _abbr_ = 'nonlocal_block' def __init__(self, - 
in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv2d'), + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv2d'), **kwargs): - super(NonLocal2d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample @@ -289,12 +292,11 @@ class NonLocal3d(_NonLocalNd): """ def __init__(self, - in_channels, - sub_sample=False, - conv_cfg=dict(type='Conv3d'), + in_channels: int, + sub_sample: bool = False, + conv_cfg: Dict = dict(type='Conv3d'), **kwargs): - super(NonLocal3d, self).__init__( - in_channels, conv_cfg=conv_cfg, **kwargs) + super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs) self.sub_sample = sub_sample if sub_sample: diff --git a/mmcv/cnn/bricks/norm.py b/mmcv/cnn/bricks/norm.py index cfb326bdb8ced3ec17ab5c3203cb6d6784ff2e78..b6281a7c697483fbdaaba5a37d88a00f3c259d31 100644 --- a/mmcv/cnn/bricks/norm.py +++ b/mmcv/cnn/bricks/norm.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. import inspect +from typing import Dict, Tuple, Union import torch.nn as nn @@ -69,7 +70,9 @@ def infer_abbr(class_type): return 'norm_layer' -def build_norm_layer(cfg, num_features, postfix=''): +def build_norm_layer(cfg: Dict, + num_features: int, + postfix: Union[int, str] = '') -> Tuple[str, nn.Module]: """Build normalization layer. Args: @@ -83,9 +86,9 @@ def build_norm_layer(cfg, num_features, postfix=''): to create named layer. Returns: - (str, nn.Module): The first element is the layer name consisting of - abbreviation and postfix, e.g., bn1, gn. The second element is the - created norm layer. + tuple[str, nn.Module]: The first element is the layer name consisting + of abbreviation and postfix, e.g., bn1, gn. The second element is the + created norm layer. """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') @@ -119,7 +122,8 @@ def build_norm_layer(cfg, num_features, postfix=''): return name, layer -def is_norm(layer, exclude=None): +def is_norm(layer: nn.Module, + exclude: Union[type, tuple, None] = None) -> bool: """Check if a layer is a normalization layer. Args: diff --git a/mmcv/cnn/bricks/padding.py b/mmcv/cnn/bricks/padding.py index e4ac6b28a1789bd551c613a7d3e7b622433ac7ec..8412b0c6576fd220eca52382943ad5889f0dfd1f 100644 --- a/mmcv/cnn/bricks/padding.py +++ b/mmcv/cnn/bricks/padding.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + import torch.nn as nn from .registry import PADDING_LAYERS @@ -8,11 +10,11 @@ PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) -def build_padding_layer(cfg, *args, **kwargs): +def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module: """Build padding layer. Args: - cfg (None or dict): The padding layer config, which should contain: + cfg (dict): The padding layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate a padding layer. diff --git a/mmcv/cnn/bricks/plugin.py b/mmcv/cnn/bricks/plugin.py index 07c010d4053174dd41107aa654ea67e82b46a25c..095ef9234501d0bca54373d4422244b80f818341 100644 --- a/mmcv/cnn/bricks/plugin.py +++ b/mmcv/cnn/bricks/plugin.py @@ -1,15 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
import inspect import platform +from typing import Dict, Tuple, Union + +import torch.nn as nn from .registry import PLUGIN_LAYERS if platform.system() == 'Windows': - import regex as re + import regex as re # type: ignore else: - import re + import re # type: ignore -def infer_abbr(class_type): +def infer_abbr(class_type: type) -> str: """Infer abbreviation from the class name. This method will infer the abbreviation to map class types to @@ -47,25 +51,27 @@ def infer_abbr(class_type): raise TypeError( f'class_type must be a type, but got {type(class_type)}') if hasattr(class_type, '_abbr_'): - return class_type._abbr_ + return class_type._abbr_ # type: ignore else: return camel2snack(class_type.__name__) -def build_plugin_layer(cfg, postfix='', **kwargs): +def build_plugin_layer(cfg: Dict, + postfix: Union[int, str] = '', + **kwargs) -> Tuple[str, nn.Module]: """Build plugin layer. Args: - cfg (None or dict): cfg should contain: - type (str): identify plugin layer type. - layer args: args needed to instantiate a plugin layer. + cfg (dict): cfg should contain: + + - type (str): identify plugin layer type. + - layer args: args needed to instantiate a plugin layer. postfix (int, str): appended into norm abbreviation to create named layer. Default: ''. Returns: - tuple[str, nn.Module]: - name (str): abbreviation + postfix - layer (nn.Module): created plugin layer + tuple[str, nn.Module]: The first one is the concatenation of + abbreviation and postfix. The second is the created plugin layer. """ if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') diff --git a/mmcv/cnn/bricks/scale.py b/mmcv/cnn/bricks/scale.py index c905fffcc8bf998d18d94f927591963c428025e2..dbd07c6a445e116bd6f32c96d8b52079ccf9b28a 100644 --- a/mmcv/cnn/bricks/scale.py +++ b/mmcv/cnn/bricks/scale.py @@ -13,9 +13,9 @@ class Scale(nn.Module): scale (float): Initial value of scale factor. Default: 1.0 """ - def __init__(self, scale=1.0): - super(Scale, self).__init__() + def __init__(self, scale: float = 1.0): + super().__init__() self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x * self.scale diff --git a/mmcv/cnn/bricks/swish.py b/mmcv/cnn/bricks/swish.py index e2ca8ed7b749413f011ae54aac0cab27e6f0b51f..b297adff068661859265a5057c1b2204ac8eefa7 100644 --- a/mmcv/cnn/bricks/swish.py +++ b/mmcv/cnn/bricks/swish.py @@ -19,7 +19,7 @@ class Swish(nn.Module): """ def __init__(self): - super(Swish, self).__init__() + super().__init__() - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return x * torch.sigmoid(x) diff --git a/mmcv/cnn/bricks/transformer.py b/mmcv/cnn/bricks/transformer.py index ed32688af40c0744289d07cd991b17a0dcb1c29f..f7ba4d9f836609cec8526607db98c4b03ec4fee3 100644 --- a/mmcv/cnn/bricks/transformer.py +++ b/mmcv/cnn/bricks/transformer.py @@ -1,21 +1,26 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import copy +import math import warnings +from typing import Sequence import torch import torch.nn as nn +import torch.nn.functional as F -from mmcv import ConfigDict, deprecated_api_warning -from mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, + build_norm_layer) from mmcv.runner.base_module import BaseModule, ModuleList, Sequential -from mmcv.utils import build_from_cfg +from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, + to_2tuple) from .drop import build_dropout from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: - from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + from mmcv.ops.multi_scale_deform_attn import \ + MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' @@ -55,6 +60,349 @@ def build_transformer_layer_sequence(cfg, default_args=None): return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) +class AdaptivePadding(nn.Module): + """Applies padding adaptively to the input. + + This module can make input get fully covered by filter + you specified. It support two modes "same" and "corner". The + "same" mode is same with "SAME" padding mode in TensorFlow, pad + zero around input. The "corner" mode would pad zero + to bottom right. + + Args: + kernel_size (int | tuple): Size of the kernel. Default: 1. + stride (int | tuple): Stride of the filter. Default: 1. + dilation (int | tuple): Spacing between kernel elements. + Default: 1. + padding (str): Support "same" and "corner", "corner" mode + would pad zero to bottom right, and "same" mode would + pad zero around input. Default: "corner". + + Example: + >>> kernel_size = 16 + >>> stride = 16 + >>> dilation = 1 + >>> input = torch.rand(1, 1, 15, 17) + >>> adap_pad = AdaptivePadding( + >>> kernel_size=kernel_size, + >>> stride=stride, + >>> dilation=dilation, + >>> padding="corner") + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + >>> input = torch.rand(1, 1, 16, 17) + >>> out = adap_pad(input) + >>> assert (out.shape[2], out.shape[3]) == (16, 32) + """ + + def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): + super().__init__() + assert padding in ('same', 'corner') + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + self.padding = padding + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + + def get_pad_shape(self, input_shape): + """Calculate the padding size of input. + + Args: + input_shape (:obj:`torch.Size`): arrange as (H, W). + + Returns: + Tuple[int]: The padding size along the + original H and W directions + """ + input_h, input_w = input_shape + kernel_h, kernel_w = self.kernel_size + stride_h, stride_w = self.stride + output_h = math.ceil(input_h / stride_h) + output_w = math.ceil(input_w / stride_w) + pad_h = max((output_h - 1) * stride_h + + (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0) + pad_w = max((output_w - 1) * stride_w + + (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0) + return pad_h, pad_w + + def forward(self, x): + """Add padding to `x` + + Args: + x (Tensor): Input tensor has shape (B, C, H, W). 
+ + Returns: + Tensor: The tensor with adaptive padding + """ + pad_h, pad_w = self.get_pad_shape(x.size()[-2:]) + if pad_h > 0 or pad_w > 0: + if self.padding == 'corner': + x = F.pad(x, [0, pad_w, 0, pad_h]) + elif self.padding == 'same': + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, + pad_h - pad_h // 2 + ]) + return x + + +class PatchEmbed(BaseModule): + """Image to Patch Embedding. + + We use a conv layer to implement PatchEmbed. + + Args: + in_channels (int): The num of input channels. Default: 3 + embed_dims (int): The dimensions of embedding. Default: 768 + conv_type (str): The type of convolution + to generate patch embedding. Default: "Conv2d". + kernel_size (int): The kernel_size of embedding conv. Default: 16. + stride (int): The slide stride of embedding conv. + Default: 16. + padding (int | tuple | string): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int): The dilation rate of embedding conv. Default: 1. + bias (bool): Bias of embed conv. Default: True. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: None. + input_size (int | tuple | None): The size of input, which will be + used to calculate the out size. Only works when `dynamic_size` + is False. Default: None. + init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. + Default: None. + """ + + def __init__(self, + in_channels=3, + embed_dims=768, + conv_type='Conv2d', + kernel_size=16, + stride=16, + padding='corner', + dilation=1, + bias=True, + norm_cfg=None, + input_size=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.embed_dims = embed_dims + if stride is None: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of conv + padding = 0 + else: + self.adaptive_padding = None + padding = to_2tuple(padding) + + self.projection = build_conv_layer( + dict(type=conv_type), + in_channels=in_channels, + out_channels=embed_dims, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=bias) + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, embed_dims)[1] + else: + self.norm = None + + if input_size: + input_size = to_2tuple(input_size) + # `init_out_size` would be used outside to + # calculate the num_patches + # e.g. when `use_abs_pos_embed` outside + self.init_input_size = input_size + if self.adaptive_padding: + pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size) + input_h, input_w = input_size + input_h = input_h + pad_h + input_w = input_w + pad_w + input_size = (input_h, input_w) + + # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html + h_out = (input_size[0] + 2 * padding[0] - dilation[0] * + (kernel_size[0] - 1) - 1) // stride[0] + 1 + w_out = (input_size[1] + 2 * padding[1] - dilation[1] * + (kernel_size[1] - 1) - 1) // stride[1] + 1 + self.init_out_size = (h_out, w_out) + else: + self.init_input_size = None + self.init_out_size = None + + def forward(self, x): + """ + Args: + x (Tensor): Has shape (B, C, H, W). In most case, C is 3. + + Returns: + tuple: Contains merged results and its spatial shape. 
+ + - x (Tensor): Has shape (B, out_h * out_w, embed_dims) + - out_size (tuple[int]): Spatial shape of x, arrange as + (out_h, out_w). + """ + + if self.adaptive_padding: + x = self.adaptive_padding(x) + + x = self.projection(x) + out_size = (x.shape[2], x.shape[3]) + x = x.flatten(2).transpose(1, 2) + if self.norm is not None: + x = self.norm(x) + return x, out_size + + +class PatchMerging(BaseModule): + """Merge patch feature map. + + This layer groups feature map by kernel_size, and applies norm and linear + layers to the grouped feature map ((used in Swin Transformer)). + Our implementation uses `nn.Unfold` to + merge patches, which is about 25% faster than the original + implementation. However, we need to modify pretrained + models for compatibility. + + Args: + in_channels (int): The num of input channels. + to gets fully covered by filter and stride you specified. + out_channels (int): The num of output channels. + kernel_size (int | tuple, optional): the kernel size in the unfold + layer. Defaults to 2. + stride (int | tuple, optional): the stride of the sliding blocks in the + unfold layer. Default: None. (Would be set as `kernel_size`) + padding (int | tuple | string ): The padding length of + embedding conv. When it is a string, it means the mode + of adaptive padding, support "same" and "corner" now. + Default: "corner". + dilation (int | tuple, optional): dilation parameter in the unfold + layer. Default: 1. + bias (bool, optional): Whether to add bias in linear layer or not. + Defaults: False. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (dict, optional): The extra config for initialization. + Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=2, + stride=None, + padding='corner', + dilation=1, + bias=False, + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + self.out_channels = out_channels + if stride: + stride = stride + else: + stride = kernel_size + + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + + if isinstance(padding, str): + self.adaptive_padding = AdaptivePadding( + kernel_size=kernel_size, + stride=stride, + dilation=dilation, + padding=padding) + # disable the padding of unfold + padding = 0 + else: + self.adaptive_padding = None + + padding = to_2tuple(padding) + self.sampler = nn.Unfold( + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + stride=stride) + + sample_dim = kernel_size[0] * kernel_size[1] * in_channels + + if norm_cfg is not None: + self.norm = build_norm_layer(norm_cfg, sample_dim)[1] + else: + self.norm = None + + self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) + + def forward(self, x, input_size): + """ + Args: + x (Tensor): Has shape (B, H*W, C_in). + input_size (tuple[int]): The spatial shape of x, arrange as (H, W). + Default: None. + + Returns: + tuple: Contains merged results and its spatial shape. + + - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out) + - out_size (tuple[int]): Spatial shape of x, arrange as + (Merged_H, Merged_W). 
+ """ + B, L, C = x.shape + assert isinstance(input_size, Sequence), f'Expect ' \ + f'input_size is ' \ + f'`Sequence` ' \ + f'but get {input_size}' + + H, W = input_size + assert L == H * W, 'input feature has wrong size' + + x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W + + if self.adaptive_padding: + x = self.adaptive_padding(x) + H, W = x.shape[-2:] + + # Use nn.Unfold to merge patch. About 25% faster than original method, + # but need to modify pretrained model for compatibility + # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2) + x = self.sampler(x) + + out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] * + (self.sampler.kernel_size[0] - 1) - + 1) // self.sampler.stride[0] + 1 + out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] * + (self.sampler.kernel_size[1] - 1) - + 1) // self.sampler.stride[1] + 1 + + output_size = (out_h, out_w) + x = x.transpose(1, 2) # B, H/2*W/2, 4*C + x = self.norm(x) if self.norm else x + x = self.reduction(x) + return x, output_size + + @ATTENTION.register_module() class MultiheadAttention(BaseModule): """A wrapper for ``torch.nn.MultiheadAttention``. @@ -87,12 +435,13 @@ class MultiheadAttention(BaseModule): init_cfg=None, batch_first=False, **kwargs): - super(MultiheadAttention, self).__init__(init_cfg) + super().__init__(init_cfg) if 'dropout' in kwargs: - warnings.warn('The arguments `dropout` in MultiheadAttention ' - 'has been deprecated, now you can separately ' - 'set `attn_drop`(float), proj_drop(float), ' - 'and `dropout_layer`(dict) ') + warnings.warn( + 'The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ', DeprecationWarning) attn_drop = kwargs['dropout'] dropout_layer['drop_prob'] = kwargs.pop('dropout') @@ -154,9 +503,9 @@ class MultiheadAttention(BaseModule): Returns: Tensor: forwarded results with shape - [num_queries, bs, embed_dims] - if self.batch_first is False, else - [bs, num_queries embed_dims]. + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. """ if key is None: @@ -241,7 +590,7 @@ class FFN(BaseModule): add_identity=True, init_cfg=None, **kwargs): - super(FFN, self).__init__(init_cfg) + super().__init__(init_cfg) assert num_fcs >= 2, 'num_fcs should be no less ' \ f'than 2. got {num_fcs}.' self.embed_dims = embed_dims @@ -342,15 +691,15 @@ class BaseTransformerLayer(BaseModule): f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' - f'to a dict named `ffn_cfgs`. ') + f'to a dict named `ffn_cfgs`. 
', DeprecationWarning) ffn_cfgs[new_name] = kwargs[ori_name] - super(BaseTransformerLayer, self).__init__(init_cfg) + super().__init__(init_cfg) self.batch_first = batch_first - assert set(operation_order) & set( - ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + assert set(operation_order) & { + 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all four operation type ' \ @@ -397,7 +746,7 @@ class BaseTransformerLayer(BaseModule): assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: - ffn_cfgs['embed_dims'] = self.embed_dims + ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( @@ -531,7 +880,7 @@ class TransformerLayerSequence(BaseModule): """ def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): - super(TransformerLayerSequence, self).__init__(init_cfg) + super().__init__(init_cfg) if isinstance(transformerlayers, dict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_layers) diff --git a/mmcv/cnn/bricks/upsample.py b/mmcv/cnn/bricks/upsample.py index a1a353767d0ce8518f0d7289bed10dba0178ed12..d86c5f54a22ed26b09f66bd59659ff7ab1f5b3d9 100644 --- a/mmcv/cnn/bricks/upsample.py +++ b/mmcv/cnn/bricks/upsample.py @@ -1,4 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict + +import torch import torch.nn as nn import torch.nn.functional as F @@ -24,9 +27,9 @@ class PixelShufflePack(nn.Module): channels. """ - def __init__(self, in_channels, out_channels, scale_factor, - upsample_kernel): - super(PixelShufflePack, self).__init__() + def __init__(self, in_channels: int, out_channels: int, scale_factor: int, + upsample_kernel: int): + super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.scale_factor = scale_factor @@ -41,13 +44,13 @@ class PixelShufflePack(nn.Module): def init_weights(self): xavier_init(self.upsample_conv, distribution='uniform') - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.upsample_conv(x) x = F.pixel_shuffle(x, self.scale_factor) return x -def build_upsample_layer(cfg, *args, **kwargs): +def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module: """Build upsample layer. Args: @@ -55,7 +58,7 @@ def build_upsample_layer(cfg, *args, **kwargs): - type (str): Layer type. - scale_factor (int): Upsample ratio, which is not applicable to - deconv. + deconv. - layer args: Args needed to instantiate a upsample layer. args (argument list): Arguments passed to the ``__init__`` method of the corresponding conv layer. 
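The `AdaptivePadding`, `PatchEmbed`, and `PatchMerging` bricks added to `mmcv/cnn/bricks/transformer.py` above are typically chained when tokenizing an image for a Swin-style backbone. The snippet below is a minimal usage sketch, assuming an MMCV build that already contains these additions; the image size, embedding width, and patch sizes are illustrative choices, not values taken from the patch.

```python
import torch

from mmcv.cnn.bricks.transformer import PatchEmbed, PatchMerging

# Illustrative sizes only (assumed for this sketch, not part of the patch).
img = torch.rand(1, 3, 224, 224)

# 4x4 non-overlapping patches -> token sequence plus its spatial shape.
patch_embed = PatchEmbed(in_channels=3, embed_dims=96, kernel_size=4, stride=4)
tokens, hw_shape = patch_embed(img)   # tokens: (1, 56*56, 96), hw_shape: (56, 56)

# Merge 2x2 neighbouring patches and double the channel width, Swin-style.
patch_merging = PatchMerging(in_channels=96, out_channels=192)
tokens, hw_shape = patch_merging(tokens, hw_shape)   # (1, 28*28, 192), (28, 28)
```

Because both modules default to `padding='corner'`, inputs whose spatial size is not a multiple of the kernel size are zero-padded on the bottom/right by `AdaptivePadding` before the projection or unfold step, so the token grid always fully covers the input.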
diff --git a/mmcv/cnn/bricks/wrappers.py b/mmcv/cnn/bricks/wrappers.py index 8aebf67bf52355a513f21756ee74fe510902d075..a07eff00e49970c7692ee3f2625c7f7aba9d7b22 100644 --- a/mmcv/cnn/bricks/wrappers.py +++ b/mmcv/cnn/bricks/wrappers.py @@ -21,19 +21,19 @@ else: TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) -def obsolete_torch_version(torch_version, version_threshold): +def obsolete_torch_version(torch_version, version_threshold) -> bool: return torch_version == 'parrots' or torch_version <= version_threshold class NewEmptyTensorOp(torch.autograd.Function): @staticmethod - def forward(ctx, x, new_shape): + def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor: ctx.shape = x.shape return x.new_empty(new_shape) @staticmethod - def backward(ctx, grad): + def backward(ctx, grad: torch.Tensor) -> tuple: shape = ctx.shape return NewEmptyTensorOp.apply(grad, shape), None @@ -41,7 +41,7 @@ class NewEmptyTensorOp(torch.autograd.Function): @CONV_LAYERS.register_module('Conv', force=True) class Conv2d(nn.Conv2d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, @@ -62,7 +62,7 @@ class Conv2d(nn.Conv2d): @CONV_LAYERS.register_module('Conv3d', force=True) class Conv3d(nn.Conv3d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, @@ -85,7 +85,7 @@ class Conv3d(nn.Conv3d): @UPSAMPLE_LAYERS.register_module('deconv', force=True) class ConvTranspose2d(nn.ConvTranspose2d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, @@ -108,7 +108,7 @@ class ConvTranspose2d(nn.ConvTranspose2d): @UPSAMPLE_LAYERS.register_module('deconv3d', force=True) class ConvTranspose3d(nn.ConvTranspose3d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): out_shape = [x.shape[0], self.out_channels] for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, @@ -128,7 +128,7 @@ class ConvTranspose3d(nn.ConvTranspose3d): class MaxPool2d(nn.MaxPool2d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) @@ -146,7 +146,7 @@ class MaxPool2d(nn.MaxPool2d): class MaxPool3d(nn.MaxPool3d): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # PyTorch 1.9 does not support empty tensor inference yet if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): out_shape = list(x.shape[:2]) @@ -165,7 +165,7 @@ class MaxPool3d(nn.MaxPool3d): class Linear(torch.nn.Linear): - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: # empty tensor forward of Linear layer is supported in Pytorch 1.6 if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)): out_shape = [x.shape[0], self.out_features] diff --git a/mmcv/cnn/resnet.py b/mmcv/cnn/resnet.py index 
1cb3ac057ee2d52c46fc94685b5d4e698aad8d5f..fb29e6256280b671acfbf73fd9a01f079749b260 100644 --- a/mmcv/cnn/resnet.py +++ b/mmcv/cnn/resnet.py @@ -1,13 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import Optional, Sequence, Tuple, Union import torch.nn as nn import torch.utils.checkpoint as cp +from torch import Tensor from .utils import constant_init, kaiming_init -def conv3x3(in_planes, out_planes, stride=1, dilation=1): +def conv3x3(in_planes: int, + out_planes: int, + stride: int = 1, + dilation: int = 1): """3x3 convolution with padding.""" return nn.Conv2d( in_planes, @@ -23,14 +28,14 @@ class BasicBlock(nn.Module): expansion = 1 def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False): - super(BasicBlock, self).__init__() + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): + super().__init__() assert style in ['pytorch', 'caffe'] self.conv1 = conv3x3(inplanes, planes, stride, dilation) self.bn1 = nn.BatchNorm2d(planes) @@ -42,7 +47,7 @@ class BasicBlock(nn.Module): self.dilation = dilation assert not with_cp - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: residual = x out = self.conv1(x) @@ -65,19 +70,19 @@ class Bottleneck(nn.Module): expansion = 4 def __init__(self, - inplanes, - planes, - stride=1, - dilation=1, - downsample=None, - style='pytorch', - with_cp=False): + inplanes: int, + planes: int, + stride: int = 1, + dilation: int = 1, + downsample: Optional[nn.Module] = None, + style: str = 'pytorch', + with_cp: bool = False): """Bottleneck block. If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is "caffe", the stride-two layer is the first 1x1 conv layer. 
""" - super(Bottleneck, self).__init__() + super().__init__() assert style in ['pytorch', 'caffe'] if style == 'pytorch': conv1_stride = 1 @@ -107,7 +112,7 @@ class Bottleneck(nn.Module): self.dilation = dilation self.with_cp = with_cp - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: def _inner_forward(x): residual = x @@ -140,14 +145,14 @@ class Bottleneck(nn.Module): return out -def make_res_layer(block, - inplanes, - planes, - blocks, - stride=1, - dilation=1, - style='pytorch', - with_cp=False): +def make_res_layer(block: nn.Module, + inplanes: int, + planes: int, + blocks: int, + stride: int = 1, + dilation: int = 1, + style: str = 'pytorch', + with_cp: bool = False) -> nn.Module: downsample = None if stride != 1 or inplanes != planes * block.expansion: downsample = nn.Sequential( @@ -208,22 +213,22 @@ class ResNet(nn.Module): } def __init__(self, - depth, - num_stages=4, - strides=(1, 2, 2, 2), - dilations=(1, 1, 1, 1), - out_indices=(0, 1, 2, 3), - style='pytorch', - frozen_stages=-1, - bn_eval=True, - bn_frozen=False, - with_cp=False): - super(ResNet, self).__init__() + depth: int, + num_stages: int = 4, + strides: Sequence[int] = (1, 2, 2, 2), + dilations: Sequence[int] = (1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3), + style: str = 'pytorch', + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + with_cp: bool = False): + super().__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') assert num_stages >= 1 and num_stages <= 4 block, stage_blocks = self.arch_settings[depth] - stage_blocks = stage_blocks[:num_stages] + stage_blocks = stage_blocks[:num_stages] # type: ignore assert len(strides) == len(dilations) == num_stages assert max(out_indices) < num_stages @@ -234,7 +239,7 @@ class ResNet(nn.Module): self.bn_frozen = bn_frozen self.with_cp = with_cp - self.inplanes = 64 + self.inplanes: int = 64 self.conv1 = nn.Conv2d( 3, 64, kernel_size=7, stride=2, padding=3, bias=False) self.bn1 = nn.BatchNorm2d(64) @@ -255,14 +260,15 @@ class ResNet(nn.Module): dilation=dilation, style=self.style, with_cp=with_cp) - self.inplanes = planes * block.expansion + self.inplanes = planes * block.expansion # type: ignore layer_name = f'layer{i + 1}' self.add_module(layer_name, res_layer) self.res_layers.append(layer_name) - self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1) + self.feat_dim = block.expansion * 64 * 2**( # type: ignore + len(stage_blocks) - 1) - def init_weights(self, pretrained=None): + def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint @@ -276,7 +282,7 @@ class ResNet(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x): + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: x = self.conv1(x) x = self.bn1(x) x = self.relu(x) @@ -292,8 +298,8 @@ class ResNet(nn.Module): else: return tuple(outs) - def train(self, mode=True): - super(ResNet, self).train(mode) + def train(self, mode: bool = True) -> None: + super().train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): diff --git a/mmcv/cnn/utils/flops_counter.py b/mmcv/cnn/utils/flops_counter.py index dceeb398bfc8a562d406136028381326ef55e0dc..150a55992a9561073626d26df503ba4ef37efa18 100644 --- a/mmcv/cnn/utils/flops_counter.py +++ b/mmcv/cnn/utils/flops_counter.py @@ -24,7 +24,9 @@ # SOFTWARE. 
import sys +import warnings from functools import partial +from typing import Any, Callable, Dict, Optional, TextIO, Tuple import numpy as np import torch @@ -33,13 +35,13 @@ import torch.nn as nn import mmcv -def get_model_complexity_info(model, - input_shape, - print_per_layer_stat=True, - as_strings=True, - input_constructor=None, - flush=False, - ost=sys.stdout): +def get_model_complexity_info(model: nn.Module, + input_shape: tuple, + print_per_layer_stat: bool = True, + as_strings: bool = True, + input_constructor: Optional[Callable] = None, + flush: bool = False, + ost: TextIO = sys.stdout) -> tuple: """Get complexity information of a model. This method can calculate FLOPs and parameter counts of a model with @@ -48,16 +50,16 @@ def get_model_complexity_info(model, Supported layers are listed as below: - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. - - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, - ``nn.ReLU6``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, + ``nn.LeakyReLU``, ``nn.ReLU6``. - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, - ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, - ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, - ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, - ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, - ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, - ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. - Linear: ``nn.Linear``. - Deconvolution: ``nn.ConvTranspose2d``. - Upsample: ``nn.Upsample``. @@ -78,8 +80,8 @@ def get_model_complexity_info(model, Returns: tuple[float | str]: If ``as_strings`` is set to True, it will return - FLOPs and parameter counts in a string format. otherwise, it will - return those in a float number format. + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. """ assert type(input_shape) is tuple assert len(input_shape) >= 1 @@ -115,7 +117,9 @@ def get_model_complexity_info(model, return flops_count, params_count -def flops_to_string(flops, units='GFLOPs', precision=2): +def flops_to_string(flops: float, + units: Optional[str] = 'GFLOPs', + precision: int = 2) -> str: """Convert FLOPs number into a string. Note that Here we take a multiply-add counts as one FLOP. @@ -158,7 +162,9 @@ def flops_to_string(flops, units='GFLOPs', precision=2): return str(flops) + ' FLOPs' -def params_to_string(num_params, units=None, precision=2): +def params_to_string(num_params: float, + units: Optional[str] = None, + precision: int = 2) -> str: """Convert parameter number into a string. Args: @@ -195,13 +201,13 @@ def params_to_string(num_params, units=None, precision=2): return str(num_params) -def print_model_with_flops(model, - total_flops, - total_params, - units='GFLOPs', - precision=3, - ost=sys.stdout, - flush=False): +def print_model_with_flops(model: nn.Module, + total_flops: float, + total_params: float, + units: Optional[str] = 'GFLOPs', + precision: int = 3, + ost: TextIO = sys.stdout, + flush: bool = False) -> None: """Print a model with FLOPs for each layer. 
Args: @@ -276,10 +282,10 @@ def print_model_with_flops(model, return ', '.join([ params_to_string( accumulated_num_params, units='M', precision=precision), - '{:.3%} Params'.format(accumulated_num_params / total_params), + f'{accumulated_num_params / total_params:.3%} Params', flops_to_string( accumulated_flops_cost, units=units, precision=precision), - '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), + f'{accumulated_flops_cost / total_flops:.3%} FLOPs', self.original_extra_repr() ]) @@ -304,7 +310,7 @@ def print_model_with_flops(model, model.apply(del_extra_repr) -def get_model_parameters_number(model): +def get_model_parameters_number(model: nn.Module) -> float: """Calculate parameter number of a model. Args: @@ -317,16 +323,16 @@ def get_model_parameters_number(model): return num_params -def add_flops_counting_methods(net_main_module): +def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module: # adding additional methods to the existing module object, # this is done this way so that each function has access to self object - net_main_module.start_flops_count = start_flops_count.__get__( + net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501 net_main_module) - net_main_module.stop_flops_count = stop_flops_count.__get__( + net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501 net_main_module) - net_main_module.reset_flops_count = reset_flops_count.__get__( + net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501 net_main_module) - net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501 net_main_module) net_main_module.reset_flops_count() @@ -334,7 +340,7 @@ def add_flops_counting_methods(net_main_module): return net_main_module -def compute_average_flops_cost(self): +def compute_average_flops_cost(self) -> Tuple[float, float]: """Compute average FLOPs cost. A method to compute average FLOPs cost, which will be available after @@ -352,7 +358,7 @@ def compute_average_flops_cost(self): return flops_sum / batches_count, params_sum -def start_flops_count(self): +def start_flops_count(self) -> None: """Activate the computation of mean flops consumption per image. A method to activate the computation of mean flops consumption per image. @@ -361,7 +367,7 @@ def start_flops_count(self): """ add_batch_counter_hook_function(self) - def add_flops_counter_hook_function(module): + def add_flops_counter_hook_function(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops_handle__'): return @@ -375,7 +381,7 @@ def start_flops_count(self): self.apply(partial(add_flops_counter_hook_function)) -def stop_flops_count(self): +def stop_flops_count(self) -> None: """Stop computing the mean flops consumption per image. A method to stop computing the mean flops consumption per image, which will @@ -386,7 +392,7 @@ def stop_flops_count(self): self.apply(remove_flops_counter_hook_function) -def reset_flops_count(self): +def reset_flops_count(self) -> None: """Reset statistics computed so far. 
A method to Reset computed statistics, which will be available after @@ -397,11 +403,13 @@ def reset_flops_count(self): # ---- Internal functions -def empty_flops_counter_hook(module, input, output): +def empty_flops_counter_hook(module: nn.Module, input: tuple, + output: Any) -> None: module.__flops__ += 0 -def upsample_flops_counter_hook(module, input, output): +def upsample_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: output_size = output[0] batch_size = output_size.shape[0] output_elements_count = batch_size @@ -410,39 +418,38 @@ def upsample_flops_counter_hook(module, input, output): module.__flops__ += int(output_elements_count) -def relu_flops_counter_hook(module, input, output): +def relu_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: active_elements_count = output.numel() module.__flops__ += int(active_elements_count) -def linear_flops_counter_hook(module, input, output): - input = input[0] +def linear_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: output_last_dim = output.shape[ -1] # pytorch checks dimensions, so here we don't care much - module.__flops__ += int(np.prod(input.shape) * output_last_dim) + module.__flops__ += int(np.prod(input[0].shape) * output_last_dim) -def pool_flops_counter_hook(module, input, output): - input = input[0] - module.__flops__ += int(np.prod(input.shape)) +def pool_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + module.__flops__ += int(np.prod(input[0].shape)) -def norm_flops_counter_hook(module, input, output): - input = input[0] - - batch_flops = np.prod(input.shape) +def norm_flops_counter_hook(module: nn.Module, input: tuple, + output: torch.Tensor) -> None: + batch_flops = np.prod(input[0].shape) if (getattr(module, 'affine', False) or getattr(module, 'elementwise_affine', False)): batch_flops *= 2 module.__flops__ += int(batch_flops) -def deconv_flops_counter_hook(conv_module, input, output): +def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: # Can have multiple inputs, getting the first one - input = input[0] - - batch_size = input.shape[0] - input_height, input_width = input.shape[2:] + batch_size = input[0].shape[0] + input_height, input_width = input[0].shape[2:] kernel_height, kernel_width = conv_module.kernel_size in_channels = conv_module.in_channels @@ -458,17 +465,16 @@ def deconv_flops_counter_hook(conv_module, input, output): bias_flops = 0 if conv_module.bias is not None: output_height, output_width = output.shape[2:] - bias_flops = out_channels * batch_size * output_height * output_height + bias_flops = out_channels * batch_size * output_height * output_width overall_flops = overall_conv_flops + bias_flops conv_module.__flops__ += int(overall_flops) -def conv_flops_counter_hook(conv_module, input, output): +def conv_flops_counter_hook(conv_module: nn.Module, input: tuple, + output: torch.Tensor) -> None: # Can have multiple inputs, getting the first one - input = input[0] - - batch_size = input.shape[0] + batch_size = input[0].shape[0] output_dims = list(output.shape[2:]) kernel_dims = list(conv_module.kernel_size) @@ -495,25 +501,23 @@ def conv_flops_counter_hook(conv_module, input, output): conv_module.__flops__ += int(overall_flops) -def batch_counter_hook(module, input, output): +def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None: batch_size = 1 if len(input) > 0: # Can have multiple inputs, 
getting the first one - input = input[0] - batch_size = len(input) + batch_size = len(input[0]) else: - pass - print('Warning! No positional inputs found for a module, ' - 'assuming batch size is 1.') + warnings.warn('No positional inputs found for a module, ' + 'assuming batch size is 1.') module.__batch_counter__ += batch_size -def add_batch_counter_variables_or_reset(module): +def add_batch_counter_variables_or_reset(module: nn.Module) -> None: module.__batch_counter__ = 0 -def add_batch_counter_hook_function(module): +def add_batch_counter_hook_function(module: nn.Module) -> None: if hasattr(module, '__batch_counter_handle__'): return @@ -521,36 +525,36 @@ def add_batch_counter_hook_function(module): module.__batch_counter_handle__ = handle -def remove_batch_counter_hook_function(module): +def remove_batch_counter_hook_function(module: nn.Module) -> None: if hasattr(module, '__batch_counter_handle__'): module.__batch_counter_handle__.remove() del module.__batch_counter_handle__ -def add_flops_counter_variable_or_reset(module): +def add_flops_counter_variable_or_reset(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops__') or hasattr(module, '__params__'): - print('Warning: variables __flops__ or __params__ are already ' - 'defined for the module' + type(module).__name__ + - ' ptflops can affect your code!') + warnings.warn('variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') module.__flops__ = 0 module.__params__ = get_model_parameters_number(module) -def is_supported_instance(module): +def is_supported_instance(module: nn.Module) -> bool: if type(module) in get_modules_mapping(): return True return False -def remove_flops_counter_hook_function(module): +def remove_flops_counter_hook_function(module: nn.Module) -> None: if is_supported_instance(module): if hasattr(module, '__flops_handle__'): module.__flops_handle__.remove() del module.__flops_handle__ -def get_modules_mapping(): +def get_modules_mapping() -> Dict: return { # convolutions nn.Conv1d: conv_flops_counter_hook, diff --git a/mmcv/cnn/utils/fuse_conv_bn.py b/mmcv/cnn/utils/fuse_conv_bn.py index cb7076f80bf37f7931185bf0293ffcc1ce19c8ef..6ccaab3bf1eb3ce615bad910d6dc45a467bb1fe4 100644 --- a/mmcv/cnn/utils/fuse_conv_bn.py +++ b/mmcv/cnn/utils/fuse_conv_bn.py @@ -3,7 +3,7 @@ import torch import torch.nn as nn -def _fuse_conv_bn(conv, bn): +def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module: """Fuse conv and bn into one module. Args: @@ -24,7 +24,7 @@ def _fuse_conv_bn(conv, bn): return conv -def fuse_conv_bn(module): +def fuse_conv_bn(module: nn.Module) -> nn.Module: """Recursively fuse conv and bn in a module. During inference, the functionary of batch norm layers is turned off diff --git a/mmcv/cnn/utils/sync_bn.py b/mmcv/cnn/utils/sync_bn.py index 8a79ff4a4f8dc70cf931fa319287682d4189e1a2..c534fc0e17506dde31c20529ce7bef64eef87140 100644 --- a/mmcv/cnn/utils/sync_bn.py +++ b/mmcv/cnn/utils/sync_bn.py @@ -1,9 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. import torch +import torch.nn as nn import mmcv -class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): +class _BatchNormXd(nn.modules.batchnorm._BatchNorm): """A general BatchNorm layer without input dimension check. Reproduced from @kapily's work: @@ -14,11 +16,11 @@ class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): SyncBatchNorm. 
""" - def _check_input_dim(self, input): + def _check_input_dim(self, input: torch.Tensor): return -def revert_sync_batchnorm(module): +def revert_sync_batchnorm(module: nn.Module) -> nn.Module: """Helper function to convert all `SyncBatchNorm` (SyncBN) and `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to `BatchNormXd` layers. diff --git a/mmcv/cnn/utils/weight_init.py b/mmcv/cnn/utils/weight_init.py index e1ac999e2470048ef05b3243b0d8b6959586785f..6e0d293ad4fb315462e34d5899ae6fccc4a7ba86 100644 --- a/mmcv/cnn/utils/weight_init.py +++ b/mmcv/cnn/utils/weight_init.py @@ -2,6 +2,7 @@ import copy import math import warnings +from typing import Dict, List, Optional, Union import numpy as np import torch @@ -13,7 +14,7 @@ from mmcv.utils import Registry, build_from_cfg, get_logger, print_log INITIALIZERS = Registry('initializer') -def update_init_info(module, init_info): +def update_init_info(module: nn.Module, init_info: str) -> None: """Update the `_params_init_info` in the module if the value of parameters are changed. @@ -45,14 +46,17 @@ def update_init_info(module, init_info): module._params_init_info[param]['tmp_mean_value'] = mean_value -def constant_init(module, val, bias=0): +def constant_init(module: nn.Module, val: float, bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: nn.init.constant_(module.weight, val) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) -def xavier_init(module, gain=1, bias=0, distribution='normal'): +def xavier_init(module: nn.Module, + gain: float = 1, + bias: float = 0, + distribution: str = 'normal') -> None: assert distribution in ['uniform', 'normal'] if hasattr(module, 'weight') and module.weight is not None: if distribution == 'uniform': @@ -63,7 +67,10 @@ def xavier_init(module, gain=1, bias=0, distribution='normal'): nn.init.constant_(module.bias, bias) -def normal_init(module, mean=0, std=1, bias=0): +def normal_init(module: nn.Module, + mean: float = 0, + std: float = 1, + bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: nn.init.normal_(module.weight, mean, std) if hasattr(module, 'bias') and module.bias is not None: @@ -82,19 +89,22 @@ def trunc_normal_init(module: nn.Module, nn.init.constant_(module.bias, bias) # type: ignore -def uniform_init(module, a=0, b=1, bias=0): +def uniform_init(module: nn.Module, + a: float = 0, + b: float = 1, + bias: float = 0) -> None: if hasattr(module, 'weight') and module.weight is not None: nn.init.uniform_(module.weight, a, b) if hasattr(module, 'bias') and module.bias is not None: nn.init.constant_(module.bias, bias) -def kaiming_init(module, - a=0, - mode='fan_out', - nonlinearity='relu', - bias=0, - distribution='normal'): +def kaiming_init(module: nn.Module, + a: float = 0, + mode: str = 'fan_out', + nonlinearity: str = 'relu', + bias: float = 0, + distribution: str = 'normal') -> None: assert distribution in ['uniform', 'normal'] if hasattr(module, 'weight') and module.weight is not None: if distribution == 'uniform': @@ -107,7 +117,7 @@ def kaiming_init(module, nn.init.constant_(module.bias, bias) -def caffe2_xavier_init(module, bias=0): +def caffe2_xavier_init(module: nn.Module, bias: float = 0) -> None: # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch # Acknowledgment to FAIR's internal code kaiming_init( @@ -119,19 +129,23 @@ def caffe2_xavier_init(module, bias=0): distribution='uniform') -def bias_init_with_prob(prior_prob): +def 
bias_init_with_prob(prior_prob: float) -> float: """initialize conv/fc bias value according to a given probability value.""" bias_init = float(-np.log((1 - prior_prob) / prior_prob)) return bias_init -def _get_bases_name(m): +def _get_bases_name(m: nn.Module) -> List[str]: return [b.__name__ for b in m.__class__.__bases__] -class BaseInit(object): +class BaseInit: - def __init__(self, *, bias=0, bias_prob=None, layer=None): + def __init__(self, + *, + bias: float = 0, + bias_prob: Optional[float] = None, + layer: Union[str, List, None] = None): self.wholemodule = False if not isinstance(bias, (int, float)): raise TypeError(f'bias must be a number, but got a {type(bias)}') @@ -154,7 +168,7 @@ class BaseInit(object): self.bias = bias self.layer = [layer] if isinstance(layer, str) else layer - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}, bias={self.bias}' return info @@ -172,11 +186,11 @@ class ConstantInit(BaseInit): Defaults to None. """ - def __init__(self, val, **kwargs): + def __init__(self, val: Union[int, float], **kwargs): super().__init__(**kwargs) self.val = val - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -191,7 +205,7 @@ class ConstantInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' return info @@ -214,12 +228,15 @@ class XavierInit(BaseInit): Defaults to None. """ - def __init__(self, gain=1, distribution='normal', **kwargs): + def __init__(self, + gain: float = 1, + distribution: str = 'normal', + **kwargs): super().__init__(**kwargs) self.gain = gain self.distribution = distribution - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -234,7 +251,7 @@ class XavierInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: gain={self.gain}, ' \ f'distribution={self.distribution}, bias={self.bias}' return info @@ -257,12 +274,12 @@ class NormalInit(BaseInit): """ - def __init__(self, mean=0, std=1, **kwargs): + def __init__(self, mean: float = 0, std: float = 1, **kwargs): super().__init__(**kwargs) self.mean = mean self.std = std - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -277,7 +294,7 @@ class NormalInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: mean={self.mean},' \ f' std={self.std}, bias={self.bias}' return info @@ -355,12 +372,12 @@ class UniformInit(BaseInit): Defaults to None. 
""" - def __init__(self, a=0, b=1, **kwargs): + def __init__(self, a: float = 0., b: float = 1., **kwargs): super().__init__(**kwargs) self.a = a self.b = b - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -375,7 +392,7 @@ class UniformInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: a={self.a},' \ f' b={self.b}, bias={self.bias}' return info @@ -409,10 +426,10 @@ class KaimingInit(BaseInit): """ def __init__(self, - a=0, - mode='fan_out', - nonlinearity='relu', - distribution='normal', + a: float = 0, + mode: str = 'fan_out', + nonlinearity: str = 'relu', + distribution: str = 'normal', **kwargs): super().__init__(**kwargs) self.a = a @@ -420,7 +437,7 @@ class KaimingInit(BaseInit): self.nonlinearity = nonlinearity self.distribution = distribution - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: def init(m): if self.wholemodule: @@ -437,7 +454,7 @@ class KaimingInit(BaseInit): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ f'nonlinearity={self.nonlinearity}, ' \ f'distribution ={self.distribution}, bias={self.bias}' @@ -456,12 +473,12 @@ class Caffe2XavierInit(KaimingInit): distribution='uniform', **kwargs) - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: super().__call__(module) @INITIALIZERS.register_module(name='Pretrained') -class PretrainedInit(object): +class PretrainedInit: """Initialize module by loading a pretrained model. Args: @@ -475,12 +492,15 @@ class PretrainedInit(object): map_location (str): map tensors into proper locations. """ - def __init__(self, checkpoint, prefix=None, map_location=None): + def __init__(self, + checkpoint: str, + prefix: Optional[str] = None, + map_location: Optional[str] = None): self.checkpoint = checkpoint self.prefix = prefix self.map_location = map_location - def __call__(self, module): + def __call__(self, module: nn.Module) -> None: from mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint, load_state_dict) logger = get_logger('mmcv') @@ -503,12 +523,14 @@ class PretrainedInit(object): if hasattr(module, '_params_init_info'): update_init_info(module, init_info=self._get_init_info()) - def _get_init_info(self): + def _get_init_info(self) -> str: info = f'{self.__class__.__name__}: load from {self.checkpoint}' return info -def _initialize(module, cfg, wholemodule=False): +def _initialize(module: nn.Module, + cfg: Dict, + wholemodule: bool = False) -> None: func = build_from_cfg(cfg, INITIALIZERS) # wholemodule flag is for override mode, there is no layer key in override # and initializer will give init values for the whole module with the name @@ -517,7 +539,8 @@ def _initialize(module, cfg, wholemodule=False): func(module) -def _initialize_override(module, override, cfg): +def _initialize_override(module: nn.Module, override: Union[Dict, List], + cfg: Dict) -> None: if not isinstance(override, (dict, list)): raise TypeError(f'override must be a dict or a list of dict, \ but got {type(override)}') @@ -547,8 +570,8 @@ def _initialize_override(module, override, cfg): f'but init_cfg is {cp_override}.') -def initialize(module, init_cfg): - """Initialize a module. 
+def initialize(module: nn.Module, init_cfg: Union[Dict, List[dict]]) -> None: + r"""Initialize a module. Args: module (``torch.nn.Module``): the module will be initialized. @@ -556,6 +579,7 @@ def initialize(module, init_cfg): define initializer. OpenMMLab has implemented 6 initializers including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, ``Kaiming``, and ``Pretrained``. + Example: >>> module = nn.Linear(2, 3, bias=True) >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) diff --git a/mmcv/cnn/vgg.py b/mmcv/cnn/vgg.py index 8778b649561a45a9652b1a15a26c2d171e58f3e1..a1d9ba211eb4b0056eb4127e19159e9ed5d5251f 100644 --- a/mmcv/cnn/vgg.py +++ b/mmcv/cnn/vgg.py @@ -1,12 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import logging +from typing import List, Optional, Sequence, Tuple, Union import torch.nn as nn +from torch import Tensor from .utils import constant_init, kaiming_init, normal_init -def conv3x3(in_planes, out_planes, dilation=1): +def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: """3x3 convolution with padding.""" return nn.Conv2d( in_planes, @@ -16,12 +18,12 @@ def conv3x3(in_planes, out_planes, dilation=1): dilation=dilation) -def make_vgg_layer(inplanes, - planes, - num_blocks, - dilation=1, - with_bn=False, - ceil_mode=False): +def make_vgg_layer(inplanes: int, + planes: int, + num_blocks: int, + dilation: int = 1, + with_bn: bool = False, + ceil_mode: bool = False) -> List[nn.Module]: layers = [] for _ in range(num_blocks): layers.append(conv3x3(inplanes, planes, dilation)) @@ -59,18 +61,18 @@ class VGG(nn.Module): } def __init__(self, - depth, - with_bn=False, - num_classes=-1, - num_stages=5, - dilations=(1, 1, 1, 1, 1), - out_indices=(0, 1, 2, 3, 4), - frozen_stages=-1, - bn_eval=True, - bn_frozen=False, - ceil_mode=False, - with_last_pool=True): - super(VGG, self).__init__() + depth: int, + with_bn: bool = False, + num_classes: int = -1, + num_stages: int = 5, + dilations: Sequence[int] = (1, 1, 1, 1, 1), + out_indices: Sequence[int] = (0, 1, 2, 3, 4), + frozen_stages: int = -1, + bn_eval: bool = True, + bn_frozen: bool = False, + ceil_mode: bool = False, + with_last_pool: bool = True): + super().__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for vgg') assert num_stages >= 1 and num_stages <= 5 @@ -122,7 +124,7 @@ class VGG(nn.Module): nn.Linear(4096, num_classes), ) - def init_weights(self, pretrained=None): + def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() from ..runner import load_checkpoint @@ -138,7 +140,7 @@ class VGG(nn.Module): else: raise TypeError('pretrained must be a str or None') - def forward(self, x): + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]: outs = [] vgg_layers = getattr(self, self.module_name) for i in range(len(self.stage_blocks)): @@ -156,8 +158,8 @@ class VGG(nn.Module): else: return tuple(outs) - def train(self, mode=True): - super(VGG, self).train(mode) + def train(self, mode: bool = True) -> None: + super().train(mode) if self.bn_eval: for m in self.modules(): if isinstance(m, nn.BatchNorm2d): diff --git a/mmcv/device/__init__.py b/mmcv/device/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ba217b0771bcfada461d7c61a78f41a274e5aa6a --- /dev/null +++ b/mmcv/device/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from . 
import ipu, mlu, mps +from .scatter_gather import scatter, scatter_kwargs +from .utils import get_device + +__all__ = ['mlu', 'ipu', 'mps', 'get_device', 'scatter', 'scatter_kwargs'] diff --git a/mmcv/device/_functions.py b/mmcv/device/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..462a7e4ddca14685047b7937e3054108e164cf91 --- /dev/null +++ b/mmcv/device/_functions.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch + +from mmcv.utils import deprecated_api_warning +from .utils import get_device + + +def scatter(input: Union[List, torch.Tensor], devices: List) -> List: + """scatter copies tensor to devices directly.""" + current_device = get_device() + if isinstance(input, list): + outputs = [scatter(_input, devices) for _input in input] + return outputs + elif isinstance(input, torch.Tensor): + output = input.contiguous() + return output.to(current_device) if devices != [-1] else output + else: + raise Exception(f'Unknown type {type(input)}.') + + +class Scatter: + + @staticmethod + @deprecated_api_warning({'target_mlus': 'target_devices'}, + cls_name='Scatter') + def forward(target_devices, input): + outputs = scatter(input, target_devices) + return tuple(outputs) if isinstance(outputs, list) else (outputs, ) diff --git a/mmcv/device/ipu/__init__.py b/mmcv/device/ipu/__init__.py new file mode 100755 index 0000000000000000000000000000000000000000..d550865ad20790f0eb79015abc866548c0f2f83b --- /dev/null +++ b/mmcv/device/ipu/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import IS_IPU_AVAILABLE + +if IS_IPU_AVAILABLE: + from .dataloader import IPUDataLoader + from .hook_wrapper import IPUFp16OptimizerHook + from .model_wrapper import ipu_model_wrapper + from .runner import IPUBaseRunner, IPUEpochBasedRunner, IPUIterBasedRunner + from .utils import cfg2options + __all__ = [ + 'cfg2options', 'ipu_model_wrapper', 'IPUFp16OptimizerHook', + 'IPUDataLoader', 'IPUBaseRunner', 'IPUEpochBasedRunner', + 'IPUIterBasedRunner' + ] diff --git a/mmcv/device/ipu/dataloader.py b/mmcv/device/ipu/dataloader.py new file mode 100755 index 0000000000000000000000000000000000000000..1485df2f31facff79238c70d89fdd9030fddcbce --- /dev/null +++ b/mmcv/device/ipu/dataloader.py @@ -0,0 +1,157 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Mapping, Sequence +from functools import partial + +import poptorch +from torch.utils.data.dataloader import default_collate + +from mmcv.parallel import DataContainer + + +def collate(batch, samples_per_gpu=1): + """Put each data field into a tensor/DataContainer with outer dimension + batch size. + + TODO support for + :type:`~mmcv.parallel.DataContainer`. Currently, it will be ignored. + There are 3 cases. + + 1. cpu_only = True, e.g., meta data. + 2. cpu_only = False, stack = True, e.g., images tensors. + 3. cpu_only = False, stack = False, e.g., gt bboxes. + """ + + if not isinstance(batch, Sequence): + raise TypeError( + f'`batch` should be a sequence, but got {type(batch)}.') + + if isinstance(batch[0], DataContainer): + # TODO `DataContainer` will be supported in the future. 
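For orientation, a minimal sketch of how the device-agnostic `scatter` helper introduced above is meant to behave. Import paths and semantics are taken from this diff; the device string returned by `get_device` depends on which backend (CUDA, MLU, MPS or CPU) is actually available, so the printed devices are only illustrative.

```python
import torch

from mmcv.device import get_device
from mmcv.device._functions import scatter

batch = [torch.ones(2, 3), torch.zeros(4)]
moved = scatter(batch, [0])    # tensors copied to the current device
kept = scatter(batch, [-1])    # devices == [-1]: returned contiguous, not moved
print(get_device(), [t.device for t in moved])
```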
+ raise TypeError('DataContainer is not supported in ipu data loader.') + elif isinstance(batch[0], Sequence): + transposed = zip(*batch) + collated_batch = [] + for samples in transposed: + if not isinstance(samples[0], DataContainer): + # At present, we will skip the processing of DataContainer, + # which will reduce the performance of the IPU DataLoader + collated_batch.append(collate(samples, samples_per_gpu)) + return collated_batch + elif isinstance(batch[0], Mapping): + collated_batch = {} + for key in batch[0]: + if not isinstance(batch[0][key], DataContainer): + # At present, we will skip the processing of DataContainer, + # which will reduce the performance of the IPU DataLoader + collated_batch[key] = collate([d[key] for d in batch]) + return collated_batch + else: + return default_collate(batch) + + +class IPUDataLoader(poptorch.DataLoader): + """Thin wrapper of `torch.utils.data.DataLoader`. + + Compared with the PyTorch DataLoader, this DataLoader changes the way the + batch size is calculated and adds the AsynchronousDataAccessor to + load and release data faster in CPU mode. + + If this data loader is used in a distributed execution environment, it will + ensure that each process uses a different subset of the dataset, provided + that you first call ``options.randomSeed(N)`` with an integer N which is the + same across all hosts. + + Args: + dataset (torch.utils.data.Dataset): The dataset to get the data from. + options (poptorch.Options): Options that will be used to compile + and run the model. + batch_size (int, optional): This is the batch size in the conventional + sense of being the size that runs through an operation in the model + at any given time. + shuffle (bool, optional): set to ``True`` to have the data reshuffled + at every epoch (default: ``False``). + num_workers (int, optional): how many subprocesses to use for data + loading. ``0`` means that the data will be loaded in the main + process. (default: ``0``) + drop_last (bool, optional): If True and the number of elements in the + dataset is not a multiple of the combined batch size then the + incomplete batch at the end will be dropped. + persistent_workers (bool, optional): Re-use workers between + iterations if True. + auto_distributed_partitioning (bool, optional): If True, partitions the + dataset for distributed execution automatically. Otherwise, it is + assumed that partitioning has been handled manually. + mode (poptorch.DataLoaderMode, optional): If `DataLoaderMode.Async`, + uses an :py:class:`~poptorch.AsynchronousDataAccessor` to access + the dataset. If `DataLoaderMode.Sync`, accesses the dataset + synchronously. + async_options (Dict[str, Any], optional): Options to pass to + :py:class:`~poptorch.AsynchronousDataAccessor`. + rebatched_worker_size (int, optional): When using AsyncRebatched: batch + size of the tensors loaded by the workers. + Defaults to the combined batch size. + If specified the ``rebatched_worker_size`` must be less than + or equal to the combined batch size. + kwargs (Dict[str, Any], optional): Other options to pass to PyTorch's + ``DataLoader`` constructor.
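A rough usage sketch of the lazily initialized `IPUDataLoader` described above (requires `poptorch`; `train_dataset` and `ipu_options` are placeholders, the options would normally come from `cfg2options` later in this diff):

```python
from mmcv.device.ipu import IPUDataLoader

# Built without options: the real poptorch.DataLoader is not created yet.
loader = IPUDataLoader(train_dataset, options=None, batch_size=2, mode='async')

# Later, once poptorch.Options are available (e.g. cfg2options(...)['training']):
loader.init(options=ipu_options['training'])
for data_batch in loader:
    ...  # batches are collated by the `collate` function above
```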
+ """ + + def __init__(self, + dataset, + options, + batch_size=1, + shuffle=False, + num_workers=0, + drop_last=True, + persistent_workers=True, + auto_distributed_partitioning=True, + mode='sync', + async_options=None, + rebatched_worker_size=None, + **kwargs): + """Lazy init: + + In many frameworks, the dataloader will be constructed before the + initialization of the ipu options, so the lazy init method is used + here, and the real initialization will not be done until the dataloader + needs to be used and the options are input. + """ + # lazy init: sometimes, we cannot get IPU options when build data + # loader + self.kwargs = { + 'dataset': dataset, + 'batch_size': batch_size, + 'shuffle': shuffle, + 'num_workers': num_workers, + 'drop_last': drop_last, + 'persistent_workers': persistent_workers, + 'auto_distributed_partitioning': auto_distributed_partitioning, + 'mode': mode, + 'collate_fn': partial(collate, samples_per_gpu=batch_size), + 'async_options': async_options, + 'rebatched_worker_size': rebatched_worker_size, + **kwargs + } + self.dataset = dataset + self.initialized = False + if options: + self.init(options=options) + + def init(self, options, **kwargs): + if not self.initialized: + kwargs = {**self.kwargs, **kwargs, 'options': options} + if kwargs['mode'] == 'sync': + kwargs['mode'] = poptorch.DataLoaderMode.Sync + elif kwargs['mode'] == 'async': + kwargs['mode'] = poptorch.DataLoaderMode.AsyncRebatched + if kwargs['async_options'] is None: + kwargs['async_options'] = { + 'load_indefinitely': True, + 'buffer_size': 8 + } + if kwargs['rebatched_worker_size'] is None: + kwargs['rebatched_worker_size'] = 128 + super().__init__(**kwargs) + self.initialized = True + + return self diff --git a/mmcv/device/ipu/hierarchical_data_manager.py b/mmcv/device/ipu/hierarchical_data_manager.py new file mode 100755 index 0000000000000000000000000000000000000000..a6f3b3cd2a139bcbc7852e7849071ab4b9fbb76f --- /dev/null +++ b/mmcv/device/ipu/hierarchical_data_manager.py @@ -0,0 +1,243 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch + +from mmcv.parallel import DataContainer + +# A customized None type for HierarchicalDataManager +HierarchicalDataNone = object() + + +class HierarchicalDataManager: + """A class manage all the tensors in the hierarchical data. + + At present, the input data structure accepted by IPU is limited, + when the input data structure of mmcv varies. + Here, an intermediate class is needed to get and update tensors + from the original data. + + HierarchicalDataManager will record a hierarchical input/output data in + self._hierarchical_data. For example, we have an input data: + {'img': tensorA, 'label': tensorB, 'img_metas': [tensorC, tensorD]} + To enable IPU to use the input, HierarchicalDataManager will collect + the torch tensors from self._hierarchical_data into a tuple like: + (tensorA, tensorB, tensorC, tensorD). + Meanwhile, the return of IPU is a tuple of tensors, HierarchicalDataManager + also have a function named update_all_tensors to update tensors in + self._hierarchical_data which is the output for upper calls. + + Args: + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. 
+ """ + + def __init__(self, logger=None): + self.atomic_types = (int, str, float, np.ndarray, type(None)) + self.warning = warnings.warn if logger is None else logger.warning + # enable or disable input data's shape and value check + self.quick_mode = False + self._hierarchical_data = None + + def quick(self): + self.quick_mode = True + + def compare_atomic_type(self, a, b): + """Compare data, supported datatypes are numpy array and python basic + types.""" + if isinstance(a, np.ndarray): + return np.all(a == b) + else: + return a == b + + def record_hierarchical_data(self, data): + """Record a hierarchical data.""" + if self._hierarchical_data is not None: + if isinstance(data, torch.Tensor): + assert isinstance(self._hierarchical_data, torch.Tensor), \ + 'original hierarchical data is not torch.tensor' + self._hierarchical_data = data + else: + self.update_hierarchical_data(data) + else: + self._hierarchical_data = data + + @property + def hierarchical_data(self): + return self._hierarchical_data + + def update_hierarchical_data(self, + dataA, + dataB=HierarchicalDataNone, + strict=True, + address='data'): + """Update dataB with dataA in-place. + + Args: + dataA (list or dict or tuple): New hierarchical data. + dataB (list or dict or tuple): hierarchical data to update. + if not specified, self.hierarchical_data will be updated then. + strict (bool, optional): If true, an error will be reported + when the following conditions occur: + 1. Non-torch.Tensor data changed. + 2. Torch.Tensor data shape changed. + address (str): Record the address of current data to be updated. + Default: 'data'. + """ + if dataB is HierarchicalDataNone: + dataB = self.hierarchical_data + + # Update with a da ta with the same structure + # but different values(tensors and basic python data types) + if isinstance(dataA, (tuple, list)): + for idx, node in enumerate(dataA): + new_address = '' + if not self.quick_mode: + new_address = address + f'[{str(idx)}]' + assert isinstance(node, type(dataB[idx])),\ + f'data structure changed: {new_address}' + if isinstance(node, torch.Tensor): + dataB[idx] = node + else: + self.update_hierarchical_data( + node, dataB[idx], strict, address=new_address) + elif isinstance(dataA, dict): + for k, v in dataA.items(): + new_address = '' + if not self.quick_mode: + new_address = address + f'[{str(k)}]' + assert isinstance(v, type(dataB[k])),\ + f'data structure changed: {new_address}' + if isinstance(v, torch.Tensor): + dataB[k] = v + else: + self.update_hierarchical_data( + v, dataB[k], strict, address=new_address) + elif isinstance(dataA, self.atomic_types): + if not self.quick_mode: + is_equal = self.compare_atomic_type(dataA, dataB) + if not is_equal: + if strict: + raise ValueError( + 'all data except torch.Tensor should be same, ' + f'but data({address}) is changed.') + else: + self.warning( + f'find a non-torch.Tensor data({type(dataA)}) ' + f'changed, and the address is {address}') + elif isinstance(dataA, DataContainer): + if not self.quick_mode: + assert isinstance(dataB, DataContainer) + new_address = address + '.data' + self.update_hierarchical_data( + dataA.data, dataB.data, False, address=new_address) + else: + raise NotImplementedError( + f'not supported datatype:{type(dataA)}, address is {address}') + + def collect_all_tensors(self, hierarchical_data=None): + """Collect torch.Tensor data from self.hierarchical_data to a list and + return.""" + # get a list of tensor from self._hierarchical_data + if hierarchical_data is None: + hierarchical_data = 
self._hierarchical_data + tensors = [] + if isinstance(hierarchical_data, torch.Tensor): + tensors = [hierarchical_data] + else: + self._collect_tensors(hierarchical_data, tensors) + return tensors + + def _collect_tensors(self, data, tensors): + if isinstance(data, (tuple, list)): + for node in data: + if isinstance(node, torch.Tensor): + tensors.append(node) + else: + self._collect_tensors(node, tensors) + elif isinstance(data, dict): + for v in data.values(): + if isinstance(v, torch.Tensor): + tensors.append(v) + else: + self._collect_tensors(v, tensors) + elif isinstance(data, self.atomic_types): + pass + elif isinstance(data, DataContainer): + self._collect_tensors(data.data, tensors) + else: + raise NotImplementedError(f'not supported datatype:{type(data)}') + + def update_all_tensors(self, tensors): + """Put tensors from tuple back to self.hierarchical_data.""" + if isinstance(self._hierarchical_data, torch.Tensor): + print(tensors, len(tensors)) + assert len(tensors) == 1 + assert isinstance(tensors[0], torch.Tensor) + self._hierarchical_data = tensors[0] + else: + # convert to list if tensors is tuple + tensors = list(tensors) + self._set_tensors(self._hierarchical_data, tensors) + return self.hierarchical_data + + def _set_tensors(self, data, tensors): + if isinstance(data, tuple): + data = list(data) + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = tensors.pop(0) + else: + self._set_tensors(data[idx], tensors) + data = tuple(data) + elif isinstance(data, list): + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = tensors.pop(0) + else: + self._set_tensors(data[idx], tensors) + elif isinstance(data, dict): + for k, v in data.items(): + if isinstance(v, torch.Tensor): + data[k] = tensors.pop(0) + else: + self._set_tensors(v, tensors) + elif isinstance(data, self.atomic_types): + pass + elif isinstance(data, DataContainer): + self._set_tensors(data.data, tensors) + else: + raise NotImplementedError(f'not supported datatype:{type(data)}') + + def clean_all_tensors(self): + """Delete tensors from self.hierarchical_data.""" + self._clean_tensors(self._hierarchical_data) + + def _clean_tensors(self, data): + if isinstance(data, tuple): + data = list(data) + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = None + else: + self._clean_tensors(data[idx]) + data = tuple(data) + elif isinstance(data, list): + for idx in range(len(data)): + if isinstance(data[idx], torch.Tensor): + data[idx] = None + else: + self._clean_tensors(data[idx]) + elif isinstance(data, dict): + for k, v in data.items(): + if isinstance(v, torch.Tensor): + data[k] = None + else: + self._clean_tensors(v) + elif isinstance(data, self.atomic_types): + pass + elif isinstance(data, DataContainer): + self._clean_tensors(data.data) + else: + raise NotImplementedError(f'not supported datatype:{type(data)}') diff --git a/mmcv/device/ipu/hook_wrapper.py b/mmcv/device/ipu/hook_wrapper.py new file mode 100755 index 0000000000000000000000000000000000000000..141afb86d05a42c06fb5c4355cb47cae18e9bb2f --- /dev/null +++ b/mmcv/device/ipu/hook_wrapper.py @@ -0,0 +1,105 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.runner import HOOKS, LrUpdaterHook, OptimizerHook +from mmcv.utils import TORCH_VERSION, digit_version + + +def wrap_lr_updater_hook(lr_hook_class): + """A wrapper function to wrap any subclass of LrUpdaterHook. + + IPU needs extra operations to upload optimizer settings. 
This wrapper will + override function(_set_lr) of a subclass of LrUpdaterHook. + """ + assert issubclass(lr_hook_class, LrUpdaterHook) + + class ipu_lr_hook_class(lr_hook_class): + + def _set_lr(self, runner, *args, **kwargs): + super()._set_lr(runner, *args, **kwargs) + # convert torch optimizer to poptorch optimizer + runner.model.setOptimizer(runner.optimizer) + + return ipu_lr_hook_class + + +def wrap_optimizer_hook(optimizer_hook_class): + """A wrapper function to wrap OptimizerHook. + + This is an non-intrusive implementation of wrapping optimizer hook (or you + need to change every config file to use IPU optimizer hook) IPU's clip-norm + implementation is different from pytorch, so there should be an error + raised when using clip-norm. + """ + + class ipu_optimizer_hook_class(OptimizerHook): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + if self.grad_clip is not None: + raise NotImplementedError('IPU does not support gradient clip') + + return ipu_optimizer_hook_class + + +if (TORCH_VERSION != 'parrots' + and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + + @HOOKS.register_module() + class IPUFp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook (using PyTorch's implementation). + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, + to take care of the optimization procedure. + + Args: + loss_scale (float | str | dict): Scale factor configuration. + If loss_scale is a float, static loss scaling will be used with + the specified scale. If loss_scale is a string, it must be + 'dynamic', then dynamic loss scaling will be used. + It can also be a dict containing arguments of GradScalar. + Defaults to 512. For Pytorch >= 1.6, mmcv uses official + implementation of GradScaler. If you use a dict version of + loss_scale to create GradScaler, please refer to: + https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler + for the parameters. + + Examples: + >>> loss_scale = dict( + ... init_scale=65536.0, + ... growth_factor=2.0, + ... backoff_factor=0.5, + ... growth_interval=2000 + ... ) + >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + assert grad_clip is None,\ + 'IPU mode does not support `grad_clip` currently' + assert coalesce,\ + 'implemented all reduce in distributed training currently' + assert bucket_size_mb == -1,\ + '`bucket_size_mb` should not be set in IPU mode' + self.distributed = distributed + self._scale_update_param = None + if loss_scale == 'dynamic': + raise NotImplementedError( + 'IPU mode does not support dynamic loss scale currently') + elif isinstance(loss_scale, float): + self.loss_scale = loss_scale + elif isinstance(loss_scale, dict): + raise NotImplementedError( + 'IPU mode supports single scale currently') + else: + raise ValueError( + f'loss_scale should be float, but got {loss_scale} ') + + def after_train_iter(self, runner): + pass + +else: + raise RuntimeError('The IPU mode only supports torch 1.6 and above') diff --git a/mmcv/device/ipu/model_wrapper.py b/mmcv/device/ipu/model_wrapper.py new file mode 100755 index 0000000000000000000000000000000000000000..c345537e29b27cf7fff740269da8643c9570cd36 --- /dev/null +++ b/mmcv/device/ipu/model_wrapper.py @@ -0,0 +1,721 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
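As a hedged configuration sketch, the fp16 hook defined just above could be instantiated like this; per the checks in its constructor, only a static `loss_scale` is accepted and `grad_clip` must stay `None`:

```python
from mmcv.device.ipu import IPUFp16OptimizerHook

# Static loss scaling only: dynamic scaling and dict configs raise
# NotImplementedError, and gradient clipping is not supported on IPU.
optimizer_config = IPUFp16OptimizerHook(loss_scale=512.0, distributed=False)
```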
+import copy +import inspect +from collections import OrderedDict +from typing import Optional, Union + +import poptorch +import torch +import torch.nn as nn +from poptorch import PoplarExecutor, __version__, identity_loss +from poptorch._args_parser import ArgsParser + +from mmcv.runner import auto_fp16 +from .hierarchical_data_manager import HierarchicalDataManager +from .utils import compare_ndarray, model_sharding, recomputation_checkpoint + + +class DictArgsParser(ArgsParser): + """A helper class for handling model input. + + Args: + inputs (list): Inputs of model. + """ + + def __init__(self, inputs): + # Combine args and kwargs: + self._has_variadic_arguments = True + self._varnames = list(inputs.keys()) + self._defaults = [inspect.Parameter.empty for _ in self._varnames] + self._warned_not_contiguous_input = False + + +class WrappedNet(nn.Module): + """A net wrapper for model conversion. + + This wrapper will make some changes and add some extra functions to + training/inference model. + + Args: + model (:obj:`nn.Module`): The model to run. + inputs_manager (:obj:`HierarchicalDataManager`): A parser + converting inputs from tuple to dictionary. + outputs_manager (:obj:`HierarchicalDataManager`): A parser + converting outputs from dictionary to tuple. + inter_outputs_in_cpu (dict): Specify the features to be + recorded. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + """ + + def __init__(self, + model, + inputs_manager, + outputs_manager, + inter_outputs_in_cpu, + modules_to_record=None): + super().__init__() + self.model = model + self.inputs_manager = inputs_manager + self.outputs_manager = outputs_manager + self.training = model.training + # Register a hook function to capture the intermediate features + # generated by the network to align the outputs between ipu and cpu + # Used to confirm whether the implementation of CPU is consistent + # with the implementation of IPU + self.inter_outputs_in_cpu = inter_outputs_in_cpu + if modules_to_record is None: + modules_to_record = [] + + for idx, (name, module) in enumerate(model.named_modules()): + if name in modules_to_record or idx in modules_to_record: + features_hook = self.get_input_output_hook( + name, idx, self.inter_outputs_in_cpu) + module.register_forward_hook(hook=features_hook) + + def get_input_output_hook(self, name, idx, save_dict): + + def input_output_hook(module, fea_in, fea_out): + if isinstance(fea_in, tuple): + fea_in = list(fea_in) + if isinstance(fea_out, tuple): + fea_out = list(fea_out) + save_dict[name] = { + 'fea_in': fea_in, + 'fea_out': fea_out, + 'idx': idx + } + return None + + return input_output_hook + + def forward(self, inputs_tuple): + """This function is used to be compiled to ipu, the inputs and outputs + need to be tuples, so here we need to restore the input back to a + dictionary and convert the output to a tuple.""" + self.inputs_manager.update_all_tensors(inputs_tuple) + kwargs = {**(self.inputs_manager.hierarchical_data)} + if self.training: + outputs = self.forward_train(kwargs) + # tell poptorch which loss will be used finally + identity_loss(outputs['loss'], reduction='none') + else: + outputs = self.forward_eval(kwargs) + + if isinstance(outputs, torch.Tensor): + # currently not support single tensor output, + # need to wrap it with a dictionary, + # use a keyword to identify this case + outputs = {'output of WrappedNet: single tensor': outputs} + 
+ # if there are some features need to be record, add extra outputs + for name in self.inter_outputs_in_cpu: + outputs[name] = self.inter_outputs_in_cpu[name] + + # record all the places of return tensors in the converting stage + # while in the real run stage, all the tensor are changed in-place + # that means the output can be obtained directly outside this function + self.outputs_manager.record_hierarchical_data(outputs) + plain_outputs = self.outputs_manager.collect_all_tensors() + return plain_outputs + + def forward_train(self, kwargs): + optimizer = kwargs.pop('optimizer') + outputs = self.train_step(kwargs, optimizer) + return outputs + + def train_step(self, data, optimizer=None, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating are also defined in + this method, such as GAN. + + Args: + data (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer`, optional): The + optimizer of runner is passed to ``train_step()``. This + argument is unused and reserved. + + Returns: + dict: Dict of outputs. The following fields are contained. + - loss (torch.Tensor): A tensor for back propagation, which \ + can be a weighted sum of multiple losses. + - log_vars (dict): Dict contains all the variables to be sent \ + to the logger. + - num_samples (int): Indicates the batch size (when the model \ + is DDP, it means the batch size on each GPU), which is \ + used for averaging the logs. + """ + losses = self.model(**data) + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) + + return outputs + + def _parse_losses(self, losses): + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(loss.mean() for loss in loss_value) + elif isinstance(loss_value, dict): + for name, value in loss_value.items(): + log_vars[name] = value + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(value for key, value in log_vars.items() if 'loss' in key) + log_vars['loss'] = loss + + return loss, log_vars + + def forward_eval(self, kwargs): + img = kwargs.pop('img') + img_metas = kwargs.pop('img_metas', None) + return_loss = kwargs.pop('return_loss') + assert not return_loss + # TODO Temporarily hard-code to close post_process, + # otherwise, in the third trace(_check_trace), + # post_process will convert output tensor to numpy array automatically, + # resulting in _check_trace failure + outputs = self.model( + img, + img_metas=img_metas, + return_loss=return_loss, + post_process=False) + return outputs + + +class MMPoplarExecutor(PoplarExecutor): + """An executor for inputs/outputs parsing, model compilation, data + alignment and IPU upload/download. + + Args: + model (:obj:`nn.Module`): The model to be compiled. + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. + training (bool): Model in training mode or eval mode. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. 
+ args (argument list): Arguments passed to the `__init__` + method of PoplarExecutor. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of PoplarExecutor. + """ + + def __init__(self, + model, + logger=None, + training=True, + modules_to_record=None, + *args, + **kwargs): + # self.model == self._user_model: input pytorch model + # self._model: wrapped model which is used to compile + # and update weights, these two models use same weights + # wrapped model only accept and output tuple, so + # HierarchicalDataManager will convert dictionary + # to tuple and convert them back + self.inputs_manager = HierarchicalDataManager(logger=logger) + self.outputs_manager = HierarchicalDataManager(logger=logger) + self.logger = logger + # the features calculated by CPU + self.inter_outputs_in_cpu = {} + # the features calculated by IPU + self.inter_outputs_in_ipu = {} + if modules_to_record is None: + # It is possible that the IPU implementation of some operators + # is inconsistent with the expected (CPU), here you can use + # this method to confirm whether there is a problem + self.compare_with_cpu = False + else: + self.compare_with_cpu = True + # move model.fp16_enabled to self.fp16_enabled, + # modify the position where the input is automatically casted to half + if getattr(model, 'fp16_enabled', False): + model.fp16_enabled = False + self.fp16_enabled = True + # make torch.jit.trace convert self._model + model = WrappedNet( + model, + self.inputs_manager, + self.outputs_manager, + self.inter_outputs_in_cpu, + modules_to_record=modules_to_record) + super().__init__(model, training=training, *args, **kwargs) + # overwrite self._args_parser in train_step or val_step + self._args_parser = None + if training: + assert self.training + else: + assert not self.training + + @property + def training(self): + # If trying to get the attribute(training) of self, + # since the class has no training attribute, + # it will automatically look for the training attribute of self.model. + # However, the real attribute we want to check is self._training, + # self.model.training and self._training are often inconsistent. 
+ # It is not clear whether it is a Poptorch bug or a special design, + # temporarily use this function to fix the problem + return self._training # comes from self.model._training + + @auto_fp16(supported_types=(PoplarExecutor, )) + def run_model(self, data_dict): + # this function is used to parse input_dict + # and convert to output_dict + if self.isCompiled(): + self.inputs_manager.record_hierarchical_data(data_dict) + inputs_tuple = tuple(self.inputs_manager.collect_all_tensors()) + else: + # get tensors out of data and put them in a tuple + self.inputs_manager.record_hierarchical_data(data_dict) + inputs_tuple = tuple(self.inputs_manager.collect_all_tensors()) + # turn logger in data manager off after compilation + self.inputs_manager.quick() + self.outputs_manager.quick() + + # parser args in the first iter + if self._args_parser is None: + self._args_parser = DictArgsParser({'args': inputs_tuple}) + + # run or convert model + # the plain_outputs will be used in converting stage + plain_outputs = self(inputs_tuple) + + self.inputs_manager.clean_all_tensors() + + # put list of tensors back to the output dict + # according to the same order + self.outputs_manager.update_all_tensors(plain_outputs) + # get the real output dictionary from self.outputs_manager + output_dict = self.outputs_manager.hierarchical_data + + # split output_dict into inter_outputs_in_ipu + # and output of the torch model + torch_model_output = {} + for name in output_dict: + if name in self.inter_outputs_in_cpu: + self.inter_outputs_in_ipu[name] = output_dict[name] + else: + torch_model_output[name] = output_dict[name] + + if 'output of WrappedNet: single tensor' in output_dict: + assert len(torch_model_output) == 1 + assert isinstance( + torch_model_output['output of WrappedNet: single tensor'], + torch.Tensor) + torch_model_output = \ + torch_model_output['output of WrappedNet: single tensor'] + + return torch_model_output + + def train_step(self, data, optimizer=None, **kwargs): + # arguments from mmcls/models/classifiers/base.py: + # BaseClassifier.train_step + assert self.training + assert len(kwargs) == 0 # TODO, support later if necessary + + # TODO support datacontainer as input + # currently, auto_fp16 and HierarchicalDataManager take too much + # time on traversing datacontainer + data['img_metas'] = None + num_samples = len(data['img'].data) + + # TODO we will ignore optimizer because it will not be used in model, + # support later if necessary + data['optimizer'] = None + output_dict = self.run_model(data) + + # outputs contained loss, log_vars, num_samples, + # only loss(torch.tensor) has been updated + # remove all unchanged vars, left torch.tensor + neat_output_dict = {'loss': output_dict['loss']} + + # re-parse outputs, get back log_vars and num_samples + loss, log_vars = self.model._parse_losses(neat_output_dict) + final_output_dict = dict( + loss=loss, log_vars=log_vars, num_samples=num_samples) + return final_output_dict + + def eval_call(self, img, img_metas=None, return_loss=True, **kwargs): + # arguments from mmdet/models/detectors/base.py:BaseDetector.forward + # tmp usssage for eval mode + assert not self.training + assert len(kwargs) == 0 # TODO, support later if necessary + assert not return_loss + data = {'img': img, 'img_metas': img_metas, 'return_loss': return_loss} + + output_dict = self.run_model(data) + + return output_dict + + def detachFromDevice(self): + if self.isCompiled() and self._is_attached: + super().detachFromDevice() + + def attachToDevice(self): + if self.isCompiled() 
and not self._is_attached: + super().attachToDevice() + + +class TrainEvalModel: + """A class maintaining training MMPoplarExecutor and inference + MMPoplarExecutor. + + Args: + train_model (:obj:`nn.Module`): The training model to be compiled. + ``train_model`` can be None if only executing validation. + eval_model (:obj:`nn.Module`): The inference model to be compiled. + options (mmcv.Config, dict): Options that will be used to compile + and run the model. + optimizer (:obj:`torch.optim.Optimizer`, optional): torch + optimizer, necessary if in training mode + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + """ + + def __init__(self, + train_model, + eval_model, + options, + optimizer, + modules_to_record=None, + logger=None): + if train_model is None: + self._train_executor = None + self.training = False + else: + self._train_executor = get_training_model( + train_model, + options=options['training'], + optimizer=optimizer, + logger=logger, + modules_to_record=modules_to_record) + self.training = True + self._eval_executor = get_inference_model( + eval_model, options=options['inference'], logger=logger) + + @property + def executor(self): + if self.training: + return self._train_executor + else: + return self._eval_executor + + def train(self, mode: bool = True): + """Sets the module in training mode. + + This has any effect only on certain modules. See documentations of + particular modules for details of their behaviors in + training/evaluation mode, if they are affected, + e.g. :class:`Dropout`, :class:`BatchNorm`, etc. + + Args: + mode (bool): whether to set training mode (``True``) or evaluation + mode (``False``). Default: ``True``. + + Returns: + Module: self + """ + if not isinstance(mode, bool): + raise ValueError('training mode is expected to be boolean, ' + f'but got {type(mode)}') + if self._train_executor is None and mode: + raise RuntimeError( + 'The train_executor is not initialized.' + 'If you want to initialize train_executor,' + 'you need to input optimizer when converting pytorch model') + + if mode == self.training: + self.model.train(mode) + return self + else: + if self.isCompiled(): + # copy weights from IPU to cpu before off-load current session + self.copyWeightsToHost() + # detach the current session before change the mode, + # if is training mode and weights are updated, + # poptorch will copy weights from IPU to host + self.detachFromDevice() + + self.training = mode # session will changed with mode changing + self.model.train(mode) + + # after changing mode, attach the current new session, + # and this function will copy weights of model to device + self.attachToDevice() + return self + + def eval(self): + """Sets the module in evaluation mode. + + This has any effect only on certain modules. + See documentations of particular modules + for details of their behaviors in training/evaluation mode, + if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. + + This is equivalent with :meth:`self.train(False) + `. + + See :ref:`locally-disable-grad-doc` for a comparison between + `.eval()` and several similar mechanisms that may be confused with it. 
+ + Returns: + Module: self + """ + return self.train(False) + + def compare_data_between_ipu_and_cpu(self, inter_outputs_in_cpu, + inter_outputs_in_ipu): + for key, val in inter_outputs_in_cpu.items(): + is_tensor = isinstance(val['fea_in'], torch.Tensor) + fea_in_cpu = val['fea_in'] + fea_in_cpu_list = [fea_in_cpu] if is_tensor else fea_in_cpu + fea_in_ipu = inter_outputs_in_ipu[key]['fea_in'] + fea_in_ipu_list = [fea_in_ipu] if is_tensor else fea_in_ipu + + is_tensor = isinstance(val['fea_out'], torch.Tensor) + fea_out_cpu = val['fea_out'] + fea_out_cpu_list = [fea_out_cpu] if is_tensor else fea_out_cpu + fea_out_ipu = inter_outputs_in_ipu[key]['fea_out'] + fea_out_ipu_list = [fea_out_ipu] if is_tensor else fea_out_ipu + + print('comparing layer:', key) + for idx, (featA, featB) in \ + enumerate(zip(fea_in_cpu_list, fea_in_ipu_list)): + print('fea_in, tensor ', idx) + compare_ndarray(featA.detach().numpy(), featB.detach().numpy()) + for idx, (featA, featB) in \ + enumerate(zip(fea_out_cpu_list, fea_out_ipu_list)): + print('fea_out, tensor', idx) + compare_ndarray(featA.detach().numpy(), featB.detach().numpy()) + + # TODO Unified training and eval interface, + # merge train_step(train) and __call__(eval) together + def train_step(self, data, optimizer=None, **kwargs): + assert self.training, 'not supported train_step on eval mode' + inter_outputs_in_cpu = {} + if (self._train_executor.isCompiled() + and self._train_executor.compare_with_cpu): + self.copyWeightsToHost() + # run in CPU mode + self._train_executor.model.train_step(data, optimizer, **kwargs) + inter_outputs_in_cpu = { + **(self._train_executor.inter_outputs_in_cpu) + } + # run in IPU mode + result = self._train_executor.train_step(data, optimizer, **kwargs) + if (self._train_executor.isCompiled() + and self._train_executor.compare_with_cpu + and len(inter_outputs_in_cpu) > 0): + self.compare_data_between_ipu_and_cpu( + inter_outputs_in_cpu, + self._train_executor.inter_outputs_in_ipu) + return result + + # TODO Unified training and eval interface, + # merge train_step(train) and __call__(eval) together + def __call__(self, *args, **kwargs): + if self.training: + raise NotImplementedError('use train_step rather than __call__') + else: + return self._eval_executor.eval_call(*args, **kwargs) + + def __getattr__(self, attr): + return getattr(self.executor, attr) + + +def get_training_model(model: nn.Module, + options: Optional[poptorch.Options] = None, + optimizer: Optional[torch.optim.Optimizer] = None, + logger=None, + modules_to_record=None) -> poptorch.PoplarExecutor: + """Create a PopTorch training model from a PyTorch model, running on IPU + hardware in training mode. + + Note: + PopTorch makes a shallow copy of the model. Changes to the + parameters in the returned training model affect the original model + and vice versa. However, primitive variable types are not synced: for + example calling ``model.train()`` on the original model, which + changes the ``training`` bool of the model instance, will not alter the + model returned by this function. You may need to call ``model.train()`` + on your model before you call this function for correct behavior. + + Args: + model (:obj:`nn.Module`): The model to run. + options (poptorch.Options): Options that will be used to compile + and run the model. + optimizer (:obj:`torch.optim.Optimizer`, optional): The optimizers + to apply during training. + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. 
+ modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + + Returns: + The :class:`poptorch.PoplarExecutor` wrapper to use in place + of ``model``. + """ + # Create a copy of the original model in case it needs to be wrapped + maybe_wrapped_model = copy.copy(model) + + return MMPoplarExecutor( + model=maybe_wrapped_model, + logger=logger, + options=options, + training=True, + optimizer=optimizer, + user_model=model, + modules_to_record=modules_to_record, + poptorch_version=__version__) + + +def get_inference_model(model: Union[nn.Module, poptorch.PoplarExecutor], + options: Optional[poptorch.Options] = None, + logger=None) -> poptorch.PoplarExecutor: + """Create a PopTorch inference model from a PyTorch model, running on IPU + hardware in inference mode. + + Note: + PopTorch makes a shallow copy of the model. Changes to the + parameters in the returned inference model affect the original model + and vice versa. However, primitive variable types are not synced: for + example calling ``model.eval()`` on the original model will not alter + the model returned by this function. You may need to call + ``model.eval()`` on your model before you call this function for + correct behavior. + + Args: + model (:obj:`nn.Module`): The model to run. + options (poptorch.Options): Options that will be used to compile + and run the model. + logger (:obj:`logging.Logger`): Logger used during running. + Defaults to None. + + Returns: + The :class:`poptorch.PoplarExecutor` wrapper to use in place of + ``model``. + """ + + return MMPoplarExecutor( + model=copy.copy(model), + logger=logger, + options=options, + training=False, + poptorch_version=__version__) + + +def ipu_model_wrapper(model, + options, + optimizer=None, + logger=None, + modules_to_record=None, + ipu_model_cfg=None, + fp16_cfg=None): + """Convert torch model to IPU model. + + Args: + model (nn.Module): The target model to be converted. + options (dict[str, poptorch.Options]): IPU options, generated + by :func:`cfg2options`. + optimizer (:obj:`torch.optim.Optimizer`, optional): torch + optimizer, necessary if in training mode + logger (:obj:`logging.Logger`): Logger used during training. + modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + ipu_model_cfg (dict): A dictionary contains train_split_edges and + train_ckpt_nodes, See details in :func:`model_sharding` and + :func:`recomputation_checkpoint` functions. + fp16_cfg (dict): Config for IPU fp16 training. Currently supports + configs: `loss_scale`, `velocity_accum_type` and `accum_type`. + See details in + https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html + + Returns: + TrainEvalModel: IPU wrapped model. 
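An illustrative (untested) call of `ipu_model_wrapper` using the helpers from this diff; `model`, `optimizer`, `logger` and `data_batch` are placeholders, and the empty `train_cfg`/`eval_cfg` dicts simply fall back to the default IPU options produced by `cfg2options`:

```python
from mmcv.device.ipu import cfg2options, ipu_model_wrapper

options = cfg2options(dict(train_cfg=dict(), eval_cfg=dict()))
ipu_model = ipu_model_wrapper(
    model, options, optimizer=optimizer, logger=logger)
outputs = ipu_model.train_step(data_batch, optimizer)  # batch from IPUDataLoader
```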
+ """ + if ipu_model_cfg is None: + ipu_model_cfg = {} + training = model.training if optimizer is not None else False + # set mixed-precision + if fp16_cfg is not None: + from mmcv.runner import wrap_fp16_model + loss_scale = fp16_cfg['loss_scale'] + wrap_fp16_model(model) + model.half() + # TODO tmp ussage to set loss scaling for torch original optimizer + if optimizer is not None: + optimizer.loss_scaling = loss_scale + if fp16_cfg.get('velocity_accum_type', False): + if fp16_cfg['velocity_accum_type'] == 'half': + optimizer.velocity_accum_type = torch.half + else: + optimizer.velocity_accum_type = torch.float32 + if fp16_cfg.get('accum_type', False): + if fp16_cfg['accum_type'] == 'half': + optimizer.accum_type = torch.half + else: + optimizer.accum_type = torch.float32 + # TODO support feature alignment for fp16 + if modules_to_record is not None: + raise NotImplementedError( + 'Feature alignment for fp16 is not implemented') + + # set model partition + if optimizer is None: + train_model = None + else: + # split model into multi-IPUs if specified + train_model = model_sharding( + copy.copy(model).train(), + ipu_model_cfg.get('train_split_edges', [])) + + recomputation_checkpoint(train_model, + ipu_model_cfg.get('train_ckpt_nodes', [])) + + # TODO support feature alignment for gradient accumulation mode + gradient_accumulation = \ + getattr(options['training'].Training, 'gradient_accumulation', 1) + if gradient_accumulation > 1: + assert modules_to_record is None, \ + 'Feature alignment for grad-accumulation mode not implemented' + + # TODO support feature alignment for multi-replica mode + replication_factor = \ + getattr(options['training'], 'replication_factor', 1) + if replication_factor > 1: + assert modules_to_record is None, \ + 'Feature alignment for multi-replica mode not implemented' + + # TODO supports different model partitions between train and eval mode + assert len(ipu_model_cfg.get('eval_split_edges', [])) == 0,\ + 'Currently, BeginBlock can only be used once on the same model' + eval_model = copy.copy(model).eval() + + # wrap model for compilation + model = TrainEvalModel( + train_model, + eval_model, + options=options, + optimizer=optimizer, + logger=logger, + modules_to_record=modules_to_record) + model.train(training) + return model diff --git a/mmcv/device/ipu/runner.py b/mmcv/device/ipu/runner.py new file mode 100755 index 0000000000000000000000000000000000000000..e2d4922677e08b2d6b5132a01034de8b043fa3f1 --- /dev/null +++ b/mmcv/device/ipu/runner.py @@ -0,0 +1,142 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmcv.runner import (HOOKS, RUNNERS, BaseRunner, EpochBasedRunner, + IterBasedRunner) +from mmcv.utils import IS_IPU_AVAILABLE + +if IS_IPU_AVAILABLE: + from .dataloader import IPUDataLoader + from .hook_wrapper import (IPUFp16OptimizerHook, wrap_lr_updater_hook, + wrap_optimizer_hook) + from .model_wrapper import ipu_model_wrapper + from .utils import build_from_cfg_with_wrapper, cfg2options + + +class IPUBaseRunner(BaseRunner): + """A base runner for IPU. + + This runner has some extra processes for IPU which are shown below: + + 1. Parse options for IPU + 2. wrap pytorch model for IPU + 3. Raise errors while encountering illegal usage + 4. Input IPU options and initialize dataloader if finding an instance + of IPUDataLoader + + Args: + model (:obj:`nn.Module`): The model to run. + options_cfg (mmcv.Config, dict): Options that will be used to compile + and run the model. 
+ modules_to_record (mmcv.Config, list): Index or name of modules which + will be recorded for output. It is necessary to specify output for + static graph of model training or inference. + ipu_model_cfg (mmcv.Config, dict): Config of model partition and + recomputing checkpoint + fp16_cfg (mmcv.Config): Config for fp16 training. + batch_processor (callable): A callable method that process a data + batch. Should be None for IPU runner + kwargs (Dict[str, Any], optional): Keyword arguments will be passed to + ``base_runner.BaseRunner``. + """ + + def __init__(self, + model, + options_cfg=None, + modules_to_record=None, + ipu_model_cfg=None, + fp16_cfg=None, + batch_processor=None, + **kwargs): + assert hasattr(model, 'train_step') and batch_processor is None,\ + 'only support model with train_step' + + if options_cfg is None: + options_cfg = {} + # call BaseRunner.__init__() here + super().__init__(model, **kwargs) + + # process options of ipu + if IS_IPU_AVAILABLE: + self.options = cfg2options(options_cfg) + self.model = ipu_model_wrapper( + self.model, + self.options, + self.optimizer, + self.logger, + modules_to_record=modules_to_record, + ipu_model_cfg=ipu_model_cfg, + fp16_cfg=fp16_cfg) + else: + raise NotImplementedError('cpu mode on IPURunner is not supported') + + def register_lr_hook(self, lr_config): + if lr_config is None: + return + assert isinstance(lr_config, dict) + assert 'policy' in lr_config + policy_type = lr_config.pop('policy') + # If the type of policy is all in lower case, + # e.g., 'cyclic', then its first letter will be capitalized, + # e.g., to be 'Cyclic'. + # This is for the convenient usage of Lr updater. + # Since this is not applicable for ` + # CosineAnnealingLrUpdater`, the string will not be changed + # if it contains capital letters. + if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'LrUpdaterHook' + lr_config['type'] = hook_type + hook = build_from_cfg_with_wrapper(lr_config, HOOKS, + wrap_lr_updater_hook) + self.register_hook(hook, priority='VERY_HIGH') + + def register_optimizer_hook(self, optimizer_config): + if optimizer_config is None: + return + assert isinstance(optimizer_config, (dict, IPUFp16OptimizerHook)) + if isinstance(optimizer_config, dict): + optimizer_config.setdefault('type', 'OptimizerHook') + hook = build_from_cfg_with_wrapper(optimizer_config, HOOKS, + wrap_optimizer_hook) + else: + hook = optimizer_config + self.register_hook(hook, priority='ABOVE_NORMAL') + + def run(self, data_loaders, workflow, *args, **kwargs): + for i, flow in enumerate(workflow): + mode, _ = flow + # initialize IPU dataloader if not initialized + assert isinstance(data_loaders[i], IPUDataLoader),\ + 'IPU runner can only work with `IPUDataLoader`' + data_loaders[i].init(options=self.get_options(mode)) + + super().run(data_loaders, workflow, *args, **kwargs) + + def get_options(self, mode): + if mode == 'train': + return self.options['training'] + elif mode == 'val': + return self.options['inference'] + else: + raise ValueError(f'mode should be train or val but got {mode}') + + +@RUNNERS.register_module() +class IPUEpochBasedRunner(IPUBaseRunner, EpochBasedRunner): + """Epoch-based Runner for IPU. + + The Inheritance order(MRO) is: IPUEpochBasedRunner -> IPUBaseRunner -> + EpochBasedRunner -> BaseRunner This runner train models epoch by epoch. + """ + pass + + +@RUNNERS.register_module() +class IPUIterBasedRunner(IPUBaseRunner, IterBasedRunner): + """Iteration-based Runner for IPU. 
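For context, the registered IPU runners above would typically be created through mmcv's runner registry. A hedged sketch (an IPU environment with poptorch is required; `model`, `optimizer`, `logger` and the work dir are placeholders, and `options_cfg` follows the `cfg2options` format):

```python
from mmcv.runner import build_runner

runner = build_runner(
    dict(type='IPUEpochBasedRunner', max_epochs=12, options_cfg=dict()),
    default_args=dict(
        model=model,            # must implement train_step
        optimizer=optimizer,
        work_dir='./work_dirs/ipu_example',
        logger=logger))
```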
+ + The Inheritance order(MRO) is: IPUIterBasedRunner -> IPUBaseRunner -> + IterBasedRunner -> BaseRunner This runner train models iteration by + iteration. + """ + pass diff --git a/mmcv/device/ipu/utils.py b/mmcv/device/ipu/utils.py new file mode 100755 index 0000000000000000000000000000000000000000..79709db1ee1282e8daa6614ceb23481d3cd58338 --- /dev/null +++ b/mmcv/device/ipu/utils.py @@ -0,0 +1,244 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect + +import numpy as np +import popart +import poptorch +import torch +import torch.nn as nn + +from mmcv.utils import Registry + + +def _options_assigner(cfg, options_node): + # set popart.options by config + # cfg: dict, python data type + # options_node: python module or function + if isinstance(cfg, dict): + for key in cfg: + _options_assigner(cfg[key], getattr(options_node, key)) + elif isinstance(cfg, (int, float, str, list)): + if callable(options_node): + options_node(cfg) + else: + error_msg = f'options_node type {type(options_node)} not supported' + raise NotImplementedError(error_msg) + else: + error_msg = f'cfg type {type(cfg)} not supported' + raise NotImplementedError(error_msg) + + +def cfg2options(cfg): + """Parse dictionary to ipu options. + + Args: + cfg (dict): A dictionary of ipu settings. + + Returns: + dict[str, poptorch.Options]: Training options and inference options + of IPU. + """ + # set ipu options for inference and training by config + train_cfg = cfg.pop('train_cfg', {}) + eval_cfg = cfg.pop('eval_cfg', {}) + eval_cfg['replicationFactor'] = 1 # eval mode only use one replica + eval_cfg['executionStrategy'] = 'ShardedExecution' + # overwrite default ipu cfg with specified train cfgs + training_ipu_cfg = {**cfg, **train_cfg} + # overwrite default ipu cfg with specified eval cfgs + inference_ipu_cfg = {**cfg, **eval_cfg} + + ipu_options = { + 'training': _cast_to_options(training_ipu_cfg), + 'inference': _cast_to_options(inference_ipu_cfg) + } + + # TODO configure these codes + ipu_options['training']._Popart.set('disableGradAccumulationTensorStreams', + True) + ipu_options['training']._Popart.set( + 'accumulateOuterFragmentSettings.schedule', + int(popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized)) + ipu_options['training'].Precision.enableStochasticRounding(True) + + return ipu_options + + +def _cast_to_options(cfg): + # If it cannot be directly assigned, use if statement to parse it, + # and if it can be directly assigned, use _options_assigner to assign + options = poptorch.Options() + + if 'availableMemoryProportion' in cfg: + available_memory_proportion = cfg.pop('availableMemoryProportion') + mem_props = {} + for i, mem_prop in enumerate(available_memory_proportion): + mem_props[f'IPU{i}'] = mem_prop + options.setAvailableMemoryProportion(mem_props) + + if 'executionStrategy' in cfg: + execution_strategy = cfg.pop('executionStrategy') + if execution_strategy == 'SameAsIpu': + options.setExecutionStrategy( + poptorch.PipelinedExecution( + getattr(poptorch.AutoStage, execution_strategy))) + elif execution_strategy == 'ShardedExecution': + options.setExecutionStrategy(poptorch.ShardedExecution()) + else: + raise NotImplementedError( + 'executionStrategy should be "SameAsIpu" or "ShardedExecution"' + f', but got {execution_strategy}') + + if 'partialsType' in cfg: + partials_type = cfg.pop('partialsType') + options.Precision.setPartialsType(getattr( + torch, partials_type)) # half or float + + _options_assigner(cfg, options) + return options + + +def model_sharding(model, split_edges): 
+ """split models in-place into multi-IPUs. + + Args: + model (nn.Module): The target model to be split. + split_edges (list of dict): Model layer names or layer numbers + of split edge. Each item of ``split_edges`` is a dictionary, + which may contain the following key-pairs: + + - layer_to_call: PyTorch module to assign to the block + - user_id (optional): A user defined identifier for the block. + - ipu_id: The id of the IPU to run on. + + Examples: + >>> split_edges = [ + ... dict(layer_to_call='model.conv1', ipu_id=0), + ... dict(layer_to_call='model.conv3', ipu_id=1)] + >>> sharding_model = model_sharding(torch_model, split_edges) + + Returns: + nn.Module: Split model. + """ + if len(split_edges) == 0: + return model + assert isinstance(split_edges, list) + spilt_edges_dict = {edge['layer_to_call']: edge for edge in split_edges} + + for idx, (name, module) in enumerate(model.named_modules()): + if idx in spilt_edges_dict and name in spilt_edges_dict: + raise ValueError( + 'The same layer is referenced twice while doing model' + f' partition: idx is {idx} and name is {name}') + + edge = spilt_edges_dict.pop(name, None) + edge = spilt_edges_dict.pop(idx, edge) + if edge is not None: + poptorch.BeginBlock(module, edge.get('user_id', name), + edge['ipu_id']) + + # ensure all split_edges are used + if len(spilt_edges_dict) > 0: + split_edge_names = list(spilt_edges_dict.keys()) + raise RuntimeError( + f'split_edges: {split_edge_names} are not contained in the model') + return model + + +def recomputation_checkpoint(model: nn.Module, module_names: list): + """Annotates the output of a module to be checkpointed instead of + recomputed. + + If recomputation mode is enabled, ipu will release the activations of + the middle layers to save memory. During the backward of gradient, + the activation of the middle layer will be recalculated again. + This function is used to declare the activations of some intermediate + layers that need to be saved in order to skip the recomputation of + some layers. + + Args: + model (nn.Module): The target model to apply recomputation + checkpoint. + module_names (list): Layer names of module. + """ + + def recompute_outputs(module, inputs, outputs): + if isinstance(outputs, tuple): + return tuple(poptorch.recomputationCheckpoint(y) for y in outputs) + else: + return poptorch.recomputationCheckpoint(outputs) + + for name, module in model.named_modules(): + if name in module_names: + module.register_forward_hook(recompute_outputs) + module_names.remove(name) + + # check all module_names are used + assert len(module_names) == 0,\ + f'recomputed nodes: {module_names} are not contained in the model' + + +def compare_ndarray(featA, featB, rtol=1e-3, atol=1e-5): + """Align data between two activations or weights.""" + try: + np.testing.assert_allclose(featA, featB, rtol=rtol, atol=atol) + except AssertionError as e: + print(e) + + +def build_from_cfg_with_wrapper(cfg, + registry, + wrapper_func=None, + default_args=None): + """Build a module from config dict and wrap module with "wrapper_func". + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + registry (:obj:`Registry`): The registry to search the type from. + default_args (dict, optional): Default initialization arguments. + wrapper_func (function): Used to wrap class + + Returns: + object: The constructed object. 
+ """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'type' not in cfg: + if default_args is None or 'type' not in default_args: + raise KeyError( + '`cfg` or `default_args` must contain the key "type", ' + f'but got {cfg}\n{default_args}') + if not isinstance(registry, Registry): + raise TypeError('registry must be an mmcv.Registry object, ' + f'but got {type(registry)}') + if not (isinstance(default_args, dict) or default_args is None): + raise TypeError('default_args must be a dict or None, ' + f'but got {type(default_args)}') + + args = cfg.copy() + + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + obj_type = args.pop('type') + if isinstance(obj_type, str): + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError( + f'{obj_type} is not in the {registry.name} registry') + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + + if wrapper_func is None: + wrapped_obj_cls = obj_cls + else: + wrapped_obj_cls = wrapper_func(obj_cls) + try: + return wrapped_obj_cls(**args) + except Exception as e: + # Normal TypeError does not print class name. + raise type(e)(f'{wrapped_obj_cls.__name__}: {e}') diff --git a/mmcv/device/mlu/__init__.py b/mmcv/device/mlu/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..77c71ccf3ce38f3cbc9911f1d9d4b05a531771f2 --- /dev/null +++ b/mmcv/device/mlu/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .data_parallel import MLUDataParallel +from .distributed import MLUDistributedDataParallel + +__all__ = ['MLUDataParallel', 'MLUDistributedDataParallel'] diff --git a/mmcv/device/mlu/_functions.py b/mmcv/device/mlu/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..75660fa9b3635fed049cb150639244a658534824 --- /dev/null +++ b/mmcv/device/mlu/_functions.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Union + +import torch + + +def scatter(input: Union[List, torch.Tensor], devices: List) -> List: + """scatter copies tensor to MLU directly.""" + if isinstance(input, list): + outputs = [scatter(_input, devices) for _input in input] + return outputs + elif isinstance(input, torch.Tensor): + output = input.contiguous() + return output.to('mlu') if devices != [-1] else output + else: + raise Exception(f'Unknown type {type(input)}.') + + +class Scatter: + + @staticmethod + def forward(target_mlus, input): + outputs = scatter(input, target_mlus) + return tuple(outputs) if isinstance(outputs, list) else (outputs, ) diff --git a/mmcv/device/mlu/data_parallel.py b/mmcv/device/mlu/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..ebe14c0a55c92f96ec7f782a591ac10b007942dc --- /dev/null +++ b/mmcv/device/mlu/data_parallel.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +import torch + +from mmcv.parallel import MMDataParallel +from .scatter_gather import scatter_kwargs + + +class MLUDataParallel(MMDataParallel): + """The MLUDataParallel module that supports DataContainer. + + MLUDataParallel is a class inherited from MMDataParall, which supports + MLU training and inference only. + + The main differences with MMDataParallel: + + - It only supports single-card of MLU, and only use first card to + run training and inference. 
+
+    - It uses direct host-to-device copy instead of stream-background
+      scatter.
+
+    .. warning::
+        MLUDataParallel only supports single MLU training; if you need to
+        train with multiple MLUs, please use MLUDistributedDataParallel
+        instead. If you have multiple MLUs, you can set the environment
+        variable ``MLU_VISIBLE_DEVICES=0`` (or any other card number(s))
+        to specify the running device.
+
+    Args:
+        module (:class:`nn.Module`): Module to be encapsulated.
+        dim (int): Dimension used to scatter the data. Defaults to 0.
+    """
+
+    def __init__(self, *args, dim=0, **kwargs):
+        super().__init__(*args, dim=dim, **kwargs)
+        self.device_ids = [0]
+        self.src_device_obj = torch.device('mlu:0')
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/mlu/distributed.py b/mmcv/device/mlu/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..3768c754c908b219fd5a770d69e6ed5416781ba8
--- /dev/null
+++ b/mmcv/device/mlu/distributed.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.parallel import MMDistributedDataParallel
+from .scatter_gather import scatter_kwargs
+
+
+class MLUDistributedDataParallel(MMDistributedDataParallel):
+    """The DDP module that supports DataContainer.
+
+    MLUDDP has one difference from MMDDP: it moves data to MLU by copying
+    instead of scattering.
+    """
+
+    def to_kwargs(self, inputs, kwargs, device_id):
+        # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8
+        # to move all tensors to device_id
+        return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim)
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/mlu/scatter_gather.py b/mmcv/device/mlu/scatter_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b0c9b96f51252e4c510f66a2ec5fb7522716e29
--- /dev/null
+++ b/mmcv/device/mlu/scatter_gather.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmcv.parallel.data_container import DataContainer
+from ._functions import Scatter
+
+
+def scatter(inputs, target_mlus, dim=0):
+    """Scatter inputs to target MLUs.
+
+    The only difference from original :func:`scatter` is to add support for
+    :type:`~mmcv.parallel.DataContainer`.
+    """
+
+    def scatter_map(obj):
+        if isinstance(obj, torch.Tensor):
+            if target_mlus != [-1]:
+                obj = obj.to('mlu')
+                return [obj]
+            else:
+                # for CPU inference we use self-implemented scatter
+                return Scatter.forward(target_mlus, obj)
+        if isinstance(obj, DataContainer):
+            if obj.cpu_only:
+                return obj.data
+            else:
+                return Scatter.forward(target_mlus, obj.data)
+        if isinstance(obj, tuple) and len(obj) > 0:
+            return list(zip(*map(scatter_map, obj)))
+        if isinstance(obj, list) and len(obj) > 0:
+            out = list(map(list, zip(*map(scatter_map, obj))))
+            return out
+        if isinstance(obj, dict) and len(obj) > 0:
+            out = list(map(type(obj), zip(*map(scatter_map, obj.items()))))
+            return out
+        return [obj for _ in target_mlus]
+
+    # After scatter_map is called, a scatter_map cell will exist. This cell
+    # has a reference to the actual function scatter_map, which has references
+    # to a closure that has a reference to the scatter_map cell (because the
+    # fn is recursive). To avoid this reference cycle, we set the function to
+    # None, clearing the cell
+    try:
+        return scatter_map(inputs)
+    finally:
+        scatter_map = None
+
+
+def scatter_kwargs(inputs, kwargs, target_mlus, dim=0):
+    """Scatter with support for kwargs dictionary."""
+    inputs = scatter(inputs, target_mlus, dim) if inputs else []
+    kwargs = scatter(kwargs, target_mlus, dim) if kwargs else []
+    if len(inputs) < len(kwargs):
+        inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
+    elif len(kwargs) < len(inputs):
+        kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
+    inputs = tuple(inputs)
+    kwargs = tuple(kwargs)
+    return inputs, kwargs
diff --git a/mmcv/device/mps/__init__.py b/mmcv/device/mps/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e28144ef0ae8cf65527cefc469d07c7ff854c688
--- /dev/null
+++ b/mmcv/device/mps/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_parallel import MPSDataParallel
+
+__all__ = ['MPSDataParallel']
diff --git a/mmcv/device/mps/data_parallel.py b/mmcv/device/mps/data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ae5396d24193376432ae98b792ec89fac678738
--- /dev/null
+++ b/mmcv/device/mps/data_parallel.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from mmcv.parallel import MMDataParallel
+from ..scatter_gather import scatter_kwargs
+
+
+class MPSDataParallel(MMDataParallel):
+    """The MPSDataParallel module that supports DataContainer.
+
+    MPSDataParallel is a class inherited from MMDataParallel, which
+    supports MPS training and inference only.
+
+    The main differences with MMDataParallel:
+
+    - It only supports a single MPS device, and only the first device is
+      used to run training and inference.
+
+    - It uses direct host-to-device copy instead of stream-background
+      scatter.
+
+    Args:
+        module (:class:`nn.Module`): Module to be encapsulated.
+        dim (int): Dimension used to scatter the data. Defaults to 0.
+    """
+
+    def __init__(self, *args, dim=0, **kwargs):
+        super().__init__(*args, dim=dim, **kwargs)
+        self.device_ids = [0]
+        self.src_device_obj = torch.device('mps:0')
+
+    def scatter(self, inputs, kwargs, device_ids):
+        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/scatter_gather.py b/mmcv/device/scatter_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..744b0ca51e9de4cb7c43d60a986621461519f781
--- /dev/null
+++ b/mmcv/device/scatter_gather.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmcv.parallel.data_container import DataContainer
+from mmcv.utils import deprecated_api_warning
+from ._functions import Scatter
+from .utils import get_device
+
+
+@deprecated_api_warning({'target_mlus': 'target_devices'})
+def scatter(inputs, target_devices, dim=0):
+    """Scatter inputs to target devices.
+
+    The only difference from original :func:`scatter` is to add support for
+    :type:`~mmcv.parallel.DataContainer`.
+ """ + current_device = get_device() + + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + if target_devices != [-1]: + obj = obj.to(current_device) + return [obj] + else: + # for CPU inference we use self-implemented scatter + return Scatter.forward(target_devices, obj) + if isinstance(obj, DataContainer): + if obj.cpu_only: + return obj.data + else: + return Scatter.forward(target_devices, obj.data) + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + out = list(map(list, zip(*map(scatter_map, obj)))) + return out + if isinstance(obj, dict) and len(obj) > 0: + out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return out + return [obj for _ in target_devices] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + return scatter_map(inputs) + finally: + scatter_map = None + + +@deprecated_api_warning({'target_mlus': 'target_devices'}) +def scatter_kwargs(inputs, kwargs, target_devices, dim=0): + """Scatter with support for kwargs dictionary.""" + inputs = scatter(inputs, target_devices, dim) if inputs else [] + kwargs = scatter(kwargs, target_devices, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/mmcv/device/utils.py b/mmcv/device/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..e2adec08dd98ad83cce3a9c28d3a6651808f7112 --- /dev/null +++ b/mmcv/device/utils.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE + + +def get_device() -> str: + """Returns the currently existing device type. + + Returns: + str: cuda | mlu | mps | cpu. + """ + if IS_CUDA_AVAILABLE: + return 'cuda' + elif IS_MLU_AVAILABLE: + return 'mlu' + elif IS_MPS_AVAILABLE: + return 'mps' + else: + return 'cpu' diff --git a/mmcv/engine/test.py b/mmcv/engine/test.py index f236b1cda2f39517bda3e4cce9badc19c6cbf190..83546caec47fb11952fd820b342c71b83b74fac2 100644 --- a/mmcv/engine/test.py +++ b/mmcv/engine/test.py @@ -4,15 +4,18 @@ import pickle import shutil import tempfile import time +from typing import Optional import torch import torch.distributed as dist +import torch.nn as nn +from torch.utils.data import DataLoader import mmcv from mmcv.runner import get_dist_info -def single_gpu_test(model, data_loader): +def single_gpu_test(model: nn.Module, data_loader: DataLoader) -> list: """Test model with a single gpu. This method tests model with a single gpu and displays test progress bar. @@ -41,7 +44,10 @@ def single_gpu_test(model, data_loader): return results -def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): +def multi_gpu_test(model: nn.Module, + data_loader: DataLoader, + tmpdir: Optional[str] = None, + gpu_collect: bool = False) -> Optional[list]: """Test model with multiple gpus. 
This method tests model with multiple gpus and collects the results @@ -82,13 +88,15 @@ def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): # collect results from all ranks if gpu_collect: - results = collect_results_gpu(results, len(dataset)) + result_from_ranks = collect_results_gpu(results, len(dataset)) else: - results = collect_results_cpu(results, len(dataset), tmpdir) - return results + result_from_ranks = collect_results_cpu(results, len(dataset), tmpdir) + return result_from_ranks -def collect_results_cpu(result_part, size, tmpdir=None): +def collect_results_cpu(result_part: list, + size: int, + tmpdir: Optional[str] = None) -> Optional[list]: """Collect results under cpu mode. On cpu mode, this function will save the results on different gpus to @@ -126,7 +134,8 @@ def collect_results_cpu(result_part, size, tmpdir=None): else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir - mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + part_file = osp.join(tmpdir, f'part_{rank}.pkl') # type: ignore + mmcv.dump(result_part, part_file) dist.barrier() # collect all parts if rank != 0: @@ -135,7 +144,7 @@ def collect_results_cpu(result_part, size, tmpdir=None): # load results of all parts from tmp dir part_list = [] for i in range(world_size): - part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_file = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore part_result = mmcv.load(part_file) # When data is severely insufficient, an empty part_result # on a certain gpu could makes the overall outputs empty. @@ -148,11 +157,11 @@ def collect_results_cpu(result_part, size, tmpdir=None): # the dataloader may pad some samples ordered_results = ordered_results[:size] # remove tmp dir - shutil.rmtree(tmpdir) + shutil.rmtree(tmpdir) # type: ignore return ordered_results -def collect_results_gpu(result_part, size): +def collect_results_gpu(result_part: list, size: int) -> Optional[list]: """Collect results under gpu mode. 
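The updated `multi_gpu_test` above now carries explicit type hints and returns `Optional[list]`, because only rank 0 receives the collected results. A minimal calling-pattern sketch, assuming `model`, `data_loader` and the distributed environment are already set up elsewhere:

```python
from mmcv.engine import multi_gpu_test
from mmcv.runner import get_dist_info

# gpu_collect=True gathers results via collect_results_gpu; with False a
# temporary directory and collect_results_cpu are used instead.
results = multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=True)

rank, _ = get_dist_info()
if rank == 0:
    # Only rank 0 receives the ordered results; other ranks get None.
    print(len(results))
```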
On gpu mode, this function will encode results to gpu tensors and use gpu @@ -200,3 +209,5 @@ def collect_results_gpu(result_part, size): # the dataloader may pad some samples ordered_results = ordered_results[:size] return ordered_results + else: + return None diff --git a/mmcv/fileio/file_client.py b/mmcv/fileio/file_client.py index b2d622868cdd006dc7446bcde0dc54731c17116a..ee7c3164e2c631c546dfe3345c45f8b8394a9995 100644 --- a/mmcv/fileio/file_client.py +++ b/mmcv/fileio/file_client.py @@ -8,7 +8,7 @@ import warnings from abc import ABCMeta, abstractmethod from contextlib import contextmanager from pathlib import Path -from typing import Iterable, Iterator, Optional, Tuple, Union +from typing import Any, Generator, Iterator, Optional, Tuple, Union from urllib.request import urlopen import mmcv @@ -64,7 +64,8 @@ class CephBackend(BaseStorageBackend): raise ImportError('Please install ceph to enable CephBackend.') warnings.warn( - 'CephBackend will be deprecated, please use PetrelBackend instead') + 'CephBackend will be deprecated, please use PetrelBackend instead', + DeprecationWarning) self._client = ceph.S3Client() assert isinstance(path_mapping, dict) or path_mapping is None self.path_mapping = path_mapping @@ -209,9 +210,9 @@ class PetrelBackend(BaseStorageBackend): """ if not has_method(self._client, 'delete'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `delete` method, please use a higher version or dev' - ' branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `delete` method, please use a higher version or dev' + ' branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -229,9 +230,9 @@ class PetrelBackend(BaseStorageBackend): if not (has_method(self._client, 'contains') and has_method(self._client, 'isdir')): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `contains` and `isdir` methods, please use a higher' - 'version or dev branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `contains` and `isdir` methods, please use a higher' + 'version or dev branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -246,13 +247,13 @@ class PetrelBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. + ``False`` otherwise. """ if not has_method(self._client, 'isdir'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `isdir` method, please use a higher version or dev' - ' branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `isdir` method, please use a higher version or dev' + ' branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -266,13 +267,13 @@ class PetrelBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. + otherwise. 
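The `LmdbBackend` rewrite just below stops opening the LMDB environment in `__init__` and instead connects lazily on the first `get`, which keeps construction cheap when the client is created before dataloader workers are spawned. A small sketch of the resulting behaviour, using a hypothetical database path and key and assuming the `lmdb` backend name remains registered with `FileClient`:

```python
from mmcv.fileio import FileClient

# Constructing the client no longer calls lmdb.open.
client = FileClient(backend='lmdb', db_path='./example.lmdb')  # hypothetical path

# The environment is opened on the first read, inside whichever process
# actually performs it.
value_buf = client.get('example_key')  # hypothetical key
```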
""" if not has_method(self._client, 'contains'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `contains` method, please use a higher version or ' - 'dev branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `contains` method, please use a higher version or ' + 'dev branch instead.') filepath = self._map_path(filepath) filepath = self._format_path(filepath) @@ -297,7 +298,10 @@ class PetrelBackend(BaseStorageBackend): return '/'.join(formatted_paths) @contextmanager - def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + def get_local_path( + self, + filepath: Union[str, + Path]) -> Generator[Union[str, Path], None, None]: """Download a file from ``filepath`` and return a temporary path. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It @@ -362,9 +366,9 @@ class PetrelBackend(BaseStorageBackend): """ if not has_method(self._client, 'list'): raise NotImplementedError( - ('Current version of Petrel Python SDK has not supported ' - 'the `list` method, please use a higher version or dev' - ' branch instead.')) + 'Current version of Petrel Python SDK has not supported ' + 'the `list` method, please use a higher version or dev' + ' branch instead.') dir_path = self._map_path(dir_path) dir_path = self._format_path(dir_path) @@ -473,17 +477,16 @@ class LmdbBackend(BaseStorageBackend): readahead=False, **kwargs): try: - import lmdb + import lmdb # NOQA except ImportError: raise ImportError('Please install lmdb to enable LmdbBackend.') self.db_path = str(db_path) - self._client = lmdb.open( - self.db_path, - readonly=readonly, - lock=lock, - readahead=readahead, - **kwargs) + self.readonly = readonly + self.lock = lock + self.readahead = readahead + self.kwargs = kwargs + self._client = None def get(self, filepath): """Get values according to the filepath. @@ -491,14 +494,29 @@ class LmdbBackend(BaseStorageBackend): Args: filepath (str | obj:`Path`): Here, filepath is the lmdb key. """ - filepath = str(filepath) + if self._client is None: + self._client = self._get_client() + with self._client.begin(write=False) as txn: - value_buf = txn.get(filepath.encode('ascii')) + value_buf = txn.get(str(filepath).encode('utf-8')) return value_buf def get_text(self, filepath, encoding=None): raise NotImplementedError + def _get_client(self): + import lmdb + + return lmdb.open( + self.db_path, + readonly=self.readonly, + lock=self.lock, + readahead=self.readahead, + **self.kwargs) + + def __del__(self): + self._client.close() + class HardDiskBackend(BaseStorageBackend): """Raw hard disks storage backend.""" @@ -531,7 +549,7 @@ class HardDiskBackend(BaseStorageBackend): Returns: str: Expected text reading from ``filepath``. """ - with open(filepath, 'r', encoding=encoding) as f: + with open(filepath, encoding=encoding) as f: value_buf = f.read() return value_buf @@ -598,7 +616,7 @@ class HardDiskBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. + ``False`` otherwise. """ return osp.isdir(filepath) @@ -610,7 +628,7 @@ class HardDiskBackend(BaseStorageBackend): Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. + otherwise. 
""" return osp.isfile(filepath) @@ -631,7 +649,9 @@ class HardDiskBackend(BaseStorageBackend): @contextmanager def get_local_path( - self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: + self, + filepath: Union[str, + Path]) -> Generator[Union[str, Path], None, None]: """Only for unified API and do nothing.""" yield filepath @@ -700,7 +720,8 @@ class HTTPBackend(BaseStorageBackend): return value_buf.decode(encoding) @contextmanager - def get_local_path(self, filepath: str) -> Iterable[str]: + def get_local_path( + self, filepath: str) -> Generator[Union[str, Path], None, None]: """Download a file from ``filepath``. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It @@ -770,19 +791,16 @@ class FileClient: 'petrel': PetrelBackend, 'http': HTTPBackend, } - # This collection is used to record the overridden backends, and when a - # backend appears in the collection, the singleton pattern is disabled for - # that backend, because if the singleton pattern is used, then the object - # returned will be the backend before overwriting - _overridden_backends = set() + _prefix_to_backends = { 's3': PetrelBackend, 'http': HTTPBackend, 'https': HTTPBackend, } - _overridden_prefixes = set() - _instances = {} + _instances: dict = {} + + client: Any def __new__(cls, backend=None, prefix=None, **kwargs): if backend is None and prefix is None: @@ -802,10 +820,7 @@ class FileClient: for key, value in kwargs.items(): arg_key += f':{key}:{value}' - # if a backend was overridden, it will create a new object - if (arg_key in cls._instances - and backend not in cls._overridden_backends - and prefix not in cls._overridden_prefixes): + if arg_key in cls._instances: _instance = cls._instances[arg_key] else: # create a new object and put it to _instance @@ -839,8 +854,8 @@ class FileClient: 's3' Returns: - str | None: Return the prefix of uri if the uri contains '://' - else ``None``. + str | None: Return the prefix of uri if the uri contains '://' else + ``None``. """ assert is_filepath(uri) uri = str(uri) @@ -899,7 +914,9 @@ class FileClient: 'add "force=True" if you want to override it') if name in cls._backends and force: - cls._overridden_backends.add(name) + for arg_key, instance in list(cls._instances.items()): + if isinstance(instance.client, cls._backends[name]): + cls._instances.pop(arg_key) cls._backends[name] = backend if prefixes is not None: @@ -911,7 +928,12 @@ class FileClient: if prefix not in cls._prefix_to_backends: cls._prefix_to_backends[prefix] = backend elif (prefix in cls._prefix_to_backends) and force: - cls._overridden_prefixes.add(prefix) + overridden_backend = cls._prefix_to_backends[prefix] + if isinstance(overridden_backend, list): + overridden_backend = tuple(overridden_backend) + for arg_key, instance in list(cls._instances.items()): + if isinstance(instance.client, overridden_backend): + cls._instances.pop(arg_key) cls._prefix_to_backends[prefix] = backend else: raise KeyError( @@ -987,7 +1009,7 @@ class FileClient: Returns: bytes | memoryview: Expected bytes object or a memory view of the - bytes object. + bytes object. """ return self.client.get(filepath) @@ -1060,7 +1082,7 @@ class FileClient: Returns: bool: Return ``True`` if ``filepath`` points to a directory, - ``False`` otherwise. + ``False`` otherwise. """ return self.client.isdir(filepath) @@ -1072,7 +1094,7 @@ class FileClient: Returns: bool: Return ``True`` if ``filepath`` points to a file, ``False`` - otherwise. + otherwise. 
""" return self.client.isfile(filepath) @@ -1092,7 +1114,10 @@ class FileClient: return self.client.join_path(filepath, *filepaths) @contextmanager - def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + def get_local_path( + self, + filepath: Union[str, + Path]) -> Generator[Union[str, Path], None, None]: """Download data from ``filepath`` and write the data to local path. ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It diff --git a/mmcv/fileio/handlers/base.py b/mmcv/fileio/handlers/base.py index 288878bc57282fbb2f12b32290152ca8e9d3cab0..0c9cc15b67cbf7d320c2b9c6cbd441a5d5adf235 100644 --- a/mmcv/fileio/handlers/base.py +++ b/mmcv/fileio/handlers/base.py @@ -21,10 +21,10 @@ class BaseFileHandler(metaclass=ABCMeta): def dump_to_str(self, obj, **kwargs): pass - def load_from_path(self, filepath, mode='r', **kwargs): + def load_from_path(self, filepath: str, mode: str = 'r', **kwargs): with open(filepath, mode) as f: return self.load_from_fileobj(f, **kwargs) - def dump_to_path(self, obj, filepath, mode='w', **kwargs): + def dump_to_path(self, obj, filepath: str, mode: str = 'w', **kwargs): with open(filepath, mode) as f: self.dump_to_fileobj(obj, f, **kwargs) diff --git a/mmcv/fileio/handlers/pickle_handler.py b/mmcv/fileio/handlers/pickle_handler.py index b37c79bed4ef9fd8913715e62dbe3fc5cafdc3aa..073856fd25a731b42f3cd19269ad95744b20598f 100644 --- a/mmcv/fileio/handlers/pickle_handler.py +++ b/mmcv/fileio/handlers/pickle_handler.py @@ -12,8 +12,7 @@ class PickleHandler(BaseFileHandler): return pickle.load(file, **kwargs) def load_from_path(self, filepath, **kwargs): - return super(PickleHandler, self).load_from_path( - filepath, mode='rb', **kwargs) + return super().load_from_path(filepath, mode='rb', **kwargs) def dump_to_str(self, obj, **kwargs): kwargs.setdefault('protocol', 2) @@ -24,5 +23,4 @@ class PickleHandler(BaseFileHandler): pickle.dump(obj, file, **kwargs) def dump_to_path(self, obj, filepath, **kwargs): - super(PickleHandler, self).dump_to_path( - obj, filepath, mode='wb', **kwargs) + super().dump_to_path(obj, filepath, mode='wb', **kwargs) diff --git a/mmcv/fileio/handlers/yaml_handler.py b/mmcv/fileio/handlers/yaml_handler.py index c5aa2eea1e8c76f8baf753d1c8c959dee665e543..1c1b077943d634b3ddcf5ee470855179b8308e9c 100644 --- a/mmcv/fileio/handlers/yaml_handler.py +++ b/mmcv/fileio/handlers/yaml_handler.py @@ -2,9 +2,10 @@ import yaml try: - from yaml import CLoader as Loader, CDumper as Dumper + from yaml import CDumper as Dumper + from yaml import CLoader as Loader except ImportError: - from yaml import Loader, Dumper + from yaml import Loader, Dumper # type: ignore from .base import BaseFileHandler # isort:skip diff --git a/mmcv/fileio/io.py b/mmcv/fileio/io.py index aaefde58aa3ea5b58f86249ce7e1c40c186eb8dd..91192103cf331e8ceb970d6f1f5ac050137c0871 100644 --- a/mmcv/fileio/io.py +++ b/mmcv/fileio/io.py @@ -1,11 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from io import BytesIO, StringIO from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, TextIO, Union -from ..utils import is_list_of, is_str +from ..utils import is_list_of from .file_client import FileClient from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler +FileLikeObject = Union[TextIO, StringIO, BytesIO] + file_handlers = { 'json': JsonHandler(), 'yaml': YamlHandler(), @@ -15,7 +18,10 @@ file_handlers = { } -def load(file, file_format=None, file_client_args=None, **kwargs): +def load(file: Union[str, Path, FileLikeObject], + file_format: Optional[str] = None, + file_client_args: Optional[Dict] = None, + **kwargs): """Load data from json/yaml/pickle files. This method provides a unified api for loading data from serialized files. @@ -45,13 +51,14 @@ def load(file, file_format=None, file_client_args=None, **kwargs): """ if isinstance(file, Path): file = str(file) - if file_format is None and is_str(file): + if file_format is None and isinstance(file, str): file_format = file.split('.')[-1] if file_format not in file_handlers: raise TypeError(f'Unsupported format: {file_format}') handler = file_handlers[file_format] - if is_str(file): + f: FileLikeObject + if isinstance(file, str): file_client = FileClient.infer_client(file_client_args, file) if handler.str_like: with StringIO(file_client.get_text(file)) as f: @@ -66,7 +73,11 @@ def load(file, file_format=None, file_client_args=None, **kwargs): return obj -def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): +def dump(obj: Any, + file: Optional[Union[str, Path, FileLikeObject]] = None, + file_format: Optional[str] = None, + file_client_args: Optional[Dict] = None, + **kwargs): """Dump data to json/yaml/pickle strings or files. This method provides a unified api for dumping data as strings or to files, @@ -96,18 +107,18 @@ def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): if isinstance(file, Path): file = str(file) if file_format is None: - if is_str(file): + if isinstance(file, str): file_format = file.split('.')[-1] elif file is None: raise ValueError( 'file_format must be specified since file is None') if file_format not in file_handlers: raise TypeError(f'Unsupported format: {file_format}') - + f: FileLikeObject handler = file_handlers[file_format] if file is None: return handler.dump_to_str(obj, **kwargs) - elif is_str(file): + elif isinstance(file, str): file_client = FileClient.infer_client(file_client_args, file) if handler.str_like: with StringIO() as f: @@ -123,7 +134,8 @@ def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): raise TypeError('"file" must be a filename str or a file-object') -def _register_handler(handler, file_formats): +def _register_handler(handler: BaseFileHandler, + file_formats: Union[str, List[str]]) -> None: """Register a handler for some file extensions. Args: @@ -142,7 +154,7 @@ def _register_handler(handler, file_formats): file_handlers[ext] = handler -def register_handler(file_formats, **kwargs): +def register_handler(file_formats: Union[str, list], **kwargs) -> Callable: def wrap(cls): _register_handler(cls(**kwargs), file_formats) diff --git a/mmcv/fileio/parse.py b/mmcv/fileio/parse.py index f60f0d611b8d75692221d0edd7dc993b0a6445c9..f28e59119325a1bb68b38dd884c59b68dbed6508 100644 --- a/mmcv/fileio/parse.py +++ b/mmcv/fileio/parse.py @@ -1,16 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from io import StringIO +from pathlib import Path +from typing import Dict, List, Optional, Union from .file_client import FileClient -def list_from_file(filename, - prefix='', - offset=0, - max_num=0, - encoding='utf-8', - file_client_args=None): +def list_from_file(filename: Union[str, Path], + prefix: str = '', + offset: int = 0, + max_num: int = 0, + encoding: str = 'utf-8', + file_client_args: Optional[Dict] = None) -> List: """Load a text file and parse the content as a list of strings. Note: @@ -52,10 +54,10 @@ def list_from_file(filename, return item_list -def dict_from_file(filename, - key_type=str, - encoding='utf-8', - file_client_args=None): +def dict_from_file(filename: Union[str, Path], + key_type: type = str, + encoding: str = 'utf-8', + file_client_args: Optional[Dict] = None) -> Dict: """Load a text file and parse the content as a dict. Each line of the text file will be two or more columns split by diff --git a/mmcv/image/__init__.py b/mmcv/image/__init__.py index d0051d609d3de4e7562e3fe638335c66617c4d91..92ecec4046a6f5ee25b4ea07215ed7c7c810dcfa 100644 --- a/mmcv/image/__init__.py +++ b/mmcv/image/__init__.py @@ -9,10 +9,10 @@ from .geometric import (cutout, imcrop, imflip, imflip_, impad, from .io import imfrombytes, imread, imwrite, supported_backends, use_backend from .misc import tensor2imgs from .photometric import (adjust_brightness, adjust_color, adjust_contrast, - adjust_lighting, adjust_sharpness, auto_contrast, - clahe, imdenormalize, imequalize, iminvert, - imnormalize, imnormalize_, lut_transform, posterize, - solarize) + adjust_hue, adjust_lighting, adjust_sharpness, + auto_contrast, clahe, imdenormalize, imequalize, + iminvert, imnormalize, imnormalize_, lut_transform, + posterize, solarize) __all__ = [ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', @@ -24,5 +24,6 @@ __all__ = [ 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', - 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' + 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting', + 'adjust_hue' ] diff --git a/mmcv/image/colorspace.py b/mmcv/image/colorspace.py index 814533952fdfda23d67cb6a3073692d8c1156add..08f9952408c8e0bb38b17c10e2089e900ed418c2 100644 --- a/mmcv/image/colorspace.py +++ b/mmcv/image/colorspace.py @@ -1,9 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Callable, Union + import cv2 import numpy as np -def imconvert(img, src, dst): +def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray: """Convert an image from the src colorspace to dst colorspace. Args: @@ -19,7 +21,7 @@ def imconvert(img, src, dst): return out_img -def bgr2gray(img, keepdim=False): +def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: """Convert a BGR image to grayscale image. Args: @@ -36,7 +38,7 @@ def bgr2gray(img, keepdim=False): return out_img -def rgb2gray(img, keepdim=False): +def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray: """Convert a RGB image to grayscale image. Args: @@ -53,7 +55,7 @@ def rgb2gray(img, keepdim=False): return out_img -def gray2bgr(img): +def gray2bgr(img: np.ndarray) -> np.ndarray: """Convert a grayscale image to BGR image. Args: @@ -67,7 +69,7 @@ def gray2bgr(img): return out_img -def gray2rgb(img): +def gray2rgb(img: np.ndarray) -> np.ndarray: """Convert a grayscale image to RGB image. 
Args: @@ -81,7 +83,7 @@ def gray2rgb(img): return out_img -def _convert_input_type_range(img): +def _convert_input_type_range(img: np.ndarray) -> np.ndarray: """Convert the type and range of the input image. It converts the input image to np.float32 type and range of [0, 1]. @@ -109,7 +111,8 @@ def _convert_input_type_range(img): return img -def _convert_output_type_range(img, dst_type): +def _convert_output_type_range( + img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray: """Convert the type and range of the image according to dst_type. It converts the image to desired type and range. If `dst_type` is np.uint8, @@ -140,7 +143,7 @@ def _convert_output_type_range(img, dst_type): return img.astype(dst_type) -def rgb2ycbcr(img, y_only=False): +def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: """Convert a RGB image to YCbCr image. This function produces the same results as Matlab's `rgb2ycbcr` function. @@ -160,7 +163,7 @@ def rgb2ycbcr(img, y_only=False): Returns: ndarray: The converted YCbCr image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) @@ -174,7 +177,7 @@ def rgb2ycbcr(img, y_only=False): return out_img -def bgr2ycbcr(img, y_only=False): +def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray: """Convert a BGR image to YCbCr image. The bgr version of rgb2ycbcr. @@ -194,7 +197,7 @@ def bgr2ycbcr(img, y_only=False): Returns: ndarray: The converted YCbCr image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) @@ -208,7 +211,7 @@ def bgr2ycbcr(img, y_only=False): return out_img -def ycbcr2rgb(img): +def ycbcr2rgb(img: np.ndarray) -> np.ndarray: """Convert a YCbCr image to RGB image. This function produces the same results as Matlab's ycbcr2rgb function. @@ -227,7 +230,7 @@ def ycbcr2rgb(img): Returns: ndarray: The converted RGB image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 @@ -240,7 +243,7 @@ def ycbcr2rgb(img): return out_img -def ycbcr2bgr(img): +def ycbcr2bgr(img: np.ndarray) -> np.ndarray: """Convert a YCbCr image to BGR image. The bgr version of ycbcr2rgb. @@ -259,7 +262,7 @@ def ycbcr2bgr(img): Returns: ndarray: The converted BGR image. The output image has the same type - and range as input image. + and range as input image. """ img_type = img.dtype img = _convert_input_type_range(img) * 255 @@ -272,11 +275,11 @@ def ycbcr2bgr(img): return out_img -def convert_color_factory(src, dst): +def convert_color_factory(src: str, dst: str) -> Callable: code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') - def convert_color(img): + def convert_color(img: np.ndarray) -> np.ndarray: out_img = cv2.cvtColor(img, code) return out_img diff --git a/mmcv/image/geometric.py b/mmcv/image/geometric.py index cf97c201cb4e43796c911919d03fb26a07ed817d..eecd795ea08127055cd8e90eb11c5e51fe586c18 100644 --- a/mmcv/image/geometric.py +++ b/mmcv/image/geometric.py @@ -37,15 +37,27 @@ cv2_interp_codes = { 'lanczos': cv2.INTER_LANCZOS4 } +# Pillow >=v9.1.0 use a slightly different naming scheme for filters. +# Set pillow_interp_codes according to the naming scheme used. 
if Image is not None: - pillow_interp_codes = { - 'nearest': Image.NEAREST, - 'bilinear': Image.BILINEAR, - 'bicubic': Image.BICUBIC, - 'box': Image.BOX, - 'lanczos': Image.LANCZOS, - 'hamming': Image.HAMMING - } + if hasattr(Image, 'Resampling'): + pillow_interp_codes = { + 'nearest': Image.Resampling.NEAREST, + 'bilinear': Image.Resampling.BILINEAR, + 'bicubic': Image.Resampling.BICUBIC, + 'box': Image.Resampling.BOX, + 'lanczos': Image.Resampling.LANCZOS, + 'hamming': Image.Resampling.HAMMING + } + else: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } def imresize(img, @@ -70,7 +82,7 @@ def imresize(img, Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = img.shape[:2] if backend is None: @@ -130,7 +142,7 @@ def imresize_to_multiple(img, Returns: tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = img.shape[:2] if size is not None and scale_factor is not None: @@ -145,7 +157,7 @@ def imresize_to_multiple(img, size = _scale_size((w, h), scale_factor) divisor = to_2tuple(divisor) - size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) + size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor)) resized_img, w_scale, h_scale = imresize( img, size, @@ -175,7 +187,7 @@ def imresize_like(img, Returns: tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. + `resized_img`. """ h, w = dst_img.shape[:2] return imresize(img, (w, h), return_scale, interpolation, backend=backend) @@ -460,18 +472,17 @@ def impad(img, areas when padding_mode is 'constant'. Default: 0. padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default: constant. - - constant: pads with a constant value, this value is specified - with pad_val. + with pad_val. - edge: pads with the last value at the edge of the image. - - reflect: pads with reflection of image without repeating the - last value on the edge. For example, padding [1, 2, 3, 4] - with 2 elements on both sides in reflect mode will result - in [3, 2, 1, 2, 3, 4, 3, 2]. - - symmetric: pads with reflection of image repeating the last - value on the edge. For example, padding [1, 2, 3, 4] with - 2 elements on both sides in symmetric mode will result in - [2, 1, 1, 2, 3, 4, 4, 3] + - reflect: pads with reflection of image without repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with 2 + elements on both sides in reflect mode will result in + [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last value + on the edge. For example, padding [1, 2, 3, 4] with 2 elements on + both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] Returns: ndarray: The padded image. @@ -479,7 +490,9 @@ def impad(img, assert (shape is not None) ^ (padding is not None) if shape is not None: - padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) + width = max(shape[1] - img.shape[1], 0) + height = max(shape[0] - img.shape[0], 0) + padding = (0, 0, width, height) # check pad_val if isinstance(pad_val, tuple): diff --git a/mmcv/image/io.py b/mmcv/image/io.py index d47aaa845256e4e991582a939733c45d62a4de38..ae81b561a84cccfa4923364679dce56d762db1bc 100644 --- a/mmcv/image/io.py +++ b/mmcv/image/io.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import io import os.path as osp +import warnings from pathlib import Path import cv2 @@ -8,7 +9,8 @@ import numpy as np from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, IMREAD_UNCHANGED) -from mmcv.utils import check_file_exist, is_str, mkdir_or_exist +from mmcv.fileio import FileClient +from mmcv.utils import is_filepath, is_str try: from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG @@ -137,9 +139,16 @@ def _pillow2array(img, flag='color', channel_order='bgr'): return array -def imread(img_or_path, flag='color', channel_order='bgr', backend=None): +def imread(img_or_path, + flag='color', + channel_order='bgr', + backend=None, + file_client_args=None): """Read an image. + Note: + In v1.4.1 and later, add `file_client_args` parameters. + Args: img_or_path (ndarray or str or Path): Either a numpy array or str or pathlib.Path. If it is a numpy array (loaded image), then @@ -157,44 +166,42 @@ def imread(img_or_path, flag='color', channel_order='bgr', backend=None): `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is None, the global imread_backend specified by ``mmcv.use_backend()`` will be used. Default: None. + file_client_args (dict | None): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. Returns: ndarray: Loaded image array. + + Examples: + >>> import mmcv + >>> img_path = '/path/to/img.jpg' + >>> img = mmcv.imread(img_path) + >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb', + ... backend='cv2') + >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr', + ... backend='pillow') + >>> s3_img_path = 's3://bucket/img.jpg' + >>> # infer the file backend by the prefix s3 + >>> img = mmcv.imread(s3_img_path) + >>> # manually set the file backend petrel + >>> img = mmcv.imread(s3_img_path, file_client_args={ + ... 'backend': 'petrel'}) + >>> http_img_path = 'http://path/to/img.jpg' + >>> img = mmcv.imread(http_img_path) + >>> img = mmcv.imread(http_img_path, file_client_args={ + ... 'backend': 'http'}) """ - if backend is None: - backend = imread_backend - if backend not in supported_backends: - raise ValueError(f'backend: {backend} is not supported. Supported ' - "backends are 'cv2', 'turbojpeg', 'pillow'") if isinstance(img_or_path, Path): img_or_path = str(img_or_path) if isinstance(img_or_path, np.ndarray): return img_or_path elif is_str(img_or_path): - check_file_exist(img_or_path, - f'img file does not exist: {img_or_path}') - if backend == 'turbojpeg': - with open(img_or_path, 'rb') as in_file: - img = jpeg.decode(in_file.read(), - _jpegflag(flag, channel_order)) - if img.shape[-1] == 1: - img = img[:, :, 0] - return img - elif backend == 'pillow': - img = Image.open(img_or_path) - img = _pillow2array(img, flag, channel_order) - return img - elif backend == 'tifffile': - img = tifffile.imread(img_or_path) - return img - else: - flag = imread_flags[flag] if is_str(flag) else flag - img = cv2.imread(img_or_path, flag) - if flag == IMREAD_COLOR and channel_order == 'rgb': - cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) - return img + file_client = FileClient.infer_client(file_client_args, img_or_path) + img_bytes = file_client.get(img_or_path) + return imfrombytes(img_bytes, flag, channel_order, backend) else: raise TypeError('"img" must be a numpy array or a str or ' 'a pathlib.Path object') @@ -206,29 +213,45 @@ def imfrombytes(content, flag='color', channel_order='bgr', backend=None): Args: content (bytes): Image bytes got from files or other streams. 
flag (str): Same as :func:`imread`. + channel_order (str): The channel order of the output, candidates + are 'bgr' and 'rgb'. Default to 'bgr'. backend (str | None): The image decoding backend type. Options are - `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the - global imread_backend specified by ``mmcv.use_backend()`` will be - used. Default: None. + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is + None, the global imread_backend specified by ``mmcv.use_backend()`` + will be used. Default: None. Returns: ndarray: Loaded image array. + + Examples: + >>> img_path = '/path/to/img.jpg' + >>> with open(img_path, 'rb') as f: + >>> img_buff = f.read() + >>> img = mmcv.imfrombytes(img_buff) + >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb') + >>> img = mmcv.imfrombytes(img_buff, backend='pillow') + >>> img = mmcv.imfrombytes(img_buff, backend='cv2') """ if backend is None: backend = imread_backend if backend not in supported_backends: - raise ValueError(f'backend: {backend} is not supported. Supported ' - "backends are 'cv2', 'turbojpeg', 'pillow'") + raise ValueError( + f'backend: {backend} is not supported. Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'") if backend == 'turbojpeg': img = jpeg.decode(content, _jpegflag(flag, channel_order)) if img.shape[-1] == 1: img = img[:, :, 0] return img elif backend == 'pillow': - buff = io.BytesIO(content) - img = Image.open(buff) - img = _pillow2array(img, flag, channel_order) + with io.BytesIO(content) as buff: + img = Image.open(buff) + img = _pillow2array(img, flag, channel_order) + return img + elif backend == 'tifffile': + with io.BytesIO(content) as buff: + img = tifffile.imread(buff) return img else: img_np = np.frombuffer(content, np.uint8) @@ -239,20 +262,53 @@ def imfrombytes(content, flag='color', channel_order='bgr', backend=None): return img -def imwrite(img, file_path, params=None, auto_mkdir=True): +def imwrite(img, + file_path, + params=None, + auto_mkdir=None, + file_client_args=None): """Write image to file. + Note: + In v1.4.1 and later, add `file_client_args` parameters. + + Warning: + The parameter `auto_mkdir` will be deprecated in the future and every + file clients will make directory automatically. + Args: img (ndarray): Image array to be written. file_path (str): Image file path. params (None or list): Same as opencv :func:`imwrite` interface. auto_mkdir (bool): If the parent folder of `file_path` does not exist, - whether to create it automatically. + whether to create it automatically. It will be deprecated. + file_client_args (dict | None): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. Returns: bool: Successful or not. + + Examples: + >>> # write to hard disk client + >>> ret = mmcv.imwrite(img, '/path/to/img.jpg') + >>> # infer the file backend by the prefix s3 + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg') + >>> # manually set the file backend petrel + >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', file_client_args={ + ... 
'backend': 'petrel'}) """ - if auto_mkdir: - dir_name = osp.abspath(osp.dirname(file_path)) - mkdir_or_exist(dir_name) - return cv2.imwrite(file_path, img, params) + assert is_filepath(file_path) + file_path = str(file_path) + if auto_mkdir is not None: + warnings.warn( + 'The parameter `auto_mkdir` will be deprecated in the future and ' + 'every file clients will make directory automatically.') + file_client = FileClient.infer_client(file_client_args, file_path) + img_ext = osp.splitext(file_path)[-1] + # Encode image according to image suffix. + # For example, if image path is '/path/your/img.jpg', the encode + # format is '.jpg'. + flag, img_buff = cv2.imencode(img_ext, img, params) + file_client.put(img_buff.tobytes(), file_path) + return flag diff --git a/mmcv/image/misc.py b/mmcv/image/misc.py index dfc4a9c6e4c073a672a9a94a06bf0bf2a418c228..43934a689dd7ac6d35b772b7ce9921ff3b1fff50 100644 --- a/mmcv/image/misc.py +++ b/mmcv/image/misc.py @@ -9,18 +9,21 @@ except ImportError: torch = None -def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): - """Convert tensor to 3-channel images. +def tensor2imgs(tensor, mean=None, std=None, to_rgb=True): + """Convert tensor to 3-channel images or 1-channel gray images. Args: tensor (torch.Tensor): Tensor that contains multiple images, shape ( - N, C, H, W). - mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0). - std (tuple[float], optional): Standard deviation of images. - Defaults to (1, 1, 1). + N, C, H, W). :math:`C` can be either 3 or 1. + mean (tuple[float], optional): Mean of images. If None, + (0, 0, 0) will be used for tensor with 3-channel, + while (0, ) for tensor with 1-channel. Defaults to None. + std (tuple[float], optional): Standard deviation of images. If None, + (1, 1, 1) will be used for tensor with 3-channel, + while (1, ) for tensor with 1-channel. Defaults to None. to_rgb (bool, optional): Whether the tensor was converted to RGB format in the first place. If so, convert it back to BGR. - Defaults to True. + For the tensor with 1 channel, it must be False. Defaults to True. Returns: list[np.ndarray]: A list that contains multiple images. @@ -29,8 +32,14 @@ def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): if torch is None: raise RuntimeError('pytorch is not installed') assert torch.is_tensor(tensor) and tensor.ndim == 4 - assert len(mean) == 3 - assert len(std) == 3 + channels = tensor.size(1) + assert channels in [1, 3] + if mean is None: + mean = (0, ) * channels + if std is None: + std = (1, ) * channels + assert (channels == len(mean) == len(std) == 3) or \ + (channels == len(mean) == len(std) == 1 and not to_rgb) num_imgs = tensor.size(0) mean = np.array(mean, dtype=np.float32) diff --git a/mmcv/image/photometric.py b/mmcv/image/photometric.py index 5085d012019c0cbf56f66f421a378278c1a058ae..b41cea7172ae0ece858d868b73dc65deaea3510c 100644 --- a/mmcv/image/photometric.py +++ b/mmcv/image/photometric.py @@ -426,3 +426,46 @@ def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): clahe = cv2.createCLAHE(clip_limit, tile_grid_size) return clahe.apply(np.array(img, dtype=np.uint8)) + + +def adjust_hue(img: np.ndarray, hue_factor: float) -> np.ndarray: + """Adjust hue of an image. + + The image hue is adjusted by converting the image to HSV and cyclically + shifting the intensities in the hue channel (H). The image is then + converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. 
+ + Modified from + https://github.com/pytorch/vision/blob/main/torchvision/ + transforms/functional.py + + Args: + img (ndarray): Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + + Returns: + ndarray: Hue adjusted image. + """ + + if not (-0.5 <= hue_factor <= 0.5): + raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].') + if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})): + raise TypeError('img should be ndarray with dim=[2 or 3].') + + dtype = img.dtype + img = img.astype(np.uint8) + hsv_img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV_FULL) + h, s, v = cv2.split(hsv_img) + h = h.astype(np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + h += np.uint8(hue_factor * 255) + hsv_img = cv2.merge([h, s, v]) + return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2RGB_FULL).astype(dtype) diff --git a/mmcv/model_zoo/torchvision_0.12.json b/mmcv/model_zoo/torchvision_0.12.json new file mode 100644 index 0000000000000000000000000000000000000000..06defe67484dff91cf6f69109324cb1dd9d64bc3 --- /dev/null +++ b/mmcv/model_zoo/torchvision_0.12.json @@ -0,0 +1,57 @@ +{ + "alexnet": "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth", + "densenet121": "https://download.pytorch.org/models/densenet121-a639ec97.pth", + "densenet169": "https://download.pytorch.org/models/densenet169-b2777c0a.pth", + "densenet201": "https://download.pytorch.org/models/densenet201-c1103571.pth", + "densenet161": "https://download.pytorch.org/models/densenet161-8d451a50.pth", + "efficientnet_b0": "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth", + "efficientnet_b1": "https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth", + "efficientnet_b2": "https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth", + "efficientnet_b3": "https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth", + "efficientnet_b4": "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth", + "efficientnet_b5": "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth", + "efficientnet_b6": "https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth", + "efficientnet_b7": "https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth", + "googlenet": "https://download.pytorch.org/models/googlenet-1378be20.pth", + "inception_v3_google": "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth", + "mobilenet_v2": "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth", + "mobilenet_v3_large": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth", + "mobilenet_v3_small": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth", + "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-c65dace8.pth", + "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth", + "regnet_y_1_6gf": "https://download.pytorch.org/models/regnet_y_1_6gf-b11a554e.pth", + "regnet_y_3_2gf": "https://download.pytorch.org/models/regnet_y_3_2gf-b5a9779c.pth", + "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-d0d0e4a8.pth", + "regnet_y_16gf": 
"https://download.pytorch.org/models/regnet_y_16gf-9e6ed7dd.pth", + "regnet_y_32gf": "https://download.pytorch.org/models/regnet_y_32gf-4dee3f7a.pth", + "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth", + "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-ad17e45c.pth", + "regnet_x_1_6gf": "https://download.pytorch.org/models/regnet_x_1_6gf-e3633e7f.pth", + "regnet_x_3_2gf": "https://download.pytorch.org/models/regnet_x_3_2gf-f342aeae.pth", + "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-03ceed89.pth", + "regnet_x_16gf": "https://download.pytorch.org/models/regnet_x_16gf-2007eb11.pth", + "regnet_x_32gf": "https://download.pytorch.org/models/regnet_x_32gf-9d47f8d0.pth", + "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth", + "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth", + "resnet50": "https://download.pytorch.org/models/resnet50-0676ba61.pth", + "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth", + "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth", + "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", + "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", + "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth", + "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth", + "shufflenetv2_x0.5": "https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth", + "shufflenetv2_x1.0": "https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth", + "shufflenetv2_x1.5": null, + "shufflenetv2_x2.0": null, + "squeezenet1_0": "https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth", + "squeezenet1_1": "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth", + "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth", + "vgg13": "https://download.pytorch.org/models/vgg13-19584684.pth", + "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth", + "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth", + "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth", + "vgg13_bn": "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth", + "vgg16_bn": "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth", + "vgg19_bn": "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth" +} diff --git a/mmcv/onnx/info.py b/mmcv/onnx/info.py index e599973689245ff7c279bed0640842a9f0891750..b8325a9c0d0dc3b48b77e9da307341059017ea28 100644 --- a/mmcv/onnx/info.py +++ b/mmcv/onnx/info.py @@ -1,10 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. import os +import warnings import torch -def is_custom_op_loaded(): +def is_custom_op_loaded() -> bool: + + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This function will be deprecated in future. 
' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) + flag = False try: from ..tensorrt import is_tensorrt_plugin_loaded diff --git a/mmcv/onnx/onnx_utils/symbolic_helper.py b/mmcv/onnx/onnx_utils/symbolic_helper.py index a9a31eb4aeb24b6057acf9d4c352ee7e940377dd..cc9e96f8fbbb0cadec23411ddf93b31a90d049d0 100644 --- a/mmcv/onnx/onnx_utils/symbolic_helper.py +++ b/mmcv/onnx/onnx_utils/symbolic_helper.py @@ -59,7 +59,7 @@ def _parse_arg(value, desc): raise RuntimeError( "ONNX symbolic doesn't know to interpret ListConstruct node") - raise RuntimeError('Unexpected node type: {}'.format(value.node().kind())) + raise RuntimeError(f'Unexpected node type: {value.node().kind()}') def _maybe_get_const(value, desc): @@ -328,4 +328,4 @@ cast_pytorch_to_onnx = { # Global set to store the list of quantized operators in the network. # This is currently only used in the conversion of quantized ops from PT # -> C2 via ONNX. -_quantized_ops = set() +_quantized_ops: set = set() diff --git a/mmcv/onnx/symbolic.py b/mmcv/onnx/symbolic.py index 94cc1c620d01c4fa062cc4576fcb591f90923a65..3599b3f26683ea2d1907aa5e839e02e474791370 100644 --- a/mmcv/onnx/symbolic.py +++ b/mmcv/onnx/symbolic.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. """Modified from https://github.com/pytorch/pytorch.""" import os +import warnings import numpy as np import torch @@ -409,8 +410,8 @@ def cummin(g, input, dim): @parse_args('v', 'v', 'is') def roll(g, input, shifts, dims): - from torch.onnx.symbolic_opset9 import squeeze from packaging import version + from torch.onnx.symbolic_opset9 import squeeze input_shape = g.op('Shape', input) need_flatten = len(dims) == 0 @@ -467,6 +468,18 @@ def roll(g, input, shifts, dims): def register_extra_symbolics(opset=11): + # Following strings of text style are from colorama package + bright_style, reset_style = '\x1b[1m', '\x1b[0m' + red_text, blue_text = '\x1b[31m', '\x1b[34m' + white_background = '\x1b[107m' + + msg = white_background + bright_style + red_text + msg += 'DeprecationWarning: This function will be deprecated in future. ' + msg += blue_text + 'Welcome to use the unified model deployment toolbox ' + msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy' + msg += reset_style + warnings.warn(msg) + register_op('one_hot', one_hot, '', opset) register_op('im2col', im2col, '', opset) register_op('topk', topk, '', opset) diff --git a/mmcv/ops/__init__.py b/mmcv/ops/__init__.py old mode 100644 new mode 100755 index 999e090a458ee148ceca0649f1e3806a40e909bd..a65f14fff5f92039947d82a291fca09408f69f87 --- a/mmcv/ops/__init__.py +++ b/mmcv/ops/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from .active_rotated_filter import active_rotated_filter from .assign_score_withk import assign_score_withk from .ball_query import ball_query from .bbox import bbox_overlaps @@ -6,7 +7,9 @@ from .border_align import BorderAlign, border_align from .box_iou_rotated import box_iou_rotated from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive from .cc_attention import CrissCrossAttention +from .chamfer_distance import chamfer_distance from .contour_expand import contour_expand +from .convex_iou import convex_giou, convex_iou from .corner_pool import CornerPool from .correlation import Correlation from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d @@ -16,6 +19,7 @@ from .deprecated_wrappers import Conv2d_deprecated as Conv2d from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d from .deprecated_wrappers import Linear_deprecated as Linear from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d +from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, sigmoid_focal_loss, softmax_focal_loss) from .furthest_point_sample import (furthest_point_sample, @@ -25,9 +29,11 @@ from .gather_points import gather_points from .group_points import GroupAll, QueryAndGroup, grouping_operation from .info import (get_compiler_version, get_compiling_cuda_version, get_onnxruntime_op_path) -from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev +from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d, + nms3d_normal, nms_bev, nms_normal_bev) from .knn import knn from .masked_conv import MaskedConv2d, masked_conv2d +from .min_area_polygons import min_area_polygons from .modulated_deform_conv import (ModulatedDeformConv2d, ModulatedDeformConv2dPack, modulated_deform_conv2d) @@ -38,15 +44,25 @@ from .point_sample import (SimpleRoIAlign, point_sample, rel_roi_point_to_rel_img_point) from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, points_in_boxes_part) +from .points_in_polygons import points_in_polygons from .points_sampler import PointsSampler +from .prroi_pool import PrRoIPool, prroi_pool from .psa_mask import PSAMask +from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated from .roi_align import RoIAlign, roi_align from .roi_align_rotated import RoIAlignRotated, roi_align_rotated from .roi_pool import RoIPool, roi_pool from .roiaware_pool3d import RoIAwarePool3d from .roipoint_pool3d import RoIPointPool3d +from .rotated_feature_align import rotated_feature_align from .saconv import SAConv2d from .scatter_points import DynamicScatter, dynamic_scatter +from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d, + SparseConvTranspose3d, SparseInverseConv2d, + SparseInverseConv3d, SubMConv2d, SubMConv3d) +from .sparse_modules import SparseModule, SparseSequential +from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d +from .sparse_structure import SparseConvTensor, scatter_nd from .sync_bn import SyncBatchNorm from .three_interpolate import three_interpolate from .three_nn import three_nn @@ -70,12 +86,21 @@ __all__ = [ 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk', 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', + 'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated', 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', 'GroupAll', 'grouping_operation', 
'contour_expand', 'three_nn', 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', 'border_align', 'gather_points', 'furthest_point_sample', 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', - 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization', - 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', - 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all' + 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev', + 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization', + 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d', + 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d', + 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d', + 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d', + 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part', + 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons', + 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou', + 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance', + 'PrRoIPool', 'prroi_pool' ] diff --git a/mmcv/ops/active_rotated_filter.py b/mmcv/ops/active_rotated_filter.py new file mode 100644 index 0000000000000000000000000000000000000000..46c2aa7806ab62a6d0544f6dc1fb609af3a8a483 --- /dev/null +++ b/mmcv/ops/active_rotated_filter.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', + ['active_rotated_filter_forward', 'active_rotated_filter_backward']) + + +class ActiveRotatedFilterFunction(Function): + """Encoding the orientation information and generating orientation- + sensitive features. + + The details are described in the paper `Align Deep Features for Oriented + Object Detection _`. + """ + + @staticmethod + def forward(ctx, input: torch.Tensor, + indices: torch.Tensor) -> torch.Tensor: + """ + Args: + input (torch.Tensor): Input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. + indices (torch.Tensor): Indices with shape + [num_orientations, H, W, num_rotations]. + + Returns: + torch.Tensor: Refined features with shape [num_output_planes * + num_rotations, num_input_planes * num_orientations, H, W]. + """ + ctx.save_for_backward(input, indices) + op, ip, o, h, w = input.size() + o, h, w, r = indices.size() + output = input.new_zeros((op * r, ip * o, h, w)) + ext_module.active_rotated_filter_forward(input, indices, output) + + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]: + """ + Args: + grad_output (torch.Tensor): The gradiant of output features + with shape [num_output_planes * num_rotations, + num_input_planes * num_orientations, H, W]. + + Returns: + torch.Tensor: The gradiant of input features with shape + [num_output_planes, num_input_planes, num_orientations, H, W]. 
+ """ + input, indices = ctx.saved_tensors + grad_in = torch.zeros_like(input) + ext_module.active_rotated_filter_backward(grad_out, indices, grad_in) + return grad_in, None + + +active_rotated_filter = ActiveRotatedFilterFunction.apply diff --git a/mmcv/ops/assign_score_withk.py b/mmcv/ops/assign_score_withk.py index 4906adaa2cffd1b46912fbe7d4f87ef2f9fa0012..deca0892bddc52b51e9d2543a9e893f0bd67ebdb 100644 --- a/mmcv/ops/assign_score_withk.py +++ b/mmcv/ops/assign_score_withk.py @@ -1,3 +1,6 @@ +from typing import Tuple + +import torch from torch.autograd import Function from ..utils import ext_loader @@ -27,11 +30,11 @@ class AssignScoreWithK(Function): @staticmethod def forward(ctx, - scores, - point_features, - center_features, - knn_idx, - aggregate='sum'): + scores: torch.Tensor, + point_features: torch.Tensor, + center_features: torch.Tensor, + knn_idx: torch.Tensor, + aggregate: str = 'sum') -> torch.Tensor: """ Args: scores (torch.Tensor): (B, npoint, K, M), predicted scores to @@ -78,15 +81,20 @@ class AssignScoreWithK(Function): return output @staticmethod - def backward(ctx, grad_out): + def backward( + ctx, grad_out: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]: """ Args: grad_out (torch.Tensor): (B, out_dim, npoint, K) Returns: - grad_scores (torch.Tensor): (B, npoint, K, M) - grad_point_features (torch.Tensor): (B, N, M, out_dim) - grad_center_features (torch.Tensor): (B, N, M, out_dim) + tuple[torch.Tensor]: A tuple contains five elements. The first one + is the gradient of ``scores`` whose shape is (B, npoint, K, M). The + second is the gradient of ``point_features`` whose shape is + (B, N, M, out_dim). The third is the gradient of + ``center_features`` with the shape of (B, N, M, out_dim). The last + two are ``None``. """ _, point_features, center_features, scores, knn_idx = ctx.saved_tensors diff --git a/mmcv/ops/ball_query.py b/mmcv/ops/ball_query.py index d0466847c6e5c1239e359a0397568413ebc1504a..d24e0446ca81a19a9e2d4b822cb32533f941d78f 100644 --- a/mmcv/ops/ball_query.py +++ b/mmcv/ops/ball_query.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch from torch.autograd import Function @@ -18,12 +20,13 @@ class BallQuery(Function): min_radius (float): minimum radius of the balls. max_radius (float): maximum radius of the balls. sample_num (int): maximum number of features in the balls. - xyz (Tensor): (B, N, 3) xyz coordinates of the features. - center_xyz (Tensor): (B, npoint, 3) centers of the ball query. + xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features. + center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball + query. Returns: - Tensor: (B, npoint, nsample) tensor with the indices of - the features that form the query balls. + torch.Tensor: (B, npoint, nsample) tensor with the indices of the + features that form the query balls. """ assert center_xyz.is_contiguous() assert xyz.is_contiguous() @@ -48,7 +51,7 @@ class BallQuery(Function): return idx @staticmethod - def backward(ctx, a=None): + def backward(ctx, a=None) -> Tuple[None, None, None, None]: return None, None, None, None diff --git a/mmcv/ops/bbox.py b/mmcv/ops/bbox.py index 0c4d58b6c91f652933974f519acd3403a833e906..bf6bd43bbb0adcb4b6d104a815f73ed2e5912069 100644 --- a/mmcv/ops/bbox.py +++ b/mmcv/ops/bbox.py @@ -1,10 +1,57 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+import torch + from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps']) -def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): +def _bbox_overlaps_cpu(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + offset: int = 0) -> torch.Tensor: + assert mode in ['iou', 'iof'] + + if aligned: + lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2] + rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2] + + wh = (rb - lt + offset).clamp(min=0) # [rows, 2] + overlap = wh[:, 0] * wh[:, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( + bboxes1[:, 3] - bboxes1[:, 1] + offset) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( + bboxes2[:, 3] - bboxes2[:, 1] + offset) + ious = overlap / (area1 + area2 - overlap) + else: + ious = overlap / area1 + else: + lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2] + rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2] + + wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2] + overlap = wh[:, :, 0] * wh[:, :, 1] + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * ( + bboxes1[:, 3] - bboxes1[:, 1] + offset) + + if mode == 'iou': + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * ( + bboxes2[:, 3] - bboxes2[:, 1] + offset) + ious = overlap / (area1[:, None] + area2 - overlap) + else: + ious = overlap / (area1[:, None]) + + return ious + + +def bbox_overlaps(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + offset: int = 0) -> torch.Tensor: """Calculate overlap between two set of bboxes. If ``aligned`` is ``False``, then calculate the ious between each bbox @@ -12,14 +59,16 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): bboxes1 and bboxes2. Args: - bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty. - bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty. - If aligned is ``True``, then m and n must be equal. + bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or + empty. + bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or + empty. If aligned is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or iof (intersection over foreground). Returns: - ious(Tensor): shape (m, n) if aligned == False else shape (m, 1) + torch.Tensor: Return the ious between boxes. If ``aligned`` is + ``False``, the shape of ious is (m, n) else (m, 1).
Example: >>> bboxes1 = torch.FloatTensor([ @@ -63,10 +112,19 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): if rows * cols == 0: return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols) - if aligned: - ious = bboxes1.new_zeros(rows) + if bboxes1.device.type == 'cpu': + return _bbox_overlaps_cpu( + bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset) else: - ious = bboxes1.new_zeros((rows, cols)) - ext_module.bbox_overlaps( - bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) - return ious + if aligned: + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros((rows, cols)) + ext_module.bbox_overlaps( + bboxes1, + bboxes2, + ious, + mode=mode_flag, + aligned=aligned, + offset=offset) + return ious diff --git a/mmcv/ops/border_align.py b/mmcv/ops/border_align.py index ff305be328e9b0a15e1bbb5e6b41beb940f55c81..c09501b962cfce10b1da87e6b651d61911eb8406 100644 --- a/mmcv/ops/border_align.py +++ b/mmcv/ops/border_align.py @@ -2,6 +2,8 @@ # modified from # https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py +from typing import Tuple + import torch import torch.nn as nn from torch.autograd import Function @@ -21,7 +23,8 @@ class BorderAlignFunction(Function): 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) @staticmethod - def forward(ctx, input, boxes, pool_size): + def forward(ctx, input: torch.Tensor, boxes: torch.Tensor, + pool_size: int) -> torch.Tensor: ctx.pool_size = pool_size ctx.input_shape = input.size() @@ -45,7 +48,8 @@ class BorderAlignFunction(Function): @staticmethod @once_differentiable - def backward(ctx, grad_output): + def backward(ctx, + grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]: boxes, argmax_idx = ctx.saved_tensors grad_input = grad_output.new_zeros(ctx.input_shape) # complex head architecture may cause grad_output uncontiguous @@ -72,24 +76,25 @@ class BorderAlign(nn.Module): For each border line (e.g. top, left, bottom or right) of each box, border_align does the following: - 1. uniformly samples `pool_size`+1 positions on this line, involving \ - the start and end points. - 2. the corresponding features on these points are computed by \ - bilinear interpolation. - 3. max pooling over all the `pool_size`+1 positions are used for \ - computing pooled feature. + + 1. uniformly samples ``pool_size`` +1 positions on this line, involving + the start and end points. + 2. the corresponding features on these points are computed by bilinear + interpolation. + 3. max pooling over all the ``pool_size`` +1 positions are used for + computing pooled feature. Args: pool_size (int): number of positions sampled over the boxes' borders (e.g. top, bottom, left, right). - """ - def __init__(self, pool_size): - super(BorderAlign, self).__init__() + def __init__(self, pool_size: int): + super().__init__() self.pool_size = pool_size - def forward(self, input, boxes): + def forward(self, input: torch.Tensor, + boxes: torch.Tensor) -> torch.Tensor: """ Args: input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), @@ -98,8 +103,8 @@ class BorderAlign(nn.Module): boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). Returns: - Tensor: Pooled features with shape [N,C,H*W,4]. The order is - (top,left,bottom,right) for the last dimension. + torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is + (top,left,bottom,right) for the last dimension. 
""" return border_align(input, boxes, self.pool_size) diff --git a/mmcv/ops/box_iou_rotated.py b/mmcv/ops/box_iou_rotated.py index 2d78015e9c2a9e7a52859b4e18f84a9aa63481a0..2443af27c92146ed4328e8f94b1415c7e72c542b 100644 --- a/mmcv/ops/box_iou_rotated.py +++ b/mmcv/ops/box_iou_rotated.py @@ -1,10 +1,16 @@ # Copyright (c) OpenMMLab. All rights reserved. +import torch + from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) -def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): +def box_iou_rotated(bboxes1: torch.Tensor, + bboxes2: torch.Tensor, + mode: str = 'iou', + aligned: bool = False, + clockwise: bool = True) -> torch.Tensor: """Return intersection-over-union (Jaccard index) of boxes. Both sets of boxes are expected to be in @@ -14,18 +20,110 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. - Arguments: - boxes1 (Tensor): rotated bboxes 1. \ - It has shape (N, 5), indicating (x, y, w, h, theta) for each row. - Note that theta is in radian. - boxes2 (Tensor): rotated bboxes 2. \ - It has shape (M, 5), indicating (x, y, w, h, theta) for each row. - Note that theta is in radian. + .. note:: + The operator assumes: + + 1) The positive direction along x axis is left -> right. + + 2) The positive direction along y axis is top -> down. + + 3) The w border is in parallel with x axis when angle = 0. + + However, there are 2 opposite definitions of the positive angular + direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports + both definitions and uses CW by default. + + Please set ``clockwise=False`` if you are using the CCW definition. + + The coordinate system when ``clockwise`` is ``True`` (default) + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\ + \\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha + \\\\ + y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + + The coordinate system when ``clockwise`` is ``False`` + + .. code-block:: none + + 0-------------------> x (0 rad) + | A-------------B + | | | + | | box h + | | angle=0 | + | D------w------C + v + y (-pi/2 rad) + + In such coordination system the rotation matrix is + + .. math:: + \\begin{pmatrix} + \\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha + \\end{pmatrix} + + The coordinates of the corner point A can be calculated as: + + .. math:: + P_A= + \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix} + = + \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} + + \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\ + -\\sin\\alpha & \\cos\\alpha\\end{pmatrix} + \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\ + = + \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha + \\\\ + y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix} + + Args: + boxes1 (torch.Tensor): rotated bboxes 1. 
It has shape (N, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. + boxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5), + indicating (x, y, w, h, theta) for each row. Note that theta is in + radian. mode (str): "iou" (intersection over union) or iof (intersection over foreground). + clockwise (bool): flag indicating whether the positive angular + orientation is clockwise. default True. + `New in version 1.4.3.` Returns: - ious(Tensor): shape (N, M) if aligned == False else shape (N,) + torch.Tensor: Return the ious betweens boxes. If ``aligned`` is + ``False``, the shape of ious is (N, M) else (N,). """ assert mode in ['iou', 'iof'] mode_dict = {'iou': 0, 'iof': 1} @@ -35,7 +133,12 @@ def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): if aligned: ious = bboxes1.new_zeros(rows) else: - ious = bboxes1.new_zeros((rows * cols)) + ious = bboxes1.new_zeros(rows * cols) + if not clockwise: + flip_mat = bboxes1.new_ones(bboxes1.shape[-1]) + flip_mat[-1] = -1 + bboxes1 = bboxes1 * flip_mat + bboxes2 = bboxes2 * flip_mat bboxes1 = bboxes1.contiguous() bboxes2 = bboxes2.contiguous() ext_module.box_iou_rotated( diff --git a/mmcv/ops/carafe.py b/mmcv/ops/carafe.py index 5154cb3abfccfbbe0a1b2daa67018dbf80aaf6d2..18230c08074f5309e791810a4774e294084c3f5b 100644 --- a/mmcv/ops/carafe.py +++ b/mmcv/ops/carafe.py @@ -1,7 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch import torch.nn as nn import torch.nn.functional as F +from torch import Tensor from torch.autograd import Function from torch.nn.modules.module import Module @@ -17,7 +20,8 @@ ext_module = ext_loader.load_ext('_ext', [ class CARAFENaiveFunction(Function): @staticmethod - def symbolic(g, features, masks, kernel_size, group_size, scale_factor): + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: return g.op( 'mmcv::MMCVCARAFENaive', features, @@ -27,7 +31,8 @@ class CARAFENaiveFunction(Function): scale_factor_f=scale_factor) @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor @@ -50,12 +55,15 @@ class CARAFENaiveFunction(Function): group_size=group_size, scale_factor=scale_factor) - if features.requires_grad or masks.requires_grad: + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': ctx.save_for_backward(features, masks) return output @staticmethod - def backward(ctx, grad_output): + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: assert grad_output.is_cuda features, masks = ctx.saved_tensors @@ -83,8 +91,8 @@ carafe_naive = CARAFENaiveFunction.apply class CARAFENaive(Module): - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFENaive, self).__init__() + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) @@ -92,7 +100,7 @@ class CARAFENaive(Module): self.group_size = group_size self.scale_factor = scale_factor - def forward(self, features, masks): + def forward(self, features: Tensor, masks: Tensor) -> Tensor: return 
carafe_naive(features, masks, self.kernel_size, self.group_size, self.scale_factor) @@ -100,7 +108,8 @@ class CARAFENaive(Module): class CARAFEFunction(Function): @staticmethod - def symbolic(g, features, masks, kernel_size, group_size, scale_factor): + def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: return g.op( 'mmcv::MMCVCARAFE', features, @@ -110,7 +119,8 @@ class CARAFEFunction(Function): scale_factor_f=scale_factor) @staticmethod - def forward(ctx, features, masks, kernel_size, group_size, scale_factor): + def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int, + group_size: int, scale_factor: int) -> Tensor: assert scale_factor >= 1 assert masks.size(1) == kernel_size * kernel_size * group_size assert masks.size(-1) == features.size(-1) * scale_factor @@ -139,12 +149,15 @@ class CARAFEFunction(Function): group_size=group_size, scale_factor=scale_factor) - if features.requires_grad or masks.requires_grad: + if features.requires_grad or masks.requires_grad or \ + torch.__version__ == 'parrots': ctx.save_for_backward(features, masks, rfeatures) return output @staticmethod - def backward(ctx, grad_output): + def backward( + ctx, + grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]: assert grad_output.is_cuda features, masks, rfeatures = ctx.saved_tensors @@ -180,7 +193,8 @@ carafe = CARAFEFunction.apply class CARAFE(Module): """ CARAFE: Content-Aware ReAssembly of FEatures - Please refer to https://arxiv.org/abs/1905.02188 for more details. + Please refer to `CARAFE: Content-Aware ReAssembly of FEatures + `_ for more details. Args: kernel_size (int): reassemble kernel size @@ -191,8 +205,8 @@ class CARAFE(Module): upsampled feature map """ - def __init__(self, kernel_size, group_size, scale_factor): - super(CARAFE, self).__init__() + def __init__(self, kernel_size: int, group_size: int, scale_factor: int): + super().__init__() assert isinstance(kernel_size, int) and isinstance( group_size, int) and isinstance(scale_factor, int) @@ -200,7 +214,7 @@ class CARAFE(Module): self.group_size = group_size self.scale_factor = scale_factor - def forward(self, features, masks): + def forward(self, features: Tensor, masks: Tensor) -> Tensor: return carafe(features, masks, self.kernel_size, self.group_size, self.scale_factor) @@ -211,8 +225,8 @@ class CARAFEPack(nn.Module): compressor 2) content encoder 3) CARAFE op. Official implementation of ICCV 2019 paper - CARAFE: Content-Aware ReAssembly of FEatures - Please refer to https://arxiv.org/abs/1905.02188 for more details. + `CARAFE: Content-Aware ReAssembly of FEatures + `_. 
Args: channels (int): input feature channels @@ -228,14 +242,14 @@ class CARAFEPack(nn.Module): """ def __init__(self, - channels, - scale_factor, - up_kernel=5, - up_group=1, - encoder_kernel=3, - encoder_dilation=1, - compressed_channels=64): - super(CARAFEPack, self).__init__() + channels: int, + scale_factor: int, + up_kernel: int = 5, + up_group: int = 1, + encoder_kernel: int = 3, + encoder_dilation: int = 1, + compressed_channels: int = 64): + super().__init__() self.channels = channels self.scale_factor = scale_factor self.up_kernel = up_kernel @@ -261,7 +275,7 @@ class CARAFEPack(nn.Module): xavier_init(m, distribution='uniform') normal_init(self.content_encoder, std=0.001) - def kernel_normalizer(self, mask): + def kernel_normalizer(self, mask: Tensor) -> Tensor: mask = F.pixel_shuffle(mask, self.scale_factor) n, mask_c, h, w = mask.size() # use float division explicitly, @@ -274,11 +288,11 @@ class CARAFEPack(nn.Module): return mask - def feature_reassemble(self, x, mask): + def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor: x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor) return x - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: compressed_x = self.channel_compressor(x) mask = self.content_encoder(compressed_x) mask = self.kernel_normalizer(mask) diff --git a/mmcv/ops/cc_attention.py b/mmcv/ops/cc_attention.py index ff8dd4c56849d504d265346316e2f8abb0a66598..9e5d3325263f18f6b5eb0bfbc522eeaef1999e3b 100644 --- a/mmcv/ops/cc_attention.py +++ b/mmcv/ops/cc_attention.py @@ -6,7 +6,7 @@ import torch.nn.functional as F from mmcv.cnn import PLUGIN_LAYERS, Scale -def NEG_INF_DIAG(n, device): +def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor: """Returns a diagonal matrix of size [n, n]. The diagonal are all "-inf". This is for avoiding calculating the @@ -41,7 +41,7 @@ class CrissCrossAttention(nn.Module): in_channels (int): Channels of the input feature map. """ - def __init__(self, in_channels): + def __init__(self, in_channels: int) -> None: super().__init__() self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) @@ -49,14 +49,15 @@ class CrissCrossAttention(nn.Module): self.gamma = Scale(0.) self.in_channels = in_channels - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: """forward function of Criss-Cross Attention. Args: - x (Tensor): Input feature. \ - shape (batch_size, in_channels, height, width) + x (torch.Tensor): Input feature with the shape of + (batch_size, in_channels, height, width). + Returns: - Tensor: Output of the layer, with shape of \ + torch.Tensor: Output of the layer, with the shape of (batch_size, in_channels, height, width) """ B, C, H, W = x.size() @@ -77,7 +78,7 @@ class CrissCrossAttention(nn.Module): return out - def __repr__(self): + def __repr__(self) -> str: s = self.__class__.__name__ s += f'(in_channels={self.in_channels})' return s diff --git a/mmcv/ops/chamfer_distance.py b/mmcv/ops/chamfer_distance.py new file mode 100644 index 0000000000000000000000000000000000000000..d68eafb47c85418c374a1eaf086478e3fc0cb1d1 --- /dev/null +++ b/mmcv/ops/chamfer_distance.py @@ -0,0 +1,95 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Sequence, Tuple + +import torch +from torch import Tensor +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward']) + + +class ChamferDistanceFunction(Function): + """This is an implementation of the 2D Chamfer Distance. + + It has been used in the paper `Oriented RepPoints for Aerial Object + Detection (CVPR 2022) _`. + """ + + @staticmethod + def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]: + """ + Args: + xyz1 (Tensor): Point set with shape (B, N, 2). + xyz2 (Tensor): Point set with shape (B, N, 2). + + Returns: + Sequence[Tensor]: + + - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with + shape (B, N). + - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with + shape (B, N). + - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) + with shape (B, N), which is used to compute the gradient. + - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1) + with shape (B, N), which is used to compute the gradient. + """ + batch_size, n, _ = xyz1.size() + _, m, _ = xyz2.size() + device = xyz1.device + xyz1 = xyz1.contiguous() + xyz2 = xyz2.contiguous() + + dist1 = torch.zeros(batch_size, n).to(device) + dist2 = torch.zeros(batch_size, m).to(device) + idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device) + idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device) + + ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1, + idx2) + ctx.save_for_backward(xyz1, xyz2, idx1, idx2) + return dist1, dist2, idx1, idx2 + + @staticmethod + @once_differentiable + def backward(ctx, grad_dist1: Tensor, grad_dist2: Tensor, + grad_idx1: Tensor, + grad_idx2: Tensor) -> Tuple[Tensor, Tensor]: + """ + + Args: + grad_dist1 (Tensor): Gradient of chamfer distance + (xyz1 to xyz2) with shape (B, N). + grad_dist2 (Tensor): Gradient of chamfer distance + (xyz2 to xyz1) with shape (B, N). + grad_idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2) + with shape (B, N), which is used to compute the gradient. + grad_idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1) + with shape (B, N), which is used to compute the gradient. + + Returns: + Tuple[Tensor, Tensor]: + + - grad_xyz1 (Tensor): Gradient of the point set with shape \ + (B, N, 2). + - grad_xyz2 (Tensor): Gradient of the point set with shape \ + (B, N, 2). + """ + xyz1, xyz2, idx1, idx2 = ctx.saved_tensors + device = grad_dist1.device + grad_dist1 = grad_dist1.contiguous() + grad_dist2 = grad_dist2.contiguous() + grad_xyz1 = torch.zeros(xyz1.size()).to(device) + grad_xyz2 = torch.zeros(xyz2.size()).to(device) + + ext_module.chamfer_distance_backward(xyz1, xyz2, grad_xyz1, grad_xyz2, + grad_dist1, grad_dist2, idx1, + idx2) + return grad_xyz1, grad_xyz2 + + +chamfer_distance = ChamferDistanceFunction.apply diff --git a/mmcv/ops/contour_expand.py b/mmcv/ops/contour_expand.py index ea1111e1768b5f27e118bf7dbc0d9c70a7afd6d7..7184609ad9b64d421c17fdfe4a1a0dbeb62d64c8 100644 --- a/mmcv/ops/contour_expand.py +++ b/mmcv/ops/contour_expand.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union + import numpy as np import torch @@ -7,21 +9,22 @@ from ..utils import ext_loader ext_module = ext_loader.load_ext('_ext', ['contour_expand']) -def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, - kernel_num): +def contour_expand(kernel_mask: Union[np.array, torch.Tensor], + internal_kernel_label: Union[np.array, torch.Tensor], + min_kernel_area: int, kernel_num: int) -> list: """Expand kernel contours so that foreground pixels are assigned into instances. - Arguments: - kernel_mask (np.array or Tensor): The instance kernel mask with + Args: + kernel_mask (np.array or torch.Tensor): The instance kernel mask with size hxw. - internal_kernel_label (np.array or Tensor): The instance internal + internal_kernel_label (np.array or torch.Tensor): The instance internal kernel label with size hxw. min_kernel_area (int): The minimum kernel area. kernel_num (int): The instance kernel number. Returns: - label (list): The instance index map with size hxw. + list: The instance index map with size hxw. """ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) @@ -42,7 +45,7 @@ def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, internal_kernel_label, min_kernel_area=min_kernel_area, kernel_num=kernel_num) - label = label.tolist() + label = label.tolist() # type: ignore else: label = ext_module.contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, kernel_num) diff --git a/mmcv/ops/convex_iou.py b/mmcv/ops/convex_iou.py new file mode 100644 index 0000000000000000000000000000000000000000..50050363ac5b08cfa8f86dd186ab7087fac6f48a --- /dev/null +++ b/mmcv/ops/convex_iou.py @@ -0,0 +1,52 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou']) + + +def convex_giou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Return generalized intersection-over-union (Jaccard index) between point + sets and polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (N, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + tuple[torch.Tensor, torch.Tensor]: The first element is the gious + between point sets and polygons with the shape (N,). The second + element is the gradient of point sets with the shape (N, 18). + """ + output = pointsets.new_zeros((pointsets.size(0), 19)) + ext_module.convex_giou(pointsets, polygons, output) + convex_giou = output[:, -1] + points_grad = output[:, 0:-1] + return convex_giou, points_grad + + +def convex_iou(pointsets: torch.Tensor, + polygons: torch.Tensor) -> torch.Tensor: + """Return intersection-over-union (Jaccard index) between point sets and + polygons. + + Args: + pointsets (torch.Tensor): It has shape (N, 18), + indicating (x1, y1, x2, y2, ..., x9, y9) for each row. + polygons (torch.Tensor): It has shape (K, 8), + indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row. + + Returns: + torch.Tensor: Return the ious between point sets and polygons with the + shape (N, K). 
+ """ + N, K = pointsets.size(0), polygons.size(0) + ious = pointsets.new_zeros((N, K)) + ext_module.convex_iou(pointsets, polygons, ious) + return ious diff --git a/mmcv/ops/corner_pool.py b/mmcv/ops/corner_pool.py index a33d798b43d405e4c86bee4cd6389be21ca9c637..17ce24952a3b229fb552f450429c948e70aefa19 100644 --- a/mmcv/ops/corner_pool.py +++ b/mmcv/ops/corner_pool.py @@ -1,101 +1,90 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch -from torch import nn +from torch import Tensor, nn from torch.autograd import Function -from ..utils import ext_loader +_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} -ext_module = ext_loader.load_ext('_ext', [ - 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward', - 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward', - 'right_pool_forward', 'right_pool_backward' -]) -_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} +def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor: + size = x.size(dim) + output = x.clone() + + ind = 1 + while ind < size: + if flip: + cur_start = 0 + cur_len = size - ind + next_start = ind + next_len = size - ind + else: + cur_start = ind + cur_len = size - ind + next_start = 0 + next_len = size - ind + + # max_temp should be cloned for backward computation + max_temp = output.narrow(dim, cur_start, cur_len).clone() + cur_temp = output.narrow(dim, cur_start, cur_len) + next_temp = output.narrow(dim, next_start, next_len) + + cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp) + + ind = ind << 1 + + return output class TopPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top'])) return output @staticmethod - def forward(ctx, input): - output = ext_module.top_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.top_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 2, True) class BottomPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom'])) return output @staticmethod - def forward(ctx, input): - output = ext_module.bottom_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.bottom_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 2, False) class LeftPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left'])) return output @staticmethod - def forward(ctx, input): - output = ext_module.left_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.left_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 3, True) class RightPoolFunction(Function): @staticmethod - def symbolic(g, input): + def symbolic(g, input: Tensor) -> Tensor: output = g.op( 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right'])) return output 
@staticmethod - def forward(ctx, input): - output = ext_module.right_pool_forward(input) - ctx.save_for_backward(input) - return output - - @staticmethod - def backward(ctx, grad_output): - input, = ctx.saved_tensors - output = ext_module.right_pool_backward(input, grad_output) - return output + def forward(ctx, input: Tensor) -> Tensor: + return _corner_pool(input, 3, False) class CornerPool(nn.Module): @@ -104,11 +93,13 @@ class CornerPool(nn.Module): Corner Pooling is a new type of pooling layer that helps a convolutional network better localize corners of bounding boxes. - Please refer to https://arxiv.org/abs/1808.01244 for more details. + Please refer to `CornerNet: Detecting Objects as Paired Keypoints + `_ for more details. + Code is modified from https://github.com/princeton-vl/CornerNet-Lite. Args: - mode(str): Pooling orientation for the pooling layer + mode (str): Pooling orientation for the pooling layer - 'bottom': Bottom Pooling - 'left': Left Pooling @@ -133,13 +124,13 @@ class CornerPool(nn.Module): 'top': (2, True), } - def __init__(self, mode): - super(CornerPool, self).__init__() + def __init__(self, mode: str): + super().__init__() assert mode in self.pool_functions self.mode = mode - self.corner_pool = self.pool_functions[mode] + self.corner_pool: Function = self.pool_functions[mode] - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0': if torch.onnx.is_in_onnx_export(): assert torch.__version__ >= '1.7.0', \ @@ -158,4 +149,8 @@ class CornerPool(nn.Module): pool_tensor = pool_tensor.flip(dim) return pool_tensor else: - return self.corner_pool.apply(x) + if torch.onnx.is_in_onnx_export(): + return self.corner_pool.apply(x) + else: + dim, flip = self.cummax_dim_flip[self.mode] + return _corner_pool(x, dim, flip) diff --git a/mmcv/ops/correlation.py b/mmcv/ops/correlation.py index 3d0b79c301b29915dfaf4d2b1846c59be73127d3..319b7646782637e9ebaac4ef07b82d1f460031b5 100644 --- a/mmcv/ops/correlation.py +++ b/mmcv/ops/correlation.py @@ -1,4 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + import torch from torch import Tensor, nn from torch.autograd import Function @@ -15,14 +17,14 @@ class CorrelationFunction(Function): @staticmethod def forward(ctx, - input1, - input2, - kernel_size=1, - max_displacement=1, - stride=1, - padding=1, - dilation=1, - dilation_patch=1): + input1: Tensor, + input2: Tensor, + kernel_size: int = 1, + max_displacement: int = 1, + stride: int = 1, + padding: int = 1, + dilation: int = 1, + dilation_patch: int = 1) -> Tensor: ctx.save_for_backward(input1, input2) @@ -60,7 +62,9 @@ class CorrelationFunction(Function): @staticmethod @once_differentiable - def backward(ctx, grad_output): + def backward( + ctx, grad_output: Tensor + ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]: input1, input2 = ctx.saved_tensors kH, kW = ctx.kernel_size diff --git a/mmcv/ops/csrc/README.md b/mmcv/ops/csrc/README.md index 3bc02004017a0d607131b4de168b320c3beed23c..dbc82b534b1ab27593361b3053cb61e12fbd420e 100644 --- a/mmcv/ops/csrc/README.md +++ b/mmcv/ops/csrc/README.md @@ -13,11 +13,19 @@ This folder contains all non-python code for MMCV custom ops. Please follow the │ ├── pytorch_cpp_helper.hpp │ ├── pytorch_cuda_helper.hpp │ ├── pytorch_device_registry.hpp -│   └── cuda -│   ├── common_cuda_helper.hpp -│   ├── parrots_cudawarpfunction.cuh -│   ├── ... 
-│   └── ops_cuda_kernel.cuh +│   ├── cuda +│   │ ├── common_cuda_helper.hpp +│   │ ├── parrots_cudawarpfunction.cuh +│   │ ├── ... +│   │ └── ops_cuda_kernel.cuh +|   ├── mps +│   │ ├── MPSLibrary.h +│   │ ├── ... +│   │ └── MPSUtils.h +|   ├── mlu +│   │ └── ... +|   └── utils +│   │ └── ... ├── onnxruntime │   ├── onnxruntime_register.h │   ├── onnxruntime_session_options_config_keys.h @@ -41,9 +49,15 @@ This folder contains all non-python code for MMCV custom ops. Please follow the │   ├── cuda │   │   ├── ... │   │   └── ops_cuda.cu -│   └── cpu +│   ├── cpu +│   │   ├── ... +│   │   └── ops.cpp +│   ├── mps +│   │   ├── ... +│   |   └── op_mps.mm +│   └── mlu │      ├── ... -│      └── ops.cpp +│      └── op_mlu.cpp └── tensorrt ├── trt_cuda_helper.cuh ├── trt_plugin_helper.hpp @@ -63,108 +77,113 @@ This folder contains all non-python code for MMCV custom ops. Please follow the - `common`: This directory contains all tools and shared codes. - `cuda`: The cuda kernels which can be shared by all backends. **HIP** kernel is also here since they have similar syntax. -- `onnxruntime`: **ONNX Runtime** support for custom ops. + - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**. + - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device. + - `utils`: The kernels and utils of spconv. +- `onnxruntime`: **ONNX Runtime** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy). - `cpu`: CPU implementation of supported ops. - `parrots`: **Parrots** is a deep learning frame for model training and inference. Parrots custom ops are placed in this directory. - `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory. - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensor to the cuda kernel in `common/cuda`. The launchers provide c++ interface of cuda implementation of corresponding custom ops. - `cpu`: This directory contain cpu implementations of corresponding custom ops. -- `tensorrt`: **TensorRT** support for custom ops. + - `mlu`: This directory contain launchers of each MLU kernels. + - `mps`: MPS ops implementation and launchers. +- `tensorrt`: **TensorRT** support for custom ops. Has been deprecated, please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy). - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`. ## How to add new PyTorch ops? 1. (Optional) Add shared kernel in `common` to support special hardware platform. - ```c++ - // src/common/cuda/new_ops_cuda_kernel.cuh - - template - __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { - // forward here - } - - ``` - - Add cuda kernel launcher in `pytorch/cuda`. - - ```c++ - // src/pytorch/cuda - #include - - void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ - // initialize - at::cuda::CUDAGuard device_guard(input.device()); - cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - ... 
- AT_DISPATCH_FLOATING_TYPES_AND_HALF( - input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { - new_ops_forward_cuda_kernel - <<>>( - input.data_ptr(), output.data_ptr(),...); - })); - AT_CUDA_CHECK(cudaGetLastError()); - } - ``` + ```c++ + // src/common/cuda/new_ops_cuda_kernel.cuh + + template + __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) { + // forward here + } + + ``` + + Add cuda kernel launcher in `pytorch/cuda`. + + ```c++ + // src/pytorch/cuda + #include + + void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){ + // initialize + at::cuda::CUDAGuard device_guard(input.device()); + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + ... + AT_DISPATCH_FLOATING_TYPES_AND_HALF( + input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] { + new_ops_forward_cuda_kernel + <<>>( + input.data_ptr(), output.data_ptr(),...); + })); + AT_CUDA_CHECK(cudaGetLastError()); + } + ``` 2. Register implementation for different devices. - ```c++ - // src/pytorch/cuda/cudabind.cpp - ... + ```c++ + // src/pytorch/cuda/cudabind.cpp + ... - Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ - // implement cuda forward here - // use `NewOpsForwardCUDAKernelLauncher` here - } - // declare interface here. - Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); - // register the implementation for given device (CUDA here). - REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); - ``` + Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){ + // implement cuda forward here + // use `NewOpsForwardCUDAKernelLauncher` here + } + // declare interface here. + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...); + // register the implementation for given device (CUDA here). + REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda); + ``` 3. Add ops implementation in `pytorch` directory. Select different implementations according to device type. - ```c++ - // src/pytorch/new_ops.cpp - Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ - // dispatch the implementation according to the device type of input. - DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); - } - ... + ```c++ + // src/pytorch/new_ops.cpp + Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){ + // dispatch the implementation according to the device type of input. + DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...); + } + ... - Tensor new_ops_forward(Tensor input, Tensor output, ...){ - return new_ops_forward_impl(input, output, ...); - } - ``` + Tensor new_ops_forward(Tensor input, Tensor output, ...){ + return new_ops_forward_impl(input, output, ...); + } + ``` 4. Binding the implementation in `pytorch/pybind.cpp` - ```c++ - // src/pytorch/pybind.cpp + ```c++ + // src/pytorch/pybind.cpp - ... + ... - Tensor new_ops_forward(Tensor input, Tensor output, ...); + Tensor new_ops_forward(Tensor input, Tensor output, ...); - ... + ... - // bind with pybind11 - m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", - py::arg("input"), py::arg("output"), ...); + // bind with pybind11 + m.def("new_ops_forward", &new_ops_forward, "new_ops_forward", + py::arg("input"), py::arg("output"), ...); - ... + ... - ``` + ``` 5. Build MMCV again. 
Enjoy new ops in python - ```python - from ..utils import ext_loader - ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) + ```python + from ..utils import ext_loader + ext_module = ext_loader.load_ext('_ext', ['new_ops_forward']) - ... + ... - ext_module.new_ops_forward(input, output, ...) + ext_module.new_ops_forward(input, output, ...) - ``` + ``` diff --git a/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp index 67190dc10eb245bb2bea23133ac984cd1c5a4888..243200e156f1384b625d6bac7fa4c68e533d9441 100644 --- a/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp +++ b/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp @@ -220,6 +220,10 @@ HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], return temp > 0; } }); + // compute distance to origin after sort, since the points are now different. + for (int i = 0; i < num_in; i++) { + dist[i] = dot_2d(q[i], q[i]); + } #endif // Step 4: diff --git a/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..36e41107ebd52d3cf5e9a71cffe6eddeed4f0765 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh @@ -0,0 +1,59 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu +#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH +#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void active_rotated_filter_forward_cuda_kernel( + const int nthreads, const scalar_t* weight_data, const int* indices_data, + const int num_input_planes, const int num_output_planes, + const int num_orientations, const int num_rotations, const int nEntry, + scalar_t* output_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t val = *(weight_data + index); + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t* target = output_data + + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx; + *target = val; + } + } +} + +template +__global__ void active_rotated_filter_backward_cuda_kernel( + const int nthreads, const scalar_t* gradWeight_data, + const int* indices_data, const int num_input_planes, + const int num_output_planes, const int num_orientations, + const int num_rotations, const int nEntry, scalar_t* weight_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int l = index % nEntry; + int j = (index / nEntry) % num_input_planes; + int i = index / nEntry / num_input_planes; + int k; + scalar_t* val = weight_data + index; + *val = 0; + scalar_t tmp = 0; + for (k = 0; k < num_rotations; k++) { + int idx = (int)(*(indices_data + l * num_rotations + k)) - 1; + scalar_t target = + *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) + + k * (num_input_planes * nEntry) + j * (nEntry) + idx); + tmp = tmp + target; + } + *val = tmp; + } +} +#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh index 
056d12334b555bbbf14253382736bd6329805559..9f9250844b9ceeca0df0377640c3d28e3f61cecc 100644 --- a/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh @@ -22,34 +22,34 @@ __global__ void assign_score_withk_forward_cuda_kernel( const int O, const int aggregate, const T* points, const T* centers, const T* scores, const int64_t* knn_idx, T* output) { // ----- parallel loop for B, N1, K and O --------- - long i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= B * N1 * K * O) return; - // ------- loop for M ---------- - const int b = (int)(i / (O * N1 * K)); - const int o = (int)(i % (O * N1 * K) / (N1 * K)); - const int n = (int)(i % (N1 * K) / K); - const int k = (int)(i % K); - const int cn = (int)knn_idx[b * K * N1 + n * K + - 0]; // The first neighbor is the center point - const int kn = (int)knn_idx[b * K * N1 + n * K + k]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - return; - } - assert(b < B); - assert(kn < N0); - assert(cn < N0); - assert(o < O); - assert(n < N1); - const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; - T val = output[out_idx]; - for (int m = 0; m < M; m++) { - val += points[b * N0 * M * O + kn * M * O + m * O + o] * - scores[b * N1 * K * M + n * K * M + k * M + m] - - centers[b * N0 * M * O + cn * M * O + m * O + o] * - scores[b * N1 * K * M + n * K * M + k * M + m]; + CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) { + // ------- loop for M ---------- + const int b = (int)(i / (O * N1 * K)); + const int o = (int)(i % (O * N1 * K) / (N1 * K)); + const int n = (int)(i % (N1 * K) / K); + const int k = (int)(i % K); + const int cn = (int)knn_idx[b * K * N1 + n * K + + 0]; // The first neighbor is the center point + const int kn = (int)knn_idx[b * K * N1 + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } + assert(b < B); + assert(kn < N0); + assert(cn < N0); + assert(o < O); + assert(n < N1); + const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k; + T val = output[out_idx]; + for (int m = 0; m < M; m++) { + val += points[b * N0 * M * O + kn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m] - + centers[b * N0 * M * O + cn * M * O + m * O + o] * + scores[b * N1 * K * M + n * K * M + k * M + m]; + } + output[out_idx] = val; } - output[out_idx] = val; } template @@ -58,27 +58,27 @@ __global__ void assign_score_withk_points_backward_cuda_kernel( const int O, const int aggregate, const T* grad_out, const T* scores, const int64_t* knn_idx, T* grad_points, T* grad_centers) { // ----- parallel loop for B, M, O --------- - long i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= B * M * O) return; - int b = (int)(i / (M * O)); - int m = (int)(i % (M * O) / O); - int o = (int)(i % O); + CUDA_1D_KERNEL_LOOP(i, B * M * O) { + int b = (int)(i / (M * O)); + int m = (int)(i % (M * O) / O); + int o = (int)(i % O); - // ----- loop for N,K --------- - for (int n = 0; n < N; n++) { - for (int k = 0; k < K; k++) { - int kn = knn_idx[b * N * K + n * K + k]; - int cn = knn_idx[b * N * K + n * K + 0]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - continue; + // ----- loop for N,K --------- + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + int kn = knn_idx[b * N * K + n * K + k]; + int cn = knn_idx[b * N * K + n * K + 0]; + if (kn >= N0 || kn < 0) { // if index overflows, it is out of the + // neighborhood range + 
continue; + } + atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, + scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); + atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, + -scores[b * N * K * M + n * K * M + k * M + m] * + grad_out[b * O * N * K + o * N * K + n * K + k]); } - atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o, - scores[b * N * K * M + n * K * M + k * M + m] * - grad_out[b * O * N * K + o * N * K + n * K + k]); - atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o, - -scores[b * N * K * M + n * K * M + k * M + m] * - grad_out[b * O * N * K + o * N * K + n * K + k]); } } } @@ -89,28 +89,28 @@ __global__ void assign_score_withk_scores_backward_cuda_kernel( const int O, const int aggregate, const T* grad_out, const T* points, const T* centers, const int64_t* knn_idx, T* grad_scores) { // ----- parallel loop for B, N, K, M --------- - long i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= B * N * K * M) return; - const int b = (int)(i / (N * M * K)); - const int n = (int)(i % (N * M * K) / M / K); - const int k = (int)(i % (M * K) / M); - const int m = (int)(i % M); - const int cn = knn_idx[b * N * K + n * K + 0]; - const int kn = knn_idx[b * N * K + n * K + k]; - if (kn >= N0 || - kn < 0) { // if index overflows, it is out of the neighborhood range - return; - } + CUDA_1D_KERNEL_LOOP(i, B * N * K * M) { + const int b = (int)(i / (N * M * K)); + const int n = (int)(i % (N * M * K) / M / K); + const int k = (int)(i % (M * K) / M); + const int m = (int)(i % M); + const int cn = knn_idx[b * N * K + n * K + 0]; + const int kn = knn_idx[b * N * K + n * K + k]; + if (kn >= N0 || + kn < 0) { // if index overflows, it is out of the neighborhood range + return; + } - // -------------- loop for O ------------------------ - const int out_idx = b * N * K * M + n * K * M + k * M + m; - T val = grad_scores[out_idx]; - for (int o = 0; o < O; o++) { - val += (points[b * N0 * M * O + kn * M * O + m * O + o] - - centers[b * N0 * M * O + cn * M * O + m * O + o]) * - grad_out[b * O * N * K + o * N * K + n * K + k]; + // -------------- loop for O ------------------------ + const int out_idx = b * N * K * M + n * K * M + k * M + m; + T val = grad_scores[out_idx]; + for (int o = 0; o < O; o++) { + val += (points[b * N0 * M * O + kn * M * O + m * O + o] - + centers[b * N0 * M * O + cn * M * O + m * O + o]) * + grad_out[b * O * N * K + o * N * K + n * K + k]; + } + grad_scores[out_idx] = val; } - grad_scores[out_idx] = val; } #endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh index ba2af01b5e4c67ec8498ac167e26a5116d853b62..632b5c4940b33a9d8d839fa3f3b92e7b6a2bd29e 100644 --- a/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh @@ -21,35 +21,36 @@ __global__ void ball_query_forward_cuda_kernel(int b, int n, int m, // output: // idx: (B, M, nsample) int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= m) return; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; - new_xyz += bs_idx * m * 3 + pt_idx * 3; - xyz += bs_idx * n * 3; - idx += bs_idx * m * nsample + pt_idx * nsample; + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; - float max_radius2 = max_radius * max_radius; - float min_radius2 
= min_radius * min_radius; - T new_x = new_xyz[0]; - T new_y = new_xyz[1]; - T new_z = new_xyz[2]; + float max_radius2 = max_radius * max_radius; + float min_radius2 = min_radius * min_radius; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; - int cnt = 0; - for (int k = 0; k < n; ++k) { - T x = xyz[k * 3 + 0]; - T y = xyz[k * 3 + 1]; - T z = xyz[k * 3 + 2]; - T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + - (new_z - z) * (new_z - z); - if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { - if (cnt == 0) { - for (int l = 0; l < nsample; ++l) { - idx[l] = k; + int cnt = 0; + for (int k = 0; k < n; ++k) { + T x = xyz[k * 3 + 0]; + T y = xyz[k * 3 + 1]; + T z = xyz[k * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) { + if (cnt == 0) { + for (int l = 0; l < nsample; ++l) { + idx[l] = k; + } } + idx[cnt] = k; + ++cnt; + if (cnt >= nsample) break; } - idx[cnt] = k; - ++cnt; - if (cnt >= nsample) break; } } } diff --git a/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh index 249c9e85009d00af2bee5380a0013135f36c303b..15bd91eca629895d3a99dde3fe6614036ca31dc9 100644 --- a/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh @@ -8,6 +8,27 @@ #include "pytorch_cuda_helper.hpp" #endif +template +__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1, + T& y1, T& x2, T& y2) { + x1 = bbox[base]; + y1 = bbox[base + 1]; + x2 = bbox[base + 2]; + y2 = bbox[base + 3]; +} + +template <> +__device__ __forceinline__ void load_bbox(const float* bbox, + const int base, float& x1, + float& y1, float& x2, + float& y2) { + const float4 bbox_offset = reinterpret_cast(bbox + base)[0]; + x1 = bbox_offset.x; + y1 = bbox_offset.y; + x2 = bbox_offset.z; + y2 = bbox_offset.w; +} + template __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, T* ious, const int num_bbox1, @@ -16,69 +37,111 @@ __global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2, const int offset) { if (aligned) { CUDA_1D_KERNEL_LOOP(index, num_bbox1) { - int b1 = index; - int b2 = index; - - int base1 = b1 * 4; - T b1_x1 = bbox1[base1]; - T b1_y1 = bbox1[base1 + 1]; - T b1_x2 = bbox1[base1 + 2]; - T b1_y2 = bbox1[base1 + 3]; - T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); - - int base2 = b2 * 4; - T b2_x1 = bbox2[base2]; - T b2_y1 = bbox2[base2 + 1]; - T b2_x2 = bbox2[base2 + 2]; - T b2_y2 = bbox2[base2 + 3]; - T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); - - T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); - T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); - T width = fmaxf(right - left + offset, 0.f); - T height = fmaxf(bottom - top + offset, 0.f); - T interS = width * height; - T baseS = 1.0; - if (mode == 0) { - baseS = fmaxf(b1_area + b2_area - interS, T(offset)); - } else if (mode == 1) { - baseS = fmaxf(b1_area, T(offset)); - } + const int b1 = index; + const int b2 = index; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * 
(b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } else { CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) { - int b1 = index / num_bbox2; - int b2 = index % num_bbox2; - - int base1 = b1 * 4; - T b1_x1 = bbox1[base1]; - T b1_y1 = bbox1[base1 + 1]; - T b1_x2 = bbox1[base1 + 2]; - T b1_y2 = bbox1[base1 + 3]; - T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); - - int base2 = b2 * 4; - T b2_x1 = bbox2[base2]; - T b2_y1 = bbox2[base2 + 1]; - T b2_x2 = bbox2[base2 + 2]; - T b2_y2 = bbox2[base2 + 3]; - T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); - - T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); - T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); - T width = fmaxf(right - left + offset, 0.f); - T height = fmaxf(bottom - top + offset, 0.f); - T interS = width * height; - T baseS = 1.0; - if (mode == 0) { - baseS = fmaxf(b1_area + b2_area - interS, T(offset)); - } else if (mode == 1) { - baseS = fmaxf(b1_area, T(offset)); - } + const int b1 = index / num_bbox2; + const int b2 = index % num_bbox2; + + const int base1 = b1 << 2; // b1 * 4 + T b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset); + + const int base2 = b2 << 2; // b2 * 4 + T b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset); + + const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2); + const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2); + const T width = fmaxf(right - left + offset, 0.f); + const T height = fmaxf(bottom - top + offset, 0.f); + const T interS = width * height; + + const T baseS = + fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset)); ious[index] = interS / baseS; } } } +#if __CUDA_ARCH__ >= 530 +__device__ __forceinline__ __half __half_area(const __half x1, const __half y1, + const __half x2, const __half y2, + const __half offset) { + const __half half_w = __hadd(__hsub(x2, x1), offset); + const __half half_h = __hadd(__hsub(y2, y1), offset); + return __hmul(half_w, half_h); +} + +__device__ __forceinline__ __half __half_max(const __half a, const __half b) { + return __hge(a, b) ? a : b; +} + +__device__ __forceinline__ __half __half_min(const __half a, const __half b) { + return __hle(a, b) ? a : b; +} + +// fp16 won't provide much increase when aligned==true. It is useful when +// aligned==false, which would give you ~40% bonus. +__device__ void bbox_overlaps_cuda_kernel_half( + const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1, + const int num_bbox2, const int mode, const bool aligned, const int offset) { + const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2; + const __half h_offset = __int2half_rn(offset); + CUDA_1D_KERNEL_LOOP(index, num_output) { + const int b1 = aligned ? index : index / num_bbox2; + const int b2 = aligned ? 
index : index % num_bbox2; + + const int base1 = b1 << 2; + __half b1_x1, b1_y1, b1_x2, b1_y2; + load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2); + const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset); + + const int base2 = b2 << 2; + __half b2_x1, b2_y1, b2_x2, b2_y2; + load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2); + const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset); + + const __half left = __half_max(b1_x1, b2_x1), + right = __half_min(b1_x2, b2_x2); + const __half top = __half_max(b1_y1, b2_y1), + bottom = __half_min(b1_y2, b2_y2); + const __half width = + __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f)); + const __half height = + __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f)); + const __half interS = __hmul(width, height); + + const __half baseS = __half_max( + mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area, + h_offset); + ious[index] = __hdiv(interS, baseS); + } +} +#endif // __CUDA_ARCH__ >= 530 + #endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh index 07beeda57f70389d067e16b549b1a6042780a624..e7fa990fea1849f626baa0b81a726564373216a8 100644 --- a/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh @@ -32,12 +32,12 @@ __device__ inline int Loc2Index(const int n, const int c, const int h, #ifndef HIP_DIFF /* TODO: move this to a common place */ template -__device__ inline scalar_t mmcv_min(scalar_t a, scalar_t b) { +__device__ inline scalar_t min(scalar_t a, scalar_t b) { return a < b ? a : b; } template -__device__ inline scalar_t mmcv_max(scalar_t a, scalar_t b) { +__device__ inline scalar_t max(scalar_t a, scalar_t b) { return a > b ? a : b; } #endif diff --git a/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..89feea4a546a5093967f26393ca6be3b9fe6ae05 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh @@ -0,0 +1,101 @@ +// Copyright (c) OpenMMLab. All rights reserved. 
+// Modified from +// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu +#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH +#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144 + +template +__global__ void chamfer_distance_forward_cuda_kernel(int b, int n, + const scalar_t* xyz, int m, + const scalar_t* xyz2, + scalar_t* result, + int* result_i) { + __shared__ scalar_t buf[MAX_SHARED_SCALAR_T]; + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) { + int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2; + for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) { + buf[j] = xyz2[(i * m + k2) * 2 + j]; + } + __syncthreads(); + for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { + scalar_t x1 = xyz[(i * n + j) * 2 + 0]; + scalar_t y1 = xyz[(i * n + j) * 2 + 1]; + int best_i = 0; + scalar_t best = 1e10; + int end_ka = end_k & (~2); + if (end_ka == THREADS_PER_BLOCK) { + for (int k = 0; k < THREADS_PER_BLOCK; k += 4) { +#pragma unroll + for (int j = 0; j < 4; ++j) { + scalar_t x2 = buf[(k + j) * 2] - x1; + scalar_t y2 = buf[(k + j) * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (d < best) { + best = d; + best_i = k + k2 + j; + } + } + } + } else { + for (int k = 0; k < end_ka; k += 4) { +#pragma unroll + for (int j = 0; j < 4; ++j) { + scalar_t x2 = buf[(k + j) * 2] - x1; + scalar_t y2 = buf[(k + j) * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (d < best) { + best = d; + best_i = k + k2 + j; + } + } + } + } + for (int k = end_ka; k < end_k; k++) { + scalar_t x2 = buf[k * 2 + 0] - x1; + scalar_t y2 = buf[k * 2 + 1] - y1; + scalar_t d = x2 * x2 + y2 * y2; + if (k == 0 || d < best) { + best = d; + best_i = k + k2; + } + } + if (k2 == 0 || result[(i * n + j)] > best) { + result[(i * n + j)] = best; + result_i[(i * n + j)] = best_i; + } + } + __syncthreads(); + } + } +} + +template +__global__ void chamfer_distance_backward_cuda_kernel( + int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2, + const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1, + scalar_t* grad_xyz2) { + for (int i = blockIdx.x; i < b; i += gridDim.x) { + for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) { + scalar_t x1 = xyz1[(i * n + j) * 2 + 0]; + scalar_t y1 = xyz1[(i * n + j) * 2 + 1]; + int j2 = idx1[i * n + j]; + scalar_t x2 = xyz2[(i * m + j2) * 2 + 0]; + scalar_t y2 = xyz2[(i * m + j2) * 2 + 1]; + scalar_t g = grad_dist1[i * n + j] * 2; + atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2)); + atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2)); + atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2))); + atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2))); + } + } +} +#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp index dc5df1730ee20f7f97c5cbf14c7f8da849820feb..b12aa9a26a2cc162fd89f68ccc97e17749090a41 100644 --- a/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp +++ b/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp @@ -7,12 +7,20 @@ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) -#define THREADS_PER_BLOCK 512 +#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \ + for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += 
blockDim.x * gridDim.x) \ + for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \ + j += blockDim.y * gridDim.y) + +#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \ + for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \ + for (size_t j = blockIdx.y; j < (m); j += gridDim.y) -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) +#define THREADS_PER_BLOCK 512 -inline int GET_BLOCKS(const int N) { - int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; +inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) { + int optimal_block_num = (N + num_threads - 1) / num_threads; int max_block_num = 4096; return min(optimal_block_num, max_block_num); } diff --git a/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2af96f7963ec347486ced942a5ef7cc4f187db8b --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh @@ -0,0 +1,831 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef CONVEX_IOU_CUDA_KERNEL_CUH +#define CONVEX_IOU_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAXN 100 +#define NMAX 512 +__device__ const double EPS = 1E-8; + +__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); } + +struct Point { + double x, y; + __device__ Point() {} + __device__ Point(double x, double y) : x(x), y(y) {} +}; + +__device__ inline bool point_same(Point& a, Point& b) { + return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0; +} + +__device__ inline void swap1(Point* a, Point* b) { + Point temp; + temp.x = a->x; + temp.y = a->y; + + a->x = b->x; + a->y = b->y; + + b->x = temp.x; + b->y = temp.y; +} + +__device__ inline void reverse1(Point* a, const int n) { + for (int i = 0; i < (n - 1) / 2.0; i++) { + Point* j = &(a[i]); + Point* k = &(a[n - 1 - i]); + swap1(j, k); + } +} + +__device__ inline double cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); +} + +__device__ inline double dis(Point a, Point b) { + return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); +} +__device__ inline double area(Point* ps, int n) { + ps[n] = ps[0]; + double res = 0; + for (int i = 0; i < n; i++) { + res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; + } + return res / 2.0; +} +__device__ inline double polygon_area_grad(Point* ps, int n, + int* polygon_to_pred_index, + int n_pred, double* grad_C) { + ps[n] = ps[0]; + double partion_grad[4 * 30 + 2]; + double res = 0; + for (int i = 0; i < n; i++) { + res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x; + partion_grad[i * 4 + 2] = ps[i + 1].y; + partion_grad[i * 4 + 3] = -ps[i + 1].x; + if (i != n - 1) { + partion_grad[i * 4 + 4] = -ps[i].y; + partion_grad[i * 4 + 5] = ps[i].x; + } else { + partion_grad[0] = -ps[i].y; + partion_grad[1] = ps[i].x; + } + } + for (int i = 0; i < n; i++) { + for (int j = 0; j < n_pred; j++) { + if (i == polygon_to_pred_index[j]) { + grad_C[2 * polygon_to_pred_index[j + n_pred]] = + (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2; + break; + } + } + for (int j = 0; j < n_pred; j++) { + if (i == polygon_to_pred_index[j]) { + grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] = + (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2; + break; + } + } + } + + return res / 2.0; +} + +__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p, + double* cut_grad, int m, 
int n, int i) { + double s1, s2; + double s2_s1_2; + double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd; + double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd; + s1 = cross(a, b, c); + s2 = cross(a, b, d); + + ds1_dxc = -(b.y - a.y); + ds1_dyc = b.x - a.x; + ds2_dxd = ds1_dxc; + ds2_dyd = ds1_dyc; + s2_s1_2 = (s2 - s1) * (s2 - s1); + + if (sig(s1) == 0 && sig(s2) == 0) return 2; + if (sig(s2 - s1) == 0) return 0; + + dxp_dxc = + ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) / + (s2_s1_2); + dxp_dyc = + ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) / + (s2_s1_2); + dxp_dxd = + ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) / + (s2_s1_2); + dxp_dyd = + ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) / + (s2_s1_2); + + dyp_dxc = + ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) / + (s2_s1_2); + dyp_dyc = + ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) / + (s2_s1_2); + dyp_dxd = + ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) / + (s2_s1_2); + dyp_dyd = + ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) / + (s2_s1_2); + + p.x = (c.x * s2 - d.x * s1) / (s2 - s1); + p.y = (c.y * s2 - d.y * s1) / (s2 - s1); + if (i == n - 1) { + cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; + cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; + cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; + cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; + cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd; + cut_grad[4 * n * m + 1] = dyp_dxd; + cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd; + cut_grad[4 * n * m + 3] = dyp_dyd; + } else { + cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc; + cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc; + cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc; + cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc; + cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd; + cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd; + cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd; + cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd; + } + + return 1; +} +__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b, + double* cut_grad) { + Point pp[MAXN]; + double ccur_grad[MAXN] = {}; + int m = 0; + p[n] = p[0]; + int k = n; + for (int i = 0; i < n; i++) { + if (sig(cross(a, b, p[i])) > 0) { + pp[m] = p[i]; + ccur_grad[4 * n * m + 4 * i] = 1.0; + ccur_grad[4 * n * m + 4 * i + 3] = 1.0; + m++; + } + if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { + lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i); + m++; + } + } + + n = 0; + for (int i = 0; i < m; i++) { + if (!i || !(point_same(pp[i], pp[i - 1]))) { + p[n] = pp[i]; + for (int j = 0; j < 4 * k; j++) { + cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j]; + } + n++; + } + } + + while (n > 1 && point_same(p[n - 1], p[0])) n--; +} + +__device__ inline double intersectArea(Point a, Point b, Point c, Point d, + double* grad_AB, int order, + int convex_n) { + Point o(0, 0); + int res_flag = 0; + int s1 = sig(cross(o, a, b)); + int s2 = sig(cross(o, c, d)); + if (s1 == 0 || s2 == 0) return 0.0; + if (s1 == -1) { + Point* i = &a; + Point* j = &b; + swap1(i, j); + res_flag = 1; + } + if (s2 == -1) { + Point* i = &c; + Point* j = &d; + swap1(i, j); + } + Point p[10] = {o, a, b}; + int n = 3, n0 = 3, n1, n2, n3; + double cut_grad1[MAXN] = {}; + double cut_grad2[MAXN] = {}; + double cut_grad3[MAXN] = {}; + double 
p1_p_grad[10][10] = {}; + double p2_p1_grad[10][10] = {}; + double p3_p2_grad[10][10] = {}; + + double p3_p1_grad[10][10] = {}; + double p3_p_grad[10][10] = {}; + + // 1 + polygon_cut(p, n, o, c, cut_grad1); + n1 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n0; j++) { + if (!(j % 2)) { + p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j]; + } else { + p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j]; + } + } + } + + // 2 + polygon_cut(p, n, c, d, cut_grad2); + n2 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n1; j++) { + if (!(j % 2)) { + p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j]; + } else { + p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j]; + } + } + } + // 3 + polygon_cut(p, n, d, o, cut_grad3); + n3 = n; + for (int i = 0; i < n; i++) { + for (int j = 0; j < 4 * n2; j++) { + if (!(j % 2)) { + p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j]; + } else { + p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j]; + } + } + } + + // mul + // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1) + for (int i = 0; i < 2 * n3; i++) { + for (int j = 0; j < 2 * n1; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n2; m++) { + sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j]; + } + p3_p1_grad[i][j] = sum; + } + } + + // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0) + for (int i = 0; i < 2 * n3; i++) { + for (int j = 0; j < 2 * n0; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n1; m++) { + sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j]; + } + p3_p_grad[i][j] = sum; + } + } + + // calculate S_grad + int polygon_index_box_index[20]; + double grad_polygon[20]; + double S_grad[6]; + + for (int i = 0; i < n3; i++) { + polygon_index_box_index[i] = i; + polygon_index_box_index[i + n3] = i; + } + + double res = + polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon); + + if (s1 * s2 == -1) { + for (int j = 0; j < 2 * 3; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n3; m++) { + sum = sum - grad_polygon[m] * p3_p_grad[m][j]; + } + S_grad[j] = sum; + } + + if (order != convex_n - 1) { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[2 * order + 2] += S_grad[2]; + grad_AB[2 * order + 3] += S_grad[3]; + + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[2 * order + 2] += S_grad[4]; + grad_AB[2 * order + 3] += S_grad[5]; + } + } else { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[0] += S_grad[2]; + grad_AB[1] += S_grad[3]; + + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[0] += S_grad[4]; + grad_AB[1] += S_grad[5]; + } + } + res = -res; + } else { + for (int j = 0; j < 2 * 3; j++) { + double sum = 0.0; + for (int m = 0; m < 2 * n3; m++) { + sum = sum + grad_polygon[m] * p3_p_grad[m][j]; + } + S_grad[j] = sum; + } + + if (order != convex_n - 1) { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[2 * order + 2] += S_grad[2]; + grad_AB[2 * order + 3] += S_grad[3]; + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + grad_AB[2 * order + 2] += S_grad[4]; + grad_AB[2 * order + 3] += S_grad[5]; + } + } else { + if (res_flag) { + grad_AB[2 * order] += S_grad[4]; + grad_AB[2 * order + 1] += S_grad[5]; + grad_AB[0] += S_grad[2]; + grad_AB[1] += S_grad[3]; + } else { + grad_AB[2 * order] += S_grad[2]; + grad_AB[2 * order + 1] += S_grad[3]; + 
grad_AB[0] += S_grad[4]; + grad_AB[1] += S_grad[5]; + } + } + } + return res; +} + +__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2, + double* grad_AB) { + if (area(ps1, n1) < 0) reverse1(ps1, n1); + if (area(ps2, n2) < 0) reverse1(ps2, n2); + ps1[n1] = ps1[0]; + ps2[n2] = ps2[0]; + double res = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + res += + intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1); + } + } + return res; +} + +__device__ inline void Jarvis(Point* in_poly, int& n_poly) { + Point p_max, p_k; + int max_index, k_index; + int Stack[NMAX] = {}, top1, top2; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point* j = &(in_poly[0]); + Point* k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]]; + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]]; + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; +} + +__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2, + int n2, double* grad_C) { + Point polygon[MAXN]; + int n = n1 + n2, n_poly = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n - n1; j++) { + if (point_same(ps1[i], ps2[j])) { + for (int k = j; k < n - n1 - 1; k++) { + ps2[k] = ps2[k + 1]; + } + n2--; + break; + } + } + } + n_poly = n1 + n2; + for (int i = 0; i < n_poly; i++) { + if (i < n1) { + polygon[i] = ps1[i]; + } else { + polygon[i] = ps2[i - n1]; + } + } + + Jarvis(polygon, n_poly); + + int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1}; + int n_pred = 0; + for (int i = 0; i < n_poly; i++) { + for (int j = 0; j < n1; j++) { + if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) { + polygon_to_pred_index[n_pred] = i; + polygon_to_pred_index[n_pred + n1] = j; + n_pred += 1; + break; + } + } + } + if (n_pred == 0) { + double polygon_area = fabs(area(polygon, n_poly)); + for (int i = 0; i < 18; i++) { + grad_C[i] = 0.0; + } + return polygon_area; + } else { + double polygon_area = + polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C); + if (polygon_area < 0) { + for (int i = 0; i < 18; i++) { 
+ grad_C[i] = -grad_C[i]; + } + } + return fabs(polygon_area); + } +} + +// convex_find and get the polygon_index_box_index +__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly, + int* points_to_convex_ind) { + int n_input = n_poly; + Point input_poly[20]; + for (int i = 0; i < n_input; i++) { + input_poly[i].x = in_poly[i].x; + input_poly[i].y = in_poly[i].y; + } + Point p_max, p_k; + int max_index, k_index; + int Stack[20], top1, top2; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point* j = &(in_poly[0]); + Point* k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + for (int i = 0; i <= top1; i++) { + right_point[i] = in_poly[Stack[i]]; + } + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + + for (int i = top2 - 1; i >= 0; i--) { + left_point[i] = in_poly[Stack[i]]; + } + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; + for (int i = 0; i < n_poly; i++) { + for (int j = 0; j < n_input; j++) { + if (point_same(in_poly[i], input_poly[j])) { + points_to_convex_ind[i] = j; + break; + } + } + } +} + +template +__device__ inline float devrIoU(T const* const p, T const* const q, + T* point_grad, const int idx) { + Point ps1[MAXN], ps2[MAXN]; + + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = (double)p[i * 2]; + convex[i].y = (double)p[i * 2 + 1]; + } + int n_convex = 9; + int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; + Jarvis_and_index(convex, n_convex, points_to_convex_ind); + + int n1 = n_convex; + int n2 = 4; + + for (int i = 0; i < n1; i++) { + ps1[i].x = (double)convex[i].x; + ps1[i].y = (double)convex[i].y; + } + + for (int i = 0; i < n2; i++) { + ps2[i].x = (double)q[i * 2]; + ps2[i].y = (double)q[i * 2 + 1]; + } + + int polygon_index_box_index[18]; + for (int i = 0; i < n1; i++) { + polygon_index_box_index[i] = i; + polygon_index_box_index[i + n1] = i; + } + + double grad_A[18] = {}; + double grad_AB[18] = {}; + double grad_C[18] = {}; + + double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB); + double S_pred = + polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A); + if (S_pred < 0) { + for (int i = 0; i < n_convex * 2; i++) { + grad_A[i] = -grad_A[i]; + } + } + double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; + + 
double iou = inter_area / union_area; + double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C); + + // printf("%d:live\n", idx); + double rot_giou = iou - (polygon_area - union_area) / polygon_area; + + float grad_point_temp[18] = {}; + + for (int i = 0; i < n_convex; i++) { + int grad_point = points_to_convex_ind[i]; + grad_point_temp[2 * grad_point] = + (float)((union_area + inter_area) / (union_area * union_area) * + grad_AB[2 * i] - + iou / union_area * grad_A[2 * i] - + 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) - + (union_area) / polygon_area / polygon_area * grad_C[2 * i]); + grad_point_temp[2 * grad_point + 1] = + (float)((union_area + inter_area) / (union_area * union_area) * + grad_AB[2 * i + 1] - + iou / union_area * grad_A[2 * i + 1] - + 1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) - + (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]); + } + + for (int i = 0; i < 9; i++) { + point_grad[2 * i] = grad_point_temp[2 * i]; + point_grad[2 * i + 1] = grad_point_temp[2 * i + 1]; + } + return (float)rot_giou; +} + +template +__global__ void convex_giou_cuda_kernel(const int ex_n_boxes, + const int gt_n_boxes, const T* ex_boxes, + const T* gt_boxes, T* point_grad) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T* cur_box = ex_boxes + index * 18; + const T* cur_gt_box = gt_boxes + index * 8; + T* cur_grad = point_grad + index * 19; + T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x); + cur_grad[18] = giou; + } +} + +__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) { + double s1, s2; + s1 = cross(a, b, c); + s2 = cross(a, b, d); + if (sig(s1) == 0 && sig(s2) == 0) return 2; + if (sig(s2 - s1) == 0) return 0; + p.x = (c.x * s2 - d.x * s1) / (s2 - s1); + p.y = (c.y * s2 - d.y * s1) / (s2 - s1); + return 1; +} + +__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) { + Point pp[MAXN]; + int m = 0; + p[n] = p[0]; + for (int i = 0; i < n; i++) { + if (sig(cross(a, b, p[i])) > 0) { + pp[m] = p[i]; + m++; + } + if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) { + lineCross(a, b, p[i], p[i + 1], pp[m]); + m++; + } + } + n = 0; + for (int i = 0; i < m; i++) { + if (!i || !(point_same(pp[i], pp[i - 1]))) { + p[n] = pp[i]; + n++; + } + } + + while (n > 1 && point_same(p[n - 1], p[0])) n--; +} + +__device__ inline double intersectArea(Point a, Point b, Point c, Point d) { + Point o(0, 0); + int s1 = sig(cross(o, a, b)); + int s2 = sig(cross(o, c, d)); + if (s1 == 0 || s2 == 0) return 0.0; + if (s1 == -1) { + Point* i = &a; + Point* j = &b; + swap1(i, j); + } + if (s2 == -1) { + Point* i = &c; + Point* j = &d; + swap1(i, j); + } + Point p[10] = {o, a, b}; + int n = 3; + + polygon_cut(p, n, o, c); + polygon_cut(p, n, c, d); + polygon_cut(p, n, d, o); + double res = area(p, n); + if (s1 * s2 == -1) res = -res; + return res; +} +__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, + int n2) { + if (area(ps1, n1) < 0) reverse1(ps1, n1); + if (area(ps2, n2) < 0) reverse1(ps2, n2); + ps1[n1] = ps1[0]; + ps2[n2] = ps2[0]; + double res = 0; + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]); + } + } + return res; +} + +template +__device__ inline float devrIoU(T const* const p, T const* const q) { + Point ps1[MAXN], ps2[MAXN]; + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = (double)p[i * 2]; + convex[i].y = (double)p[i * 2 + 1]; + } + int n_convex = 9; + int 
points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1}; + Jarvis_and_index(convex, n_convex, points_to_convex_ind); + int n1 = n_convex; + for (int i = 0; i < n1; i++) { + ps1[i].x = (double)convex[i].x; + ps1[i].y = (double)convex[i].y; + } + int n2 = 4; + for (int i = 0; i < n2; i++) { + ps2[i].x = (double)q[i * 2]; + ps2[i].y = (double)q[i * 2 + 1]; + } + double inter_area = intersectAreaO(ps1, n1, ps2, n2); + double S_pred = area(ps1, n1); + double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area; + double iou = inter_area / union_area; + return (float)iou; +} + +template +__global__ void convex_iou_cuda_kernel(const int ex_n_boxes, + const int gt_n_boxes, const T* ex_boxes, + const T* gt_boxes, T* iou) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T* cur_box = ex_boxes + index * 18; + for (int i = 0; i < gt_n_boxes; i++) { + iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8); + } + } +} +#endif // CONVEX_IOU_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh index 75ea4add72f597c88c8cdf511a7d2fd04727735b..2f7f112989127da235cb35476e15b206d4c2e3d4 100644 --- a/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh +++ b/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh @@ -29,8 +29,8 @@ using namespace torch; #define TensorAcc5R PackedTensorAccessor32 #define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W) -#define THREADS_FORWARD 32 -#define THREADS_BACKWARD 16 +#define WARP_SIZE 32 +#define FULL_MASK 0xffffffff template __global__ void correlation_forward_cuda_kernel( @@ -42,8 +42,8 @@ __global__ void correlation_forward_cuda_kernel( const int C = rInput1.size(3); const int n = blockIdx.x; - const int h = blockIdx.y; - const int w = blockIdx.z; + const int h = blockIdx.y * blockDim.y + threadIdx.y; + const int w = blockIdx.z * blockDim.z + threadIdx.z; const int thread = threadIdx.x; const int start_i = -padH + h * dH; @@ -52,13 +52,11 @@ __global__ void correlation_forward_cuda_kernel( const int patchRadH = dilation_patchH * (patchH - 1) / 2; const int patchRadW = dilation_patchW * (patchW - 1) / 2; - __shared__ scalar_t prod_sum[THREADS_FORWARD]; - for (int ph = 0; ph < patchH; ++ph) { int ph_dilated = ph * dilation_patchH - patchRadH; for (int pw = 0; pw < patchW; ++pw) { int pw_dilated = pw * dilation_patchW - patchRadW; - prod_sum[thread] = 0; + scalar_t prod_sum = 0.0f; for (int i = 0; i < kH; ++i) { int i1 = start_i + i * dilationH; int i2 = i1 + ph_dilated; @@ -69,23 +67,20 @@ __global__ void correlation_forward_cuda_kernel( int j2 = j1 + pw_dilated; if WITHIN_BOUNDS(j1, j2, iW, iW) { - for (int c = thread; c < C; c += THREADS_FORWARD) { + for (int c = thread; c < C; c += WARP_SIZE) { scalar_t v1 = rInput1[n][i1][j1][c]; scalar_t v2 = rInput2[n][i2][j2][c]; - prod_sum[thread] += v1 * v2; + prod_sum += v1 * v2; } } } } } // accumulate - __syncthreads(); + for (int offset = 16; offset > 0; offset /= 2) + prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset); if (thread == 0) { - scalar_t reduce_sum = 0; - for (int index = 0; index < THREADS_FORWARD; ++index) { - reduce_sum += prod_sum[index]; - } - output[n][ph][pw][h][w] = reduce_sum; + output[n][ph][pw][h][w] = prod_sum; } } } @@ -97,9 +92,10 @@ __global__ void correlation_backward_cuda_kernel_input1( TensorAcc4R grad_input1, const int kH, const int kW, const int patchH, const int patchW, const int padH, const int padW, const int dilationH, const int dilationW, const int dilation_patchH, const int 
dilation_patchW, - const int dH, const int dW, const int batch) { - const int iH = input2.size(2); - const int iW = input2.size(3); + const int dH, const int dW) { + const int iH = input2.size(1); + const int iW = input2.size(2); + const int C = input2.size(3); const int H = grad_output.size(3); const int W = grad_output.size(4); @@ -107,54 +103,53 @@ __global__ void correlation_backward_cuda_kernel_input1( const int patchRadH = (patchH - 1) / 2; const int patchRadW = (patchW - 1) / 2; - const int n = batch; - const int c = blockIdx.x; + const int n = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; - const int ph_off = threadIdx.x; - const int pw_off = threadIdx.y; const int h_2 = h + padH; const int w_2 = w + padW; const int min_h = h_2 - kH * dilationH; const int min_w = w_2 - kW * dilationW; - __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; - prod_sum[ph_off][pw_off] = 0; - - for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { + extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; + scalar_t *grad_cache = reinterpret_cast(grad_cache_char); + for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { + const int ph = i / patchW; + const int pw = i % patchW; int i1 = h + dilation_patchH * (ph - patchRadH); - for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) { - int j1 = w + dilation_patchW * (pw - patchRadW); - if (WITHIN_BOUNDS(i1, j1, iH, iW)) { - scalar_t val = input2[n][c][i1][j1]; - for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { - int i2 = (h_3) / dH; - if (i2 * dH != h_3) continue; - for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { - int j2 = (w_3) / dW; - if (j2 * dW != w_3) continue; - if - WITHIN_BOUNDS(i2, j2, H, W) { - prod_sum[ph_off][pw_off] += - grad_output[n][ph][pw][i2][j2] * val; - } + int j1 = w + dilation_patchW * (pw - patchRadW); + + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t grad_val = 0.0f; + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if (WITHIN_BOUNDS(i2, j2, H, W)) { + grad_val += grad_output[n][ph][pw][i2][j2]; } } } + grad_cache[i] = grad_val; } } - __syncthreads(); - if (ph_off == 0 && pw_off == 0) { - scalar_t reduce_sum = 0; - for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { - for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { - reduce_sum += prod_sum[ph][pw]; + for (int c = threadIdx.x; c < C; c += blockDim.x) { + scalar_t grad_input_val = 0.0f; + for (int ph = 0; ph < patchH; ++ph) { + int i1 = h + dilation_patchH * (ph - patchRadH); + for (int pw = 0; pw < patchW; ++pw) { + int j1 = w + dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw]; + } } } - grad_input1[n][c][h][w] = reduce_sum; + grad_input1[n][c][h][w] = grad_input_val; } } @@ -163,9 +158,10 @@ __global__ void correlation_backward_cuda_kernel_input2( const TensorAcc5R grad_output, const TensorAcc4R input1, TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH, int dilationW, int dilation_patchH, - int dilation_patchW, int dH, int dW, int batch) { - const int iH = input1.size(2); - const int iW = input1.size(3); + int dilation_patchW, int dH, int dW) { + const int iH = input1.size(1); + const int iW = input1.size(2); + const int C = input1.size(3); const int patchRadH = (patchH - 1) / 2; const int 
patchRadW = (patchW - 1) / 2; @@ -176,56 +172,54 @@ __global__ void correlation_backward_cuda_kernel_input2( const int dilatedKH = kH * dilationH; const int dilatedKW = kW * dilationW; - const int n = batch; - const int c = blockIdx.x; + const int n = blockIdx.x; const int h = blockIdx.y; const int w = blockIdx.z; - const int ph_off = threadIdx.x; - const int pw_off = threadIdx.y; - - __shared__ scalar_t prod_sum[THREADS_BACKWARD][THREADS_BACKWARD]; - prod_sum[ph_off][pw_off] = 0; - for (int ph = ph_off; ph < patchH; ph += THREADS_BACKWARD) { + extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[]; + scalar_t *grad_cache = reinterpret_cast(grad_cache_char); + for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) { + const int ph = i / patchW; + const int pw = i % patchW; int i1 = h - dilation_patchH * (ph - patchRadH); - for (int pw = pw_off; pw < patchW; pw += THREADS_BACKWARD) { - int j1 = w - dilation_patchW * (pw - patchRadW); - if - WITHIN_BOUNDS(i1, j1, iH, iW) { - scalar_t val = input1[n][c][i1][j1]; - - const int h_2 = i1 + padH; - const int w_2 = j1 + padW; - const int min_h = h_2 - dilatedKH; - const int min_w = w_2 - dilatedKW; - - for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { - int i2 = (h_3) / dH; - if (i2 * dH != h_3) continue; - for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { - int j2 = (w_3) / dW; - if (j2 * dW != w_3) continue; - if - WITHIN_BOUNDS(i2, j2, H, W) { - prod_sum[ph_off][pw_off] += - grad_output[n][ph][pw][i2][j2] * val; - } - } + int j1 = w - dilation_patchW * (pw - patchRadW); + + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + scalar_t grad_val = 0.0f; + + const int h_2 = i1 + padH; + const int w_2 = j1 + padW; + const int min_h = h_2 - dilatedKH; + const int min_w = w_2 - dilatedKW; + + for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) { + int i2 = (h_3) / dH; + if (i2 * dH != h_3) continue; + for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) { + int j2 = (w_3) / dW; + if (j2 * dW != w_3) continue; + if (WITHIN_BOUNDS(i2, j2, H, W)) { + grad_val += grad_output[n][ph][pw][i2][j2]; } } + } + grad_cache[i] = grad_val; } } - __syncthreads(); - if (ph_off == 0 && pw_off == 0) { - scalar_t reduce_sum = 0; - for (int ph = 0; ph < THREADS_BACKWARD; ++ph) { - for (int pw = 0; pw < THREADS_BACKWARD; ++pw) { - reduce_sum += prod_sum[ph][pw]; + for (int c = threadIdx.x; c < C; c += blockDim.x) { + scalar_t grad_input_val = 0.0f; + for (int ph = 0; ph < patchH; ++ph) { + int i1 = h - dilation_patchH * (ph - patchRadH); + for (int pw = 0; pw < patchW; ++pw) { + int j1 = w - dilation_patchW * (pw - patchRadW); + if (WITHIN_BOUNDS(i1, j1, iH, iW)) { + grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw]; + } } } - grad_input2[n][c][h][w] = reduce_sum; + grad_input2[n][c][h][w] = grad_input_val; } } #endif diff --git a/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..3ee1814e12d185a08640f9768d6c87b5eb3428e5 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/diff_iou_rotated_cuda_kernel.cuh @@ -0,0 +1,136 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +// Adapted from +// https://github.com/lilanxiao/Rotated_IoU/cuda_op/sort_vert_kernel.cu # noqa +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAX_NUM_VERT_IDX 9 +#define INTERSECTION_OFFSET 8 +#define EPSILON 1e-8 + +inline int opt_n_thread(int work_size) { + const int pow_2 = std::log(static_cast(work_size)) / std::log(2.0); + return max(min(1 << pow_2, THREADS_PER_BLOCK), 1); +} + +/* +compare normalized vertices (vertices around (0,0)) +if vertex1 < vertex2 return true. +order: minimum at x-aixs, become larger in anti-clockwise direction +*/ +__device__ bool compare_vertices(float x1, float y1, float x2, float y2) { + if (fabs(x1 - x2) < EPSILON && fabs(y2 - y1) < EPSILON) + return false; // if equal, return false + + if (y1 > 0 && y2 < 0) return true; + if (y1 < 0 && y2 > 0) return false; + + float n1 = x1 * x1 + y1 * y1 + EPSILON; + float n2 = x2 * x2 + y2 * y2 + EPSILON; + float diff = fabs(x1) * x1 / n1 - fabs(x2) * x2 / n2; + + if (y1 > 0 && y2 > 0) { + if (diff > EPSILON) + return true; + else + return false; + } + if (y1 < 0 && y2 < 0) { + if (diff < EPSILON) + return true; + else + return false; + } +} + +__global__ void diff_iou_rotated_sort_vertices_forward_cuda_kernel( + int b, int n, int m, const float *__restrict__ vertices, + const bool *__restrict__ mask, const int *__restrict__ num_valid, + int *__restrict__ idx) { + int batch_idx = blockIdx.x; + vertices += batch_idx * n * m * 2; + mask += batch_idx * n * m; + num_valid += batch_idx * n; + idx += batch_idx * n * MAX_NUM_VERT_IDX; + + int index = threadIdx.x; // index of polygon + int stride = blockDim.x; + for (int i = index; i < n; i += stride) { + int pad; // index of arbitrary invalid intersection point (not box corner!) + for (int j = INTERSECTION_OFFSET; j < m; ++j) { + if (!mask[i * m + j]) { + pad = j; + break; + } + } + if (num_valid[i] < 3) { + // not enough vertices, take an invalid intersection point + // (zero padding) + for (int j = 0; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } else { + // sort the valid vertices + // note the number of valid vertices is known + // note: check that num_valid[i] < MAX_NUM_VERT_IDX + for (int j = 0; j < num_valid[i]; ++j) { + // initialize with a "big" value + float x_min = 1; + float y_min = -EPSILON; + int i_take = 0; + int i2; + float x2, y2; + if (j != 0) { + i2 = idx[i * MAX_NUM_VERT_IDX + j - 1]; + x2 = vertices[i * m * 2 + i2 * 2 + 0]; + y2 = vertices[i * m * 2 + i2 * 2 + 1]; + } + for (int k = 0; k < m; ++k) { + float x = vertices[i * m * 2 + k * 2 + 0]; + float y = vertices[i * m * 2 + k * 2 + 1]; + if (mask[i * m + k] && compare_vertices(x, y, x_min, y_min)) { + if ((j == 0) || (j != 0 && compare_vertices(x2, y2, x, y))) { + x_min = x; + y_min = y; + i_take = k; + } + } + } + idx[i * MAX_NUM_VERT_IDX + j] = i_take; + } + // duplicate the first idx + idx[i * MAX_NUM_VERT_IDX + num_valid[i]] = idx[i * MAX_NUM_VERT_IDX + 0]; + + // pad zeros + for (int j = num_valid[i] + 1; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + + // for corner case: the two boxes are exactly the same. 
+ // in this case, idx would have duplicate elements, which makes the + // shoelace formula broken because of the definition, the duplicate + // elements only appear in the first 8 positions (they are "corners in + // box", not "intersection of edges") + if (num_valid[i] == 8) { + int counter = 0; + for (int j = 0; j < 4; ++j) { + int check = idx[i * MAX_NUM_VERT_IDX + j]; + for (int k = 4; k < INTERSECTION_OFFSET; ++k) { + if (idx[i * MAX_NUM_VERT_IDX + k] == check) counter++; + } + } + if (counter == 4) { + idx[i * MAX_NUM_VERT_IDX + 4] = idx[i * MAX_NUM_VERT_IDX + 0]; + for (int j = 5; j < MAX_NUM_VERT_IDX; ++j) { + idx[i * MAX_NUM_VERT_IDX + j] = pad; + } + } + } + + // TODO: still might need to cover some other corner cases :( + } + } +} diff --git a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh index c8fc61546acbce55c59abe8371590bba2e610442..6d932434cba245833e661b8c7e140601940bc35b 100644 --- a/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/gather_points_cuda_kernel.cuh @@ -22,13 +22,14 @@ __global__ void gather_points_forward_cuda_kernel(int b, int c, int n, int m, int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; - - out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - points += bs_idx * c * n + c_idx * n; - out[0] = points[idx[0]]; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b || c_idx >= c) return; + + out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + points += bs_idx * c * n + c_idx * n; + out[0] = points[idx[0]]; + } } template @@ -43,14 +44,15 @@ __global__ void gather_points_backward_cuda_kernel(int b, int c, int n, int m, int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || c_idx >= c || pt_idx >= m) return; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b || c_idx >= c) return; - grad_out += bs_idx * c * m + c_idx * m + pt_idx; - idx += bs_idx * m + pt_idx; - grad_points += bs_idx * c * n + c_idx * n; + grad_out += bs_idx * c * m + c_idx * m + pt_idx; + idx += bs_idx * m + pt_idx; + grad_points += bs_idx * c * n + c_idx * n; - atomicAdd(grad_points + idx[0], grad_out[0]); + atomicAdd(grad_points + idx[0], grad_out[0]); + } } #endif // GATHER_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh index 9cfc2dc865152769d55d4062b7f6bad25e9c70e8..dfad66fc16d8759f614d7f36fa961673976b1d95 100644 --- a/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/group_points_cuda_kernel.cuh @@ -22,18 +22,19 @@ __global__ void group_points_forward_cuda_kernel(int b, int c, int n, // out: (B, C, npoints, nsample) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int pt_idx = index / nsample; - if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { + if (bs_idx >= b || c_idx >= c) return; - int sample_idx = index % nsample; + int pt_idx = index / nsample; + int sample_idx = index % nsample; - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - int in_idx = bs_idx * c * n + c_idx * n + idx[0]; - int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + - pt_idx * nsample + sample_idx; 
+ idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int in_idx = bs_idx * c * n + c_idx * n + idx[0]; + int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; - out[out_idx] = points[in_idx]; + out[out_idx] = points[in_idx]; + } } template @@ -48,16 +49,17 @@ __global__ void group_points_backward_cuda_kernel(int b, int c, int n, // grad_points: (B, C, N) int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int pt_idx = index / nsample; - if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return; + CUDA_1D_KERNEL_LOOP(index, npoints * nsample) { + int pt_idx = index / nsample; + if (bs_idx >= b || c_idx >= c) return; - int sample_idx = index % nsample; - grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + - pt_idx * nsample + sample_idx; - idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; + int sample_idx = index % nsample; + grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample + + pt_idx * nsample + sample_idx; + idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx; - atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); + atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]); + } } #endif // GROUP_POINTS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh index 4e261cbd0cf1d69973eab34f32ab2a334d6a13a6..9ebdcad15eee05a9f412ef34eb12d3553874a4dc 100644 --- a/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/iou3d_cuda_kernel.cuh @@ -50,21 +50,17 @@ __device__ int check_rect_cross(const Point &p1, const Point &p2, } __device__ inline int check_in_box2d(const float *box, const Point &p) { - // params: box (5) [x1, y1, x2, y2, angle] - const float MARGIN = 1e-5; - - float center_x = (box[0] + box[2]) / 2; - float center_y = (box[1] + box[3]) / 2; - float angle_cos = cos(-box[4]), - angle_sin = - sin(-box[4]); // rotate the point in the opposite direction of box - float rot_x = - (p.x - center_x) * angle_cos - (p.y - center_y) * angle_sin + center_x; - float rot_y = - (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos + center_y; - - return (rot_x > box[0] - MARGIN && rot_x < box[2] + MARGIN && - rot_y > box[1] - MARGIN && rot_y < box[3] + MARGIN); + // params: box (7) [x, y, z, dx, dy, dz, heading] + const float MARGIN = 1e-2; + + float center_x = box[0], center_y = box[1]; + // rotate the point in the opposite direction of box + float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]); + float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin); + float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos; + + return (fabs(rot_x) < box[3] / 2 + MARGIN && + fabs(rot_y) < box[4] / 2 + MARGIN); } __device__ inline int intersection(const Point &p1, const Point &p0, @@ -116,16 +112,19 @@ __device__ inline int point_cmp(const Point &a, const Point &b, } __device__ inline float box_overlap(const float *box_a, const float *box_b) { - // params: box_a (5) [x1, y1, x2, y2, angle] - // params: box_b (5) [x1, y1, x2, y2, angle] + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] - float a_x1 = box_a[0], a_y1 = box_a[1], a_x2 = box_a[2], a_y2 = box_a[3], - a_angle = box_a[4]; - float b_x1 = box_b[0], b_y1 = box_b[1], b_x2 = box_b[2], b_y2 = box_b[3], - b_angle = box_b[4]; + float 
a_angle = box_a[6], b_angle = box_b[6]; + float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, + a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2; + float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half; + float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half; + float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half; + float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half; - Point center_a((a_x1 + a_x2) / 2, (a_y1 + a_y2) / 2); - Point center_b((b_x1 + b_x2) / 2, (b_y1 + b_y2) / 2); + Point center_a(box_a[0], box_a[1]); + Point center_b(box_b[0], box_b[1]); Point box_a_corners[5]; box_a_corners[0].set(a_x1, a_y1); @@ -209,10 +208,10 @@ __device__ inline float box_overlap(const float *box_a, const float *box_b) { } __device__ inline float iou_bev(const float *box_a, const float *box_b) { - // params: box_a (5) [x1, y1, x2, y2, angle] - // params: box_b (5) [x1, y1, x2, y2, angle] - float sa = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]); - float sb = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]); + // params box_a: [x, y, z, dx, dy, dz, heading] + // params box_b: [x, y, z, dx, dy, dz, heading] + float sa = box_a[3] * box_a[4]; + float sb = box_b[3] * box_b[4]; float s_overlap = box_overlap(box_a, box_b); return s_overlap / fmaxf(sa + sb - s_overlap, EPS); } @@ -220,149 +219,148 @@ __device__ inline float iou_bev(const float *box_a, const float *box_b) { __global__ void iou3d_boxes_overlap_bev_forward_cuda_kernel( const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap) { - const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; - const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; - - if (a_idx >= num_a || b_idx >= num_b) { - return; - } - const float *cur_box_a = boxes_a + a_idx * 5; - const float *cur_box_b = boxes_b + b_idx * 5; - float s_overlap = box_overlap(cur_box_a, cur_box_b); - ans_overlap[a_idx * num_b + b_idx] = s_overlap; -} - -__global__ void iou3d_boxes_iou_bev_forward_cuda_kernel(const int num_a, - const float *boxes_a, - const int num_b, - const float *boxes_b, - float *ans_iou) { - const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y; - const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x; + // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading] + // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading] + CUDA_2D_KERNEL_LOOP(b_idx, num_b, a_idx, num_a) { + if (a_idx >= num_a || b_idx >= num_b) { + return; + } - if (a_idx >= num_a || b_idx >= num_b) { - return; + const float *cur_box_a = boxes_a + a_idx * 7; + const float *cur_box_b = boxes_b + b_idx * 7; + float cur_overlap = box_overlap(cur_box_a, cur_box_b); + ans_overlap[a_idx * num_b + b_idx] = cur_overlap; } - - const float *cur_box_a = boxes_a + a_idx * 5; - const float *cur_box_b = boxes_b + b_idx * 5; - float cur_iou_bev = iou_bev(cur_box_a, cur_box_b); - ans_iou[a_idx * num_b + b_idx] = cur_iou_bev; } -__global__ void nms_forward_cuda_kernel(const int boxes_num, - const float nms_overlap_thresh, - const float *boxes, - unsigned long long *mask) { - // params: boxes (N, 5) [x1, y1, x2, y2, ry] +__global__ void iou3d_nms3d_forward_cuda_kernel(const int boxes_num, + const float nms_overlap_thresh, + const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, 
blocks) { + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - - __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; - - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; - block_boxes[threadIdx.x * 5 + 1] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; - block_boxes[threadIdx.x * 5 + 2] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; - block_boxes[threadIdx.x * 5 + 3] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; - block_boxes[threadIdx.x * 5 + 4] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; - const float *cur_box = boxes + cur_box_idx * 5; + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (iou_bev(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { - t |= 1ULL << i; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; } + for (i = start; i < col_size; i++) { + if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; } - const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); - mask[cur_box_idx * col_blocks + col_start] = t; } } __device__ inline float iou_normal(float const *const a, float const *const b) { - float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); - float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); + // params: a: [x, y, z, dx, dy, dz, heading] + // params: b: [x, y, z, dx, dy, dz, heading] + + float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), + right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 
2); + float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), + bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2); float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f); float interS = width * height; - float Sa = (a[2] - a[0]) * (a[3] - a[1]); - float Sb = (b[2] - b[0]) * (b[3] - b[1]); + float Sa = a[3] * a[4]; + float Sb = b[3] * b[4]; return interS / fmaxf(Sa + Sb - interS, EPS); } -__global__ void nms_normal_forward_cuda_kernel(const int boxes_num, - const float nms_overlap_thresh, - const float *boxes, - unsigned long long *mask) { - // params: boxes (N, 5) [x1, y1, x2, y2, ry] +__global__ void iou3d_nms3d_normal_forward_cuda_kernel( + const int boxes_num, const float nms_overlap_thresh, const float *boxes, + unsigned long long *mask) { + // params: boxes (N, 7) [x, y, z, dx, dy, dz, heading] // params: mask (N, N/THREADS_PER_BLOCK_NMS) - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, - THREADS_PER_BLOCK_NMS); - - __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 5]; - - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 0]; - block_boxes[threadIdx.x * 5 + 1] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 1]; - block_boxes[threadIdx.x * 5 + 2] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 2]; - block_boxes[threadIdx.x * 5 + 3] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 3]; - block_boxes[threadIdx.x * 5 + 4] = - boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 5 + 4]; - } - __syncthreads(); + const int blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + // if (row_start > col_start) return; + + const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, + THREADS_PER_BLOCK_NMS); + + __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 7 + 0] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0]; + block_boxes[threadIdx.x * 7 + 1] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1]; + block_boxes[threadIdx.x * 7 + 2] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2]; + block_boxes[threadIdx.x * 7 + 3] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3]; + block_boxes[threadIdx.x * 7 + 4] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4]; + block_boxes[threadIdx.x * 7 + 5] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5]; + block_boxes[threadIdx.x * 7 + 6] = + boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6]; + } + __syncthreads(); - if (threadIdx.x < row_size) { - const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; - const float *cur_box = boxes + cur_box_idx * 5; + if (threadIdx.x < row_size) { + const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const float *cur_box = boxes + cur_box_idx * 7; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < 
col_size; i++) { - if (iou_normal(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { - t |= 1ULL << i; + int i = 0; + unsigned long long t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh) { + t |= 1ULL << i; + } } + const int col_blocks = + (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS; + mask[cur_box_idx * col_blocks + col_start] = t; } - const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS); - mask[cur_box_idx * col_blocks + col_start] = t; } } diff --git a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh index 3181aa65cddf129e9e97dde97ceb97923b75c135..3cf52bb90eb27d02b28c52069c760c8a38f83f08 100644 --- a/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/knn_cuda_kernel.cuh @@ -51,40 +51,41 @@ __global__ void knn_forward_cuda_kernel(int b, int n, int m, int nsample, const T *xyz, const T *new_xyz, int *__restrict__ idx, T *dist2) { int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= m) return; + CUDA_1D_KERNEL_LOOP(pt_idx, m) { + if (bs_idx >= b) return; - new_xyz += bs_idx * m * 3 + pt_idx * 3; - xyz += bs_idx * n * 3; - idx += bs_idx * m * nsample + pt_idx * nsample; - dist2 += bs_idx * m * nsample + pt_idx * nsample; + new_xyz += bs_idx * m * 3 + pt_idx * 3; + xyz += bs_idx * n * 3; + idx += bs_idx * m * nsample + pt_idx * nsample; + dist2 += bs_idx * m * nsample + pt_idx * nsample; - T new_x = new_xyz[0]; - T new_y = new_xyz[1]; - T new_z = new_xyz[2]; + T new_x = new_xyz[0]; + T new_y = new_xyz[1]; + T new_z = new_xyz[2]; - float best_dist[100]; - int best_idx[100]; - for (int i = 0; i < nsample; i++) { - best_dist[i] = 1e10; - best_idx[i] = 0; - } - for (int i = 0; i < n; i++) { - T x = xyz[i * 3 + 0]; - T y = xyz[i * 3 + 1]; - T z = xyz[i * 3 + 2]; - T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + - (new_z - z) * (new_z - z); - if (d2 < best_dist[0]) { - best_dist[0] = d2; - best_idx[0] = i; - reheap(best_dist, best_idx, nsample); + float best_dist[100]; + int best_idx[100]; + for (int i = 0; i < nsample; i++) { + best_dist[i] = 1e10; + best_idx[i] = 0; + } + for (int i = 0; i < n; i++) { + T x = xyz[i * 3 + 0]; + T y = xyz[i * 3 + 1]; + T z = xyz[i * 3 + 2]; + T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) + + (new_z - z) * (new_z - z); + if (d2 < best_dist[0]) { + best_dist[0] = d2; + best_idx[0] = i; + reheap(best_dist, best_idx, nsample); + } + } + heap_sort(best_dist, best_idx, nsample); + for (int i = 0; i < nsample; i++) { + idx[i] = best_idx[i]; + dist2[i] = best_dist[i]; } - } - heap_sort(best_dist, best_idx, nsample); - for (int i = 0; i < nsample; i++) { - idx[i] = best_idx[i]; - dist2[i] = best_dist[i]; } } diff --git a/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh b/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh new file mode 100644 index 0000000000000000000000000000000000000000..df56e743669c3426f6abb113e4209d0cc60f2baf --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/min_area_polygons_cuda.cuh @@ -0,0 +1,300 @@ +// Copyright (c) OpenMMLab. 
All rights reserved +#ifndef MIN_AREA_POLYGONS_CUDA_KERNEL_CUH +#define MIN_AREA_POLYGONS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +#define MAXN 20 +__device__ const float PI = 3.1415926; + +struct Point { + float x, y; + __device__ Point() {} + __device__ Point(float x, float y) : x(x), y(y) {} +}; + +__device__ inline void swap1(Point *a, Point *b) { + Point temp; + temp.x = a->x; + temp.y = a->y; + + a->x = b->x; + a->y = b->y; + + b->x = temp.x; + b->y = temp.y; +} +__device__ inline float cross(Point o, Point a, Point b) { + return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y); +} + +__device__ inline float dis(Point a, Point b) { + return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y); +} +__device__ inline void minBoundingRect(Point *ps, int n_points, float *minbox) { + float convex_points[2][MAXN]; + for (int j = 0; j < n_points; j++) { + convex_points[0][j] = ps[j].x; + } + for (int j = 0; j < n_points; j++) { + convex_points[1][j] = ps[j].y; + } + + Point edges[MAXN]; + float edges_angles[MAXN]; + float unique_angles[MAXN]; + int n_edges = n_points - 1; + int n_unique = 0; + int unique_flag = 0; + + for (int i = 0; i < n_edges; i++) { + edges[i].x = ps[i + 1].x - ps[i].x; + edges[i].y = ps[i + 1].y - ps[i].y; + } + for (int i = 0; i < n_edges; i++) { + edges_angles[i] = atan2((double)edges[i].y, (double)edges[i].x); + if (edges_angles[i] >= 0) { + edges_angles[i] = fmod((double)edges_angles[i], (double)PI / 2); + } else { + edges_angles[i] = + edges_angles[i] - (int)(edges_angles[i] / (PI / 2) - 1) * (PI / 2); + } + } + unique_angles[0] = edges_angles[0]; + n_unique += 1; + for (int i = 1; i < n_edges; i++) { + for (int j = 0; j < n_unique; j++) { + if (edges_angles[i] == unique_angles[j]) { + unique_flag += 1; + } + } + if (unique_flag == 0) { + unique_angles[n_unique] = edges_angles[i]; + n_unique += 1; + unique_flag = 0; + } else { + unique_flag = 0; + } + } + + float minarea = 1e12; + for (int i = 0; i < n_unique; i++) { + float R[2][2]; + float rot_points[2][MAXN]; + R[0][0] = cos(unique_angles[i]); + R[0][1] = sin(unique_angles[i]); + R[1][0] = -sin(unique_angles[i]); + R[1][1] = cos(unique_angles[i]); + // R x Points + for (int m = 0; m < 2; m++) { + for (int n = 0; n < n_points; n++) { + float sum = 0.0; + for (int k = 0; k < 2; k++) { + sum = sum + R[m][k] * convex_points[k][n]; + } + rot_points[m][n] = sum; + } + } + + // xmin; + float xmin, ymin, xmax, ymax; + xmin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] < xmin) { + xmin = rot_points[0][j]; + } + } + } + // ymin + ymin = 1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] < ymin) { + ymin = rot_points[1][j]; + } + } + } + // xmax + xmax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[0][j]) || isnan(rot_points[0][j])) { + continue; + } else { + if (rot_points[0][j] > xmax) { + xmax = rot_points[0][j]; + } + } + } + // ymax + ymax = -1e12; + for (int j = 0; j < n_points; j++) { + if (isinf(rot_points[1][j]) || isnan(rot_points[1][j])) { + continue; + } else { + if (rot_points[1][j] > ymax) { + ymax = rot_points[1][j]; + } + } + } + float area = (xmax - xmin) * (ymax - ymin); + if (area < minarea) { + minarea = area; + minbox[0] = unique_angles[i]; + minbox[1] = xmin; + minbox[2] = 
ymin; + minbox[3] = xmax; + minbox[4] = ymax; + } + } +} + +// convex_find +__device__ inline void Jarvis(Point *in_poly, int &n_poly) { + int n_input = n_poly; + Point input_poly[20]; + for (int i = 0; i < n_input; i++) { + input_poly[i].x = in_poly[i].x; + input_poly[i].y = in_poly[i].y; + } + Point p_max, p_k; + int max_index, k_index; + int Stack[20], top1, top2; + // float sign; + double sign; + Point right_point[10], left_point[10]; + + for (int i = 0; i < n_poly; i++) { + if (in_poly[i].y < in_poly[0].y || + in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) { + Point *j = &(in_poly[0]); + Point *k = &(in_poly[i]); + swap1(j, k); + } + if (i == 0) { + p_max = in_poly[0]; + max_index = 0; + } + if (in_poly[i].y > p_max.y || + in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) { + p_max = in_poly[i]; + max_index = i; + } + } + if (max_index == 0) { + max_index = 1; + p_max = in_poly[max_index]; + } + + k_index = 0, Stack[0] = 0, top1 = 0; + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top1]], in_poly[i], p_k); + if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) > + dis(in_poly[Stack[top1]], p_k)))) { + p_k = in_poly[i]; + k_index = i; + } + } + top1++; + Stack[top1] = k_index; + } + + for (int i = 0; i <= top1; i++) { + right_point[i] = in_poly[Stack[i]]; + } + + k_index = 0, Stack[0] = 0, top2 = 0; + + while (k_index != max_index) { + p_k = p_max; + k_index = max_index; + for (int i = 1; i < n_poly; i++) { + sign = cross(in_poly[Stack[top2]], in_poly[i], p_k); + if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) > + dis(in_poly[Stack[top2]], p_k))) { + p_k = in_poly[i]; + k_index = i; + } + } + top2++; + Stack[top2] = k_index; + } + + for (int i = top2 - 1; i >= 0; i--) { + left_point[i] = in_poly[Stack[i]]; + } + + for (int i = 0; i < top1 + top2; i++) { + if (i <= top1) { + in_poly[i] = right_point[i]; + } else { + in_poly[i] = left_point[top2 - (i - top1)]; + } + } + n_poly = top1 + top2; +} + +template +__device__ inline void Findminbox(T const *const p, T *minpoints) { + Point ps1[MAXN]; + Point convex[MAXN]; + for (int i = 0; i < 9; i++) { + convex[i].x = p[i * 2]; + convex[i].y = p[i * 2 + 1]; + } + int n_convex = 9; + Jarvis(convex, n_convex); + int n1 = n_convex; + for (int i = 0; i < n1; i++) { + ps1[i].x = convex[i].x; + ps1[i].y = convex[i].y; + } + ps1[n1].x = convex[0].x; + ps1[n1].y = convex[0].y; + + float minbbox[5] = {0}; + minBoundingRect(ps1, n1 + 1, minbbox); + float angle = minbbox[0]; + float xmin = minbbox[1]; + float ymin = minbbox[2]; + float xmax = minbbox[3]; + float ymax = minbbox[4]; + float R[2][2]; + + R[0][0] = cos(angle); + R[0][1] = sin(angle); + R[1][0] = -sin(angle); + R[1][1] = cos(angle); + + minpoints[0] = xmax * R[0][0] + ymin * R[1][0]; + minpoints[1] = xmax * R[0][1] + ymin * R[1][1]; + minpoints[2] = xmin * R[0][0] + ymin * R[1][0]; + minpoints[3] = xmin * R[0][1] + ymin * R[1][1]; + minpoints[4] = xmin * R[0][0] + ymax * R[1][0]; + minpoints[5] = xmin * R[0][1] + ymax * R[1][1]; + minpoints[6] = xmax * R[0][0] + ymax * R[1][0]; + minpoints[7] = xmax * R[0][1] + ymax * R[1][1]; +} + +template +__global__ void min_area_polygons_cuda_kernel(const int ex_n_boxes, + const T *ex_boxes, T *minbox) { + CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) { + const T *cur_box = ex_boxes + index * 18; + T *cur_min_box = minbox + index * 8; + Findminbox(cur_box, cur_min_box); + } +} + +#endif // 
MIN_AREA_POLYGONS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh index aff1ea26fafb6574060797d24131b8540594716d..12225ffdb3b1691ad9edabcd1663109f67ef1a6f 100644 --- a/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/ms_deform_attn_cuda_kernel.cuh @@ -14,11 +14,6 @@ #include "common_cuda_helper.hpp" #include "pytorch_cuda_helper.hpp" -const int CUDA_NUM_THREADS = 1024; -inline int GET_BLOCKS(const int N, const int num_threads) { - return (N + num_threads - 1) / num_threads; -} - template __device__ scalar_t ms_deform_attn_im2col_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, @@ -267,10 +262,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + const int qid_stride = num_heads * channels; CUDA_1D_KERNEL_LOOP(index, n) { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -285,11 +281,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; for (int l_col = 0; l_col < num_levels; ++l_col) { @@ -326,23 +322,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; - for (unsigned int tid = 1; tid < blockSize; ++tid) { + for (unsigned int _tid = 1; _tid < blockSize; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; + _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 1) = _grad_h; + *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -357,10 +353,10 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = 
threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -375,8 +371,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -425,16 +422,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( } if (tid == 0) { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -449,11 +446,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -468,8 +465,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -509,23 +507,23 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1( _grad_h = cache_grad_sampling_loc[1], _grad_a = cache_grad_attn_weight[0]; int sid = 2; - for (unsigned int tid = 1; tid < blockDim.x; ++tid) { + for (unsigned int _tid = 1; _tid < blockDim.x; ++_tid) { _grad_w += cache_grad_sampling_loc[sid]; _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; + _grad_a += cache_grad_attn_weight[_tid]; sid += 2; } - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; + *grad_sampling_loc_out = _grad_w; + *(grad_sampling_loc_out + 
1) = _grad_h; + *grad_attn_weight_out = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -540,11 +538,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -559,8 +557,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -618,16 +617,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2( } if (tid == 0) { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; + *grad_sampling_loc_out = cache_grad_sampling_loc[0]; + *(grad_sampling_loc_out + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight_out = cache_grad_attn_weight[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -642,11 +641,11 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( const int channels, const int num_levels, const int num_query, const int num_point, scalar_t *grad_value, scalar_t *grad_sampling_loc, scalar_t *grad_attn_weight) { + extern __shared__ int _s[]; + scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); + scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; CUDA_1D_KERNEL_LOOP(index, n) { - extern __shared__ int _s[]; - scalar_t *cache_grad_sampling_loc = reinterpret_cast(_s); - scalar_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % channels; _temp /= channels; @@ -661,8 +660,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = 
grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -720,16 +720,16 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( } if (tid == 0) { - atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); - atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); - atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + atomicAdd(grad_sampling_loc_out, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc_out + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight_out, cache_grad_attn_weight[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } @@ -759,8 +759,9 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm( int data_weight_ptr = sampling_index * num_levels * num_point; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; + scalar_t *grad_sampling_loc_out = + grad_sampling_loc + (grad_sampling_ptr << 1); + scalar_t *grad_attn_weight_out = grad_attn_weight + grad_sampling_ptr; const int grad_weight_stride = 1; const int grad_loc_stride = 2; const int qid_stride = num_heads * channels; @@ -787,12 +788,12 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm( ms_deform_attn_col2im_bilinear_gm( data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, top_grad, weight, grad_value_ptr, - grad_sampling_loc, grad_attn_weight); + grad_sampling_loc_out, grad_attn_weight_out); } data_weight_ptr += 1; data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; + grad_attn_weight_out += grad_weight_stride; + grad_sampling_loc_out += grad_loc_stride; } } } diff --git a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh index 40a2f462202cb06e7230ad3f1e17474e93ddc4cb..0a5c2505f5c7716ba025a5884debed73c46db9d5 100644 --- a/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/nms_cuda_kernel.cuh @@ -30,45 +30,88 @@ __device__ inline bool devIoU(float const *const a, float const *const b, __global__ void nms_cuda(const int n_boxes, const float iou_threshold, const int offset, const float *dev_boxes, unsigned long long *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - const int tid = threadIdx.x; + int blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + CUDA_2D_KERNEL_BLOCK_LOOP(col_start, blocks, row_start, blocks) { + const int tid = threadIdx.x; + + if (row_start > col_start) return; + + const int row_size = + fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 4]; + if (tid < col_size) { + block_boxes[tid * 4 + 0] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; + block_boxes[tid * 4 + 1] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; + block_boxes[tid * 4 + 2] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; + block_boxes[tid * 4 + 3] = + dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + } + __syncthreads(); + + if (tid < row_size) { + const int cur_box_idx = threadsPerBlock * 
row_start + tid; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + unsigned long long int t = 0; + int start = 0; + if (row_start == col_start) { + start = tid + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { + t |= 1ULL << i; + } + } + dev_mask[cur_box_idx * gridDim.y + col_start] = t; + } + } +} - if (row_start > col_start) return; +__global__ void gather_keep_from_mask(bool *keep, + const unsigned long long *dev_mask, + const int n_boxes) { + const int col_blocks = (n_boxes + threadsPerBlock - 1) / threadsPerBlock; + const int tid = threadIdx.x; - const int row_size = - fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + // mark the bboxes which have been removed. + extern __shared__ unsigned long long removed[]; - __shared__ float block_boxes[threadsPerBlock * 4]; - if (tid < col_size) { - block_boxes[tid * 4 + 0] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 0]; - block_boxes[tid * 4 + 1] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 1]; - block_boxes[tid * 4 + 2] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 2]; - block_boxes[tid * 4 + 3] = - dev_boxes[(threadsPerBlock * col_start + tid) * 4 + 3]; + // initialize removed. + for (int i = tid; i < col_blocks; i += blockDim.x) { + removed[i] = 0; } __syncthreads(); - if (tid < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + tid; - const float *cur_box = dev_boxes + cur_box_idx * 4; - int i = 0; - unsigned long long int t = 0; - int start = 0; - if (row_start == col_start) { - start = tid + 1; - } - for (i = start; i < col_size; i++) { - if (devIoU(cur_box, block_boxes + i * 4, offset, iou_threshold)) { - t |= 1ULL << i; + for (int nblock = 0; nblock < col_blocks; ++nblock) { + auto removed_val = removed[nblock]; + __syncthreads(); + const int i_offset = nblock * threadsPerBlock; +#pragma unroll + for (int inblock = 0; inblock < threadsPerBlock; ++inblock) { + const int i = i_offset + inblock; + if (i >= n_boxes) break; + // select a candidate, check if it should kept. + if (!(removed_val & (1ULL << inblock))) { + if (tid == 0) { + // mark the output. + keep[i] = true; + } + auto p = dev_mask + i * col_blocks; + // remove all bboxes which overlap the candidate. + for (int j = tid; j < col_blocks; j += blockDim.x) { + if (j >= nblock) removed[j] |= p[j]; + } + __syncthreads(); + removed_val = removed[nblock]; } } - dev_mask[cur_box_idx * gridDim.y + col_start] = t; } } + #endif // NMS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh index 80bed9681f748390999a2963bd3448570b0dbf6a..747327afb83900177dd4721f1b0ba99153f658d7 100644 --- a/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh +++ b/mmcv/ops/csrc/common/cuda/nms_rotated_cuda.cuh @@ -43,18 +43,16 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes, // (x_center, y_center, width, height, angle_degrees) here. 
__shared__ T block_boxes[threadsPerBlock * 5]; if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 6 + 0] = + block_boxes[threadIdx.x * 5 + 0] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 0]; - block_boxes[threadIdx.x * 6 + 1] = + block_boxes[threadIdx.x * 5 + 1] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 1]; - block_boxes[threadIdx.x * 6 + 2] = + block_boxes[threadIdx.x * 5 + 2] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 2]; - block_boxes[threadIdx.x * 6 + 3] = + block_boxes[threadIdx.x * 5 + 3] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 3]; - block_boxes[threadIdx.x * 6 + 4] = + block_boxes[threadIdx.x * 5 + 4] = dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 4]; - block_boxes[threadIdx.x * 6 + 5] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 6 + 5]; } __syncthreads(); @@ -71,7 +69,7 @@ __global__ void nms_rotated_cuda_kernel(const int n_boxes, // Instead of devIoU used by original horizontal nms, here // we use the single_box_iou_rotated function from // box_iou_rotated_utils.h - if (single_box_iou_rotated(cur_box, block_boxes + i * 6, 0) > + if (single_box_iou_rotated(cur_box, block_boxes + i * 5, 0) > iou_threshold) { t |= 1ULL << i; } diff --git a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh index 12182cc3704eaacd1da838ce357c2677ad029eaa..342362079a5ce3dde6d19532b3014872f4373330 100644 --- a/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/points_in_boxes_cuda_kernel.cuh @@ -45,20 +45,21 @@ __global__ void points_in_boxes_part_forward_cuda_kernel( // (B, npoints), default -1 int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= batch_size || pt_idx >= pts_num) return; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (bs_idx >= batch_size) return; - boxes += bs_idx * boxes_num * 7; - pts += bs_idx * pts_num * 3 + pt_idx * 3; - box_idx_of_points += bs_idx * pts_num + pt_idx; + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num + pt_idx; - T local_x = 0, local_y = 0; - int cur_in_flag = 0; - for (int k = 0; k < boxes_num; k++) { - cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); - if (cur_in_flag) { - box_idx_of_points[0] = k; - break; + T local_x = 0, local_y = 0; + int cur_in_flag = 0; + for (int k = 0; k < boxes_num; k++) { + cur_in_flag = check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[0] = k; + break; + } } } } @@ -73,19 +74,20 @@ __global__ void points_in_boxes_all_forward_cuda_kernel( // (B, npoints), default -1 int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= batch_size || pt_idx >= pts_num) return; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (bs_idx >= batch_size) return; - boxes += bs_idx * boxes_num * 7; - pts += bs_idx * pts_num * 3 + pt_idx * 3; - box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; + boxes += bs_idx * boxes_num * 7; + pts += bs_idx * pts_num * 3 + pt_idx * 3; + box_idx_of_points += bs_idx * pts_num * boxes_num + pt_idx * boxes_num; - T local_x = 0, local_y = 0; - for (int k = 0; k < boxes_num; k++) { - const int cur_in_flag = - check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); - if (cur_in_flag) { - box_idx_of_points[k] = 1; + T local_x = 0, local_y = 0; + for (int k = 0; k < boxes_num; k++) { 
+ const int cur_in_flag = + check_pt_in_box3d(pts, boxes + k * 7, local_x, local_y); + if (cur_in_flag) { + box_idx_of_points[k] = 1; + } } } } diff --git a/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a0769d75a29ce8d7eac00931d6f51caa292b2693 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/points_in_polygons_cuda_kernel.cuh @@ -0,0 +1,79 @@ +// Copyright (c) OpenMMLab. All rights reserved +#ifndef POINTS_IN_POLYGONS_CUDA_KERNEL_CUH +#define POINTS_IN_POLYGONS_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +struct point { + float x, y; +}; + +template +__global__ void points_in_polygons_forward_cuda_kernel( + const int nthreads, const scalar_t *vertex1, const scalar_t *vertex2, + const int rows, const int cols, scalar_t *inside_flag) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int row = index / cols; + int col = index % cols; + + const scalar_t *offset_vertex1 = vertex1 + row * 2; + const scalar_t *offset_vertex2 = vertex2 + col * 8; + + point point_[1]; + point polygon[4]; + + point_[0].x = offset_vertex1[0]; + point_[0].y = offset_vertex1[1]; + + polygon[0].x = offset_vertex2[0]; + polygon[0].y = offset_vertex2[1]; + polygon[1].x = offset_vertex2[2]; + polygon[1].y = offset_vertex2[3]; + polygon[2].x = offset_vertex2[4]; + polygon[2].y = offset_vertex2[5]; + polygon[3].x = offset_vertex2[6]; + polygon[3].y = offset_vertex2[7]; + + int nCross = 0; + int i, j; + float sx, sy, tx, ty, px, py, x; + for (i = 0, j = 3; i < 4; j = i, i++) { + sx = polygon[i].x; + sy = polygon[i].y; + tx = polygon[j].x; + ty = polygon[j].y; + + px = point_[0].x; + py = point_[0].y; + + if (py < min(sy, ty)) continue; + if (py > max(sy, ty)) continue; + + if ((sx == px && sy == py) || (tx == px && ty == py)) { + break; + } else { + if ((sy < py && ty >= py) || (sy >= py && ty < py)) { + x = sx + (py - sy) * (tx - sx) / (ty - sy); + if (x == px) { + break; + } + if (x > px) { + nCross++; + } + } + } + } + if (nCross % 2 == 1) { + inside_flag[index] = 1.0; + } else { + inside_flag[index] = 0.0; + } + return; + } +} + +#endif // POINTS_IN_POLYGONS_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ea8c37e22afdd5b3c48c5ea6fc29004d74340fb5 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/prroi_pool_cuda_kernel.cuh @@ -0,0 +1,381 @@ +// Copyright (c) OpenMMLab. All rights reserved +// Modified from +// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu +// Distributed under terms of the MIT license. +#ifndef PRROI_POOL_CUDA_KERNEL_CUH +#define PRROI_POOL_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data, + const int h, + const int w, + const int height, + const int width) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + T retVal = overflow ? 
0.0f : data[h * width + w]; + return retVal; +} + +template +__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) { + return (1.0f - abs(dh)) * (1.0f - abs(dw)); +} + +template +__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t, + T c1, T c2) { + return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1; +} + +template +__device__ static T PrRoIPoolingInterpolation(const T *data, const T h, + const T w, const int height, + const int width) { + T retVal = 0.0f; + int h1 = floorf(h); + int w1 = floorf(w); + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h) + 1; + w1 = floorf(w); + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h); + w1 = floorf(w) + 1; + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + h1 = floorf(h) + 1; + w1 = floorf(w) + 1; + retVal += PrRoIPoolingGetData(data, h1, w1, height, width) * + PrRoIPoolingGetCoeff(h - T(h1), w - T(w1)); + return retVal; +} + +template +__device__ static T PrRoIPoolingMatCalculation(const T *this_data, + const int s_h, const int s_w, + const int e_h, const int e_w, + const T y0, const T x0, + const T y1, const T x1, + const int h0, const int w0) { + T alpha, beta, lim_alpha, lim_beta, tmp; + T sum_out = 0; + + alpha = x0 - T(s_w); + beta = y0 - T(s_h); + lim_alpha = x1 - T(s_w); + lim_beta = y1 - T(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; + + alpha = x0 - T(s_w); + beta = T(e_h) - y1; + lim_alpha = x1 - T(s_w); + lim_beta = T(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; + + return sum_out; +} + +template +__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff, + const int h, const int w, + const int height, + const int width, + const T coeff) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff); +} + +template +__device__ static void PrRoIPoolingMatDistributeDiff( + T *diff, const T top_diff, const int s_h, const int s_w, const int e_h, + const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, + const int w0) { + T alpha, beta, lim_alpha, lim_beta, tmp; + + alpha = x0 - T(s_w); + beta = y0 - T(s_h); + lim_alpha = x1 - T(s_w); + lim_beta = y1 - T(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, s_h, 
s_w, h0, w0, tmp); + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); + + alpha = x0 - T(s_w); + beta = T(e_h) - y1; + lim_alpha = x1 - T(s_w); + lim_beta = T(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); + + alpha = T(e_w) - x1; + lim_alpha = T(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); +} + +template +__global__ void prroi_pool_forward_cuda_kernel( + const int nthreads, const T *input, const T *rois, T *output, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T *offset_rois = rois + n * 5; + int roi_batch_ind = offset_rois[0]; + + T roi_x1 = offset_rois[1] * spatial_scale; + T roi_y1 = offset_rois[2] * spatial_scale; + T roi_x2 = offset_rois[3] * spatial_scale; + T roi_y2 = offset_rois[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, ((T)0.0)); + T roi_height = max(roi_y2 - roi_y1, ((T)0.0)); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T *this_data = + input + (roi_batch_ind * channels + c) * height * width; + T *this_out = output + index; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + if (bin_size == 0) { + *this_out = 0; + continue; + } + + T sum_out = 0; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) + for (int bin_y = start_y; bin_y < end_y; ++bin_y) + sum_out += PrRoIPoolingMatCalculation( + this_data, bin_y, bin_x, bin_y + 1, bin_x + 1, + max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), + min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, + width); + *this_out = sum_out / bin_size; + } +} + +template +__global__ void prroi_pool_backward_cuda_kernel( + const int nthreads, const T *grad_output, const T *rois, T *grad_input, + const int pooled_height, const int pooled_width, const T spatial_scale, + const int channels, const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + rois += n * 5; + + int roi_batch_ind = rois[0]; + T roi_x1 = rois[1] * spatial_scale; + T roi_y1 = rois[2] * spatial_scale; + T roi_x2 = 
rois[3] * spatial_scale; + T roi_y2 = rois[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, (T)0); + T roi_height = max(roi_y2 - roi_y1, (T)0); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T *this_out_grad = grad_output + index; + T *this_data_grad = + grad_input + (roi_batch_ind * channels + c) * height * width; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + + T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) + for (int bin_y = start_y; bin_y < end_y; ++bin_y) + PrRoIPoolingMatDistributeDiff( + this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1, + max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)), + min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height, + width); + } +} + +template +__global__ void prroi_pool_coor_backward_cuda_kernel( + const int nthreads, const T *output, const T *grad_output, const T *input, + const T *rois, T *grad_rois, const int pooled_height, + const int pooled_width, const T spatial_scale, const int channels, + const int height, const int width) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + rois += n * 5; + + int roi_batch_ind = rois[0]; + T roi_x1 = rois[1] * spatial_scale; + T roi_y1 = rois[2] * spatial_scale; + T roi_x2 = rois[3] * spatial_scale; + T roi_y2 = rois[4] * spatial_scale; + + T roi_width = max(roi_x2 - roi_x1, (T)0); + T roi_height = max(roi_y2 - roi_y1, (T)0); + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + const T output_grad_val = grad_output[index]; + const T *this_input_data = + input + (roi_batch_ind * channels + c) * height * width; + const T output_val = output[index]; + T *this_rois_grad = grad_rois + n * 5; + + T bin_x1 = roi_x1 + bin_size_w * pw; + T bin_y1 = roi_y1 + bin_size_h * ph; + T bin_x2 = bin_x1 + bin_size_w; + T bin_y2 = bin_y1 + bin_size_h; + + T bin_size = max(T(0.0), bin_size_w * bin_size_h); + + T sum_out = bin_size == T(0) ? 
T(0) : output_grad_val / bin_size; + + // WARNING: to be discussed + if (sum_out == 0) return; + + int start_x, start_y, end_x, end_y; + + start_x = floorf(bin_x1); + end_x = ceilf(bin_x2); + start_y = floorf(bin_y1); + end_y = ceilf(bin_y2); + + T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0; + for (int bin_y = start_y; bin_y < end_y; ++bin_y) { + grad_x1_y += PrRoIPoolingSingleCoorIntegral( + max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, + PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1, + height, width), + PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1, + height, width)); + + grad_x2_y += PrRoIPoolingSingleCoorIntegral( + max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y, + PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2, + height, width), + PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2, + height, width)); + } + + for (int bin_x = start_x; bin_x < end_x; ++bin_x) { + grad_x_y1 += PrRoIPoolingSingleCoorIntegral( + max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, + PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x), + height, width), + PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1), + height, width)); + + grad_x_y2 += PrRoIPoolingSingleCoorIntegral( + max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x, + PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x), + height, width), + PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1), + height, width)); + } + + T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val; + T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val; + T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val; + T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val; + + partial_x1 = partial_x1 / bin_size * spatial_scale; + partial_x2 = partial_x2 / bin_size * spatial_scale; + partial_y1 = partial_y1 / bin_size * spatial_scale; + partial_y2 = partial_y2 / bin_size * spatial_scale; + + // (index, x1, y1, x2, y2) + this_rois_grad[0] = 0; + atomicAdd(this_rois_grad + 1, + (partial_x1 * (1.0f - T(pw) / pooled_width) + + partial_x2 * (1.0f - T(pw + 1) / pooled_width)) * + output_grad_val); + atomicAdd(this_rois_grad + 2, + (partial_y1 * (1.0f - T(ph) / pooled_height) + + partial_y2 * (1.0f - T(ph + 1) / pooled_height)) * + output_grad_val); + atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width + + partial_x1 * T(pw) / pooled_width) * + output_grad_val); + atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height + + partial_y1 * T(ph) / pooled_height) * + output_grad_val); + } +} + +#endif // ROI_POOL_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..4383d9e82cce97362f53cf799b8dfa30c7b4cd02 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/riroi_align_rotated_cuda_kernel.cuh @@ -0,0 +1,242 @@ +// Modified from +// https://github.com/csuhan/ReDet/blob/master/mmdet/ops/riroi_align/src/riroi_align_kernel.cu +#ifndef RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH +#define RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH + +#include +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else // MMCV_USE_PARROTS +#include "pytorch_cuda_helper.hpp" +#endif // MMCV_USE_PARROTS + +/*** Forward ***/ +template +__global__ void riroi_align_rotated_forward_cuda_kernel( + const int 
nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int num_samples, const bool clockwise, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int num_orientations, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int o = (index / pooled_width / pooled_height) % num_orientations; + int c = + (index / pooled_width / pooled_height / num_orientations) % channels; + int n = index / pooled_width / pooled_height / num_orientations / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not using rounding; this implementation detail is critical + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + + const scalar_t *offset_bottom_data_plus = + bottom_data + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosscalar_theta = cos(theta); + scalar_t sinscalar_theta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. 
= 4 + + scalar_t output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta (counterclockwise) around the center and translate + scalar_t y = yy * cosscalar_theta - xx * sinscalar_theta + roi_center_h; + scalar_t x = yy * sinscalar_theta + xx * cosscalar_theta + roi_center_w; + + scalar_t val = bilinear_interpolate( + offset_bottom_data, height, width, y, x, index); + scalar_t val_plus = bilinear_interpolate( + offset_bottom_data_plus, height, width, y, x, index); + output_val += r_var * val + l_var * val_plus; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +/*** Backward ***/ +template +__global__ void riroi_align_rotated_backward_cuda_kernel( + const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, + const scalar_t spatial_scale, const int num_samples, const bool clockwise, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int num_orientations, + scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int o = (index / pooled_width / pooled_height) % num_orientations; + int c = + (index / pooled_width / pooled_height / num_orientations) % channels; + int n = index / pooled_width / pooled_height / num_orientations / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 6; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not round + scalar_t roi_center_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_center_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_width = offset_bottom_rois[3] * spatial_scale; + scalar_t roi_height = offset_bottom_rois[4] * spatial_scale; + // scalar_t theta = offset_bottom_rois[5] * M_PI / 180.0; + scalar_t theta = offset_bottom_rois[5]; + // Force malformed ROIs to be 1x1 + roi_width = max(roi_width, (scalar_t)1.); + roi_height = max(roi_height, (scalar_t)1.); + + scalar_t bin_size_h = static_cast(roi_height) / + static_cast(pooled_height); + scalar_t bin_size_w = + static_cast(roi_width) / static_cast(pooled_width); + + // find aligned index + scalar_t ind_float = theta * num_orientations / (2 * M_PI); + int ind = floorf(ind_float); + scalar_t l_var = ind_float - (scalar_t)ind; + scalar_t r_var = 1.0 - l_var; + // correct start channel + ind = (ind + num_orientations) % num_orientations; + // rotated channel + int ind_rot = (o - ind + num_orientations) % num_orientations; + int ind_rot_plus = (ind_rot + 1 + num_orientations) % num_orientations; + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot) * + height * width; + scalar_t *offset_bottom_diff_plus = + bottom_diff + (roi_batch_ind * channels * num_orientations + + c * num_orientations + ind_rot_plus) * + height * width; + int top_offset = + (n * channels * num_orientations + c * num_orientations + o) * + pooled_height * pooled_width; + const scalar_t *offset_top_diff = top_diff + top_offset; + const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use 
roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (num_samples > 0) + ? num_samples + : ceilf(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (num_samples > 0) ? num_samples : ceilf(roi_width / pooled_width); + + // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). + // Appropriate translation needs to be applied after. + if (clockwise) { + theta = -theta; // If clockwise, the angle needs to be reversed. + } + scalar_t roi_start_h = -roi_height / 2.0; + scalar_t roi_start_w = -roi_width / 2.0; + scalar_t cosTheta = cos(theta); + scalar_t sinTheta = sin(theta); + + // We do average (integral) pooling inside a bin + const scalar_t count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) { // e.g., iy = 0, 1 + const scalar_t yy = + roi_start_h + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const scalar_t xx = roi_start_w + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + + // Rotate by theta around the center and translate + scalar_t y = yy * cosTheta - xx * sinTheta + roi_center_h; + scalar_t x = yy * sinTheta + xx * cosTheta + roi_center_w; + + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, + w4, x_low, x_high, y_low, + y_high, index); + + scalar_t g1 = top_diff_this_bin * w1 / count; + scalar_t g2 = top_diff_this_bin * w2 / count; + scalar_t g3 = top_diff_this_bin * w3 / count; + scalar_t g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1 * r_var); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3 * r_var); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4 * r_var); + + atomicAdd(offset_bottom_diff_plus + y_low * width + x_low, + g1 * l_var); + atomicAdd(offset_bottom_diff_plus + y_low * width + x_high, + g2 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_low, + g3 * l_var); + atomicAdd(offset_bottom_diff_plus + y_high * width + x_high, + g4 * l_var); + + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RiRoIAlignBackward + +#endif // RIROI_ALIGN_ROTATED_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh index 33571f29674f53674415afe1bb4cc3ea0d8a9865..8274dc50c709630c4ee456efd543aa1265049b41 100644 --- a/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roi_align_rotated_cuda_kernel.cuh @@ -20,7 +20,7 @@ template __global__ void roi_align_rotated_forward_cuda_kernel( const int nthreads, const scalar_t *bottom_data, const scalar_t *bottom_rois, const scalar_t spatial_scale, - const int sample_num, const bool aligned, const bool clockwise, + const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { @@ -58,11 +58,11 @@ __global__ void roi_align_rotated_forward_cuda_kernel( bottom_data + (roi_batch_ind * channels + c) * height * width; // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = 
(sample_num > 0) - ? sample_num + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = - (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. @@ -104,7 +104,7 @@ __global__ void roi_align_rotated_forward_cuda_kernel( template __global__ void roi_align_rotated_backward_cuda_kernel( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, - const scalar_t spatial_scale, const int sample_num, const bool aligned, + const scalar_t spatial_scale, const int sampling_ratio, const bool aligned, const bool clockwise, const int channels, const int height, const int width, const int pooled_height, const int pooled_width, scalar_t *bottom_diff) { CUDA_1D_KERNEL_LOOP(index, nthreads) { @@ -146,11 +146,11 @@ __global__ void roi_align_rotated_backward_cuda_kernel( const scalar_t top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; // We use roi_bin_grid to sample the grid and mimic integral - int roi_bin_grid_h = (sample_num > 0) - ? sample_num + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio : ceilf(roi_height / pooled_height); // e.g., = 2 int roi_bin_grid_w = - (sample_num > 0) ? sample_num : ceilf(roi_width / pooled_width); + (sampling_ratio > 0) ? sampling_ratio : ceilf(roi_width / pooled_width); // roi_start_h and roi_start_w are computed wrt the center of RoI (x, y). // Appropriate translation needs to be applied after. diff --git a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh index 3b95dc79080323a0b7d1d6bba06a3a46b04a3f05..fc0aacf1435f8715fae92de535bf01bac07ac39a 100644 --- a/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roiaware_pool3d_cuda_kernel.cuh @@ -44,37 +44,38 @@ __global__ void generate_pts_mask_for_box3d(int boxes_num, int pts_num, // coordinate params pts: (npoints, 3) [x, y, z] params pts_mask: (N, // npoints): -1 means point does not in this box, otherwise: encode (x_idxs, // y_idxs, z_idxs) by binary bit - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; - if (pt_idx >= pts_num || box_idx >= boxes_num) return; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (box_idx >= boxes_num) return; - pts += pt_idx * 3; - rois += box_idx * 7; - pts_mask += box_idx * pts_num + pt_idx; + pts += pt_idx * 3; + rois += box_idx * 7; + pts_mask += box_idx * pts_num + pt_idx; - T local_x = 0, local_y = 0; - int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(pts, rois, local_x, local_y); - pts_mask[0] = -1; - if (cur_in_flag > 0) { - T local_z = pts[2] - rois[2]; - T x_size = rois[3], y_size = rois[4], z_size = rois[5]; + pts_mask[0] = -1; + if (cur_in_flag > 0) { + T local_z = pts[2] - rois[2]; + T x_size = rois[3], y_size = rois[4], z_size = rois[5]; - T x_res = x_size / out_x; - T y_res = y_size / out_y; - T z_res = z_size / out_z; + T x_res = x_size / out_x; + T y_res = y_size / out_y; + T z_res = z_size / out_z; - unsigned int x_idx = int((local_x + x_size / 2) / x_res); - unsigned int y_idx = int((local_y + y_size / 2) / y_res); - unsigned int z_idx = int(local_z / z_res); + unsigned int x_idx = int((local_x + x_size / 2) / x_res); + unsigned int 
y_idx = int((local_y + y_size / 2) / y_res); + unsigned int z_idx = int(local_z / z_res); - x_idx = min(max(x_idx, 0), out_x - 1); - y_idx = min(max(y_idx, 0), out_y - 1); - z_idx = min(max(z_idx, 0), out_z - 1); + x_idx = min(max(x_idx, 0), out_x - 1); + y_idx = min(max(y_idx, 0), out_y - 1); + z_idx = min(max(z_idx, 0), out_z - 1); - unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; + unsigned int idx_encoding = (x_idx << 16) + (y_idx << 8) + z_idx; - pts_mask[0] = idx_encoding; + pts_mask[0] = idx_encoding; + } } } @@ -86,26 +87,24 @@ __global__ void collect_inside_pts_for_box3d(int boxes_num, int pts_num, T *pts_idx_of_voxels) { // params pts_mask: (N, npoints) 0 or 1 // params pts_idx_of_voxels: (N, out_x, out_y, out_z, max_pts_each_voxel) - - int box_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (box_idx >= boxes_num) return; - - int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; - - for (int k = 0; k < pts_num; k++) { - if (pts_mask[box_idx * pts_num + k] != -1) { - unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; - unsigned int x_idx = (idx_encoding >> 16) & 0xFF; - unsigned int y_idx = (idx_encoding >> 8) & 0xFF; - unsigned int z_idx = idx_encoding & 0xFF; - unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + - y_idx * out_z * max_pts_each_voxel + - z_idx * max_pts_each_voxel; - unsigned int cnt = pts_idx_of_voxels[base_offset]; - if (cnt < max_num_pts) { - pts_idx_of_voxels[base_offset + cnt + 1] = k; - pts_idx_of_voxels[base_offset]++; + CUDA_1D_KERNEL_LOOP(box_idx, boxes_num) { + int max_num_pts = max_pts_each_voxel - 1; // index 0 is the counter + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel; + + for (int k = 0; k < pts_num; k++) { + if (pts_mask[box_idx * pts_num + k] != -1) { + unsigned int idx_encoding = pts_mask[box_idx * pts_num + k]; + unsigned int x_idx = (idx_encoding >> 16) & 0xFF; + unsigned int y_idx = (idx_encoding >> 8) & 0xFF; + unsigned int z_idx = idx_encoding & 0xFF; + unsigned int base_offset = x_idx * out_y * out_z * max_pts_each_voxel + + y_idx * out_z * max_pts_each_voxel + + z_idx * max_pts_each_voxel; + unsigned int cnt = pts_idx_of_voxels[base_offset]; + if (cnt < max_num_pts) { + pts_idx_of_voxels[base_offset + cnt + 1] = k; + pts_idx_of_voxels[base_offset]++; + } } } } @@ -124,39 +123,38 @@ __global__ void roiaware_maxpool3d(int boxes_num, int pts_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - pooled_features += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - argmax += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - int argmax_idx = -1; - float max_val = -1e50; - - int total_pts = pts_idx_of_voxels[0]; - - for (int k = 1; k <= total_pts; k++) { - if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > max_val) { - max_val = 
pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; - argmax_idx = pts_idx_of_voxels[k]; + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + argmax += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int argmax_idx = -1; + float max_val = -1e50; + + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + if (pts_feature[pts_idx_of_voxels[k] * channels + channel_idx] > + max_val) { + max_val = pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + argmax_idx = pts_idx_of_voxels[k]; + } } - } - if (argmax_idx != -1) { - pooled_features[0] = max_val; + if (argmax_idx != -1) { + pooled_features[0] = max_val; + } + argmax[0] = argmax_idx; } - argmax[0] = argmax_idx; } template @@ -172,30 +170,28 @@ __global__ void roiaware_avgpool3d(int boxes_num, int pts_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - pooled_features += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - float sum_val = 0; - int total_pts = pts_idx_of_voxels[0]; - - for (int k = 1; k <= total_pts; k++) { - sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; - } + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + pooled_features += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + float sum_val = 0; + int total_pts = pts_idx_of_voxels[0]; + + for (int k = 1; k <= total_pts; k++) { + sum_val += pts_feature[pts_idx_of_voxels[k] * channels + channel_idx]; + } - if (total_pts > 0) { - pooled_features[0] = sum_val / total_pts; + if (total_pts > 0) { + pooled_features[0] = sum_val / total_pts; + } } } @@ -210,24 +206,22 @@ __global__ void roiaware_maxpool3d_backward(int boxes_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= 
channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - argmax += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - grad_out += box_idx * out_x * out_y * out_z * channels + + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + argmax += box_idx * out_x * out_y * out_z * channels + offset_base * channels + channel_idx; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; - if (argmax[0] == -1) return; + if (argmax[0] == -1) return; - atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); + atomicAdd(grad_in + argmax[0] * channels + channel_idx, grad_out[0] * 1); + } } template @@ -242,26 +236,24 @@ __global__ void roiaware_avgpool3d_backward(int boxes_num, int channels, int box_idx = blockIdx.z; int channel_idx = blockIdx.y; - int voxel_idx_flat = blockIdx.x * blockDim.x + threadIdx.x; - - int x_idx = voxel_idx_flat / (out_y * out_z); - int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; - int z_idx = voxel_idx_flat % out_z; - if (box_idx >= boxes_num || channel_idx >= channels || x_idx >= out_x || - y_idx >= out_y || z_idx >= out_z) - return; - - int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; - pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + - offset_base * max_pts_each_voxel; - grad_out += box_idx * out_x * out_y * out_z * channels + - offset_base * channels + channel_idx; - - int total_pts = pts_idx_of_voxels[0]; - float cur_grad = 1 / fmaxf(float(total_pts), 1.0); - for (int k = 1; k <= total_pts; k++) { - atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, - grad_out[0] * cur_grad); + CUDA_1D_KERNEL_LOOP(voxel_idx_flat, out_x * out_y * out_z) { + int x_idx = voxel_idx_flat / (out_y * out_z); + int y_idx = (voxel_idx_flat - x_idx * (out_y * out_z)) / out_z; + int z_idx = voxel_idx_flat % out_z; + if (box_idx >= boxes_num || channel_idx >= channels) return; + + int offset_base = x_idx * out_y * out_z + y_idx * out_z + z_idx; + pts_idx_of_voxels += box_idx * out_x * out_y * out_z * max_pts_each_voxel + + offset_base * max_pts_each_voxel; + grad_out += box_idx * out_x * out_y * out_z * channels + + offset_base * channels + channel_idx; + + int total_pts = pts_idx_of_voxels[0]; + float cur_grad = 1 / fmaxf(float(total_pts), 1.0); + for (int k = 1; k <= total_pts; k++) { + atomicAdd(grad_in + pts_idx_of_voxels[k] * channels + channel_idx, + grad_out[0] * cur_grad); + } } } diff --git a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh index 7597719e69098ca4942c803e9853556daaa3b375..545f6ffa09d4a6cae49f1f1e68c191c1fd54de68 100644 --- a/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh @@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num, // params boxes3d: (B, M, 7) // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means // background points - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; int bs_idx = 
blockIdx.z; + CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) { + if (box_idx >= boxes_num || bs_idx >= batch_size) return; - if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) { - return; - } - int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; - pts_assign[assign_idx] = 0; + int assign_idx = + bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx; + pts_assign[assign_idx] = 0; - int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; - int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; + int box_offset = bs_idx * boxes_num * 7 + box_idx * 7; + int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3; - T local_x = 0, local_y = 0; - int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, - local_x, local_y); - pts_assign[assign_idx] = cur_in_flag; + T local_x = 0, local_y = 0; + int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset, + local_x, local_y); + pts_assign[assign_idx] = cur_in_flag; + } } __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, @@ -69,35 +69,32 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num, // params pts_assign: (B, N) // params pts_idx: (B, M, 512) // params pooled_empty_flag: (B, M) - - int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (boxes_idx >= boxes_num) { - return; - } - - int bs_idx = blockIdx.y; - - int cnt = 0; - for (int k = 0; k < pts_num; k++) { - if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) { - if (cnt < sampled_pts_num) { - pts_idx[bs_idx * boxes_num * sampled_pts_num + - boxes_idx * sampled_pts_num + cnt] = k; - cnt++; - } else - break; + CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) { + int bs_idx = blockIdx.y; + + int cnt = 0; + for (int k = 0; k < pts_num; k++) { + if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + + boxes_idx]) { + if (cnt < sampled_pts_num) { + pts_idx[bs_idx * boxes_num * sampled_pts_num + + boxes_idx * sampled_pts_num + cnt] = k; + cnt++; + } else + break; + } } - } - if (cnt == 0) { - pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; - } else if (cnt < sampled_pts_num) { - // duplicate same points for sampling - for (int k = cnt; k < sampled_pts_num; k++) { - int duplicate_idx = k % cnt; - int base_offset = - bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; - pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + if (cnt == 0) { + pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1; + } else if (cnt < sampled_pts_num) { + // duplicate same points for sampling + for (int k = cnt; k < sampled_pts_num; k++) { + int duplicate_idx = k % cnt; + int base_offset = + bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num; + pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx]; + } } } } @@ -112,33 +109,26 @@ __global__ void roipoint_pool3d_forward( // params pts_feature: (B, N, C) // params pooled_features: (B, M, 512, 3+C) // params pooled_empty_flag: (B, M) - - int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x; int box_idx = blockIdx.y; int bs_idx = blockIdx.z; - - if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num || - bs_idx >= batch_size) { - return; - } - - if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) { - return; + CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) { + if (box_idx >= boxes_num || bs_idx >= batch_size) return; + if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return; + + int temp_idx = bs_idx * boxes_num * sampled_pts_num + + box_idx * sampled_pts_num 
+ sample_pt_idx; + int src_pt_idx = pts_idx[temp_idx]; + int dst_feature_offset = temp_idx * (3 + feature_in_len); + + for (int j = 0; j < 3; j++) + pooled_features[dst_feature_offset + j] = + xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; + + int src_feature_offset = + bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; + memcpy(pooled_features + dst_feature_offset + 3, + pts_feature + src_feature_offset, feature_in_len * sizeof(T)); } - - int temp_idx = bs_idx * boxes_num * sampled_pts_num + - box_idx * sampled_pts_num + sample_pt_idx; - int src_pt_idx = pts_idx[temp_idx]; - int dst_feature_offset = temp_idx * (3 + feature_in_len); - - for (int j = 0; j < 3; j++) - pooled_features[dst_feature_offset + j] = - xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j]; - - int src_feature_offset = - bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len; - memcpy(pooled_features + dst_feature_offset + 3, - pts_feature + src_feature_offset, feature_in_len * sizeof(T)); } #endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ffcc658ccb1f5e3059c0428159bc2e80fbeee3d4 --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh @@ -0,0 +1,129 @@ +// Copyright (c) OpenMMLab. All rights reserved. +// Modified from +// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu +#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH +#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH + +#ifdef MMCV_USE_PARROTS +#include "parrots_cuda_helper.hpp" +#else +#include "pytorch_cuda_helper.hpp" +#endif + +template +__global__ void rotated_feature_align_forward_kernel( + const int nthreads, const int points, const scalar_t* bottom_data, + const scalar_t* best_bboxes, const scalar_t spatial_scale, + const int channels, const int height, const int width, scalar_t* top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + const scalar_t* bbox_offset = + best_bboxes + ((n * height + h) * width + w) * 5; + scalar_t roi_y = bbox_offset[0] * spatial_scale; + scalar_t roi_x = bbox_offset[1] * spatial_scale; + + scalar_t px[5] = {roi_x, 0, 0, 0, 0}; + scalar_t py[5] = {roi_y, 0, 0, 0, 0}; + + if (points > 1) { + scalar_t roi_w = bbox_offset[2] * spatial_scale; + scalar_t roi_h = bbox_offset[3] * spatial_scale; + scalar_t roi_a = bbox_offset[4]; + + scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; + scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); + scalar_t wx = cosa * w_2, wy = sina * w_2; + scalar_t hx = -sina * h_2, hy = cosa * h_2; + + px[1] = roi_x + wx + hx; + py[1] = roi_y + wy + hy; + px[2] = roi_x - wx + hx; + py[2] = roi_y - wy + hy; + px[3] = roi_x - wx - hx; + py[3] = roi_y - wy - hy; + px[4] = roi_x + wx - hx; + py[4] = roi_y + wy - hy; + } + + const scalar_t* offset_bottom_data = + bottom_data + (n * channels + c) * height * width; + + scalar_t output_val = bottom_data[index]; + for (int i = 0; i < points; i++) { + output_val += bilinear_interpolate(offset_bottom_data, height, + width, py[i], px[i], i); + } + top_data[index] = output_val; + } +} + +template +__global__ void rotated_feature_align_backward_kernel( + const int nthreads, const int points, const scalar_t* top_diff, + const 
scalar_t* best_bboxes, const scalar_t spatial_scale, + const int channels, const int height, const int width, + scalar_t* bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int w = index % width; + int h = (index / width) % height; + int c = (index / width / height) % channels; + int n = index / width / height / channels; + + const scalar_t* bbox_offset = + best_bboxes + ((n * height + h) * width + w) * 5; + scalar_t roi_y = bbox_offset[0] * spatial_scale; + scalar_t roi_x = bbox_offset[1] * spatial_scale; + + scalar_t px[5] = {roi_x, 0, 0, 0, 0}; + scalar_t py[5] = {roi_y, 0, 0, 0, 0}; + + if (points > 1) { + scalar_t roi_w = bbox_offset[2] * spatial_scale; + scalar_t roi_h = bbox_offset[3] * spatial_scale; + scalar_t roi_a = bbox_offset[4]; + + scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2; + scalar_t cosa = cosf(roi_a), sina = sinf(roi_a); + scalar_t wx = cosa * w_2, wy = sina * w_2; + scalar_t hx = -sina * h_2, hy = cosa * h_2; + + px[1] = roi_x + wx + hx; + py[1] = roi_y + wy + hy; + px[2] = roi_x - wx + hx; + py[2] = roi_y - wy + hy; + px[3] = roi_x - wx - hx; + py[3] = roi_y - wy - hy; + px[4] = roi_x + wx - hx; + py[4] = roi_y + wy - hy; + } + + scalar_t* offset_bottom_diff = + bottom_diff + (n * channels + c) * height * width; + scalar_t value_top_diff = top_diff[index]; + + atomicAdd(bottom_diff + index, value_top_diff); + for (int i = 0; i < points; i++) { + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, py[i], px[i], w1, + w2, w3, w4, x_low, x_high, y_low, + y_high, i); + scalar_t g1 = value_top_diff * w1; + scalar_t g2 = value_top_diff * w2; + scalar_t g3 = value_top_diff * w3; + scalar_t g4 = value_top_diff * w4; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } + } + } +} +#endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/spconv/indice.cuh b/mmcv/ops/csrc/common/cuda/spconv/indice.cuh new file mode 100644 index 0000000000000000000000000000000000000000..5ef0009a10f8effeb447e398cff5103b400056de --- /dev/null +++ b/mmcv/ops/csrc/common/cuda/spconv/indice.cuh @@ -0,0 +1,236 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef INDICE_CU_H_ +#define INDICE_CU_H_ +#include +#include + +#include + +template +__global__ void prepareIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void prepareDeConvIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView indicesOut, + tv::TensorView gridsOut, tv::TensorView indicePairs, + tv::TensorView indiceNum, tv::TensorView indicePairUnique, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index kernelVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + kernelVolume *= kernelSize[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + auto indicePairsDim2 = indicePairs.dim(2); + Index index; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPosTranspose( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (Index i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 0, oldNum) = ix; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + indicePairs(offset, 1, oldNum) = index; + indicePairUnique[offset * indicePairsDim2 + oldNum] = index; + } + } +} + +template +__global__ void assignGridAndIndiceOutKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numAct, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape, int batchSize) { + Index index; + auto indicesOutPtr = indicesOut.data(); + for (int ix : tv::KernelLoopX(numAct)) { + index = indicePairUnique[ix]; + gridsOut[index] = ix; + index = 
tv::rowArrayIdxInv( + index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data()); + indicesOut[ix * (NDim + 1)] = index % batchSize; + } +} + +template +__global__ void assignIndicePairsKernel( + tv::TensorView indicesOut, tv::TensorView gridsOut, + int numActIn, tv::TensorView indicePairs, + tv::TensorView indicePairUnique, + const tv::SimpleVector outSpatialShape) { + Index index; + int kernelVolume = indicePairs.dim(0); + for (int ix : tv::KernelLoopX(numActIn)) { + for (int i = 0; i < kernelVolume; ++i) { + index = indicePairs(i, 1, ix); + if (index > -1) { + indicePairs(i, 1, ix) = gridsOut[index]; + } + } + } +} + +template +__global__ void prepareSubMGridKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + index = tv::rowArrayIdx(indicesIn.data() + ix * (NDim + 1) + 1, + outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + gridsOut[index] = ix; + } +} + +template +__global__ void getSubMIndicePairsKernel( + tv::TensorView indicesIn, tv::TensorView gridsOut, + tv::TensorView indicePairs, tv::TensorView indiceNum, + const tv::SimpleVector kernelSize, + const tv::SimpleVector stride, + const tv::SimpleVector padding, + const tv::SimpleVector dilation, + const tv::SimpleVector outSpatialShape) { + auto numActIn = indicesIn.dim(0); + Index spatialVolume = 1; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index numValidPoints = 0; + Index validPoints[KernelMaxVolume * (NDim + 1)]; + Index *pointPtr = nullptr; + Index index = 0; + for (int ix : tv::KernelLoopX(numActIn)) { + numValidPoints = getValidOutPos( + indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(), + stride.data(), padding.data(), dilation.data(), outSpatialShape.data(), + validPoints); + for (int i = 0; i < numValidPoints; ++i) { + pointPtr = validPoints + i * (NDim + 1); + auto offset = pointPtr[NDim]; + index = tv::rowArrayIdx(pointPtr, outSpatialShape.data()) + + spatialVolume * indicesIn(ix, 0); + if (gridsOut[index] > -1) { + auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1)); + indicePairs(offset, 1, oldNum) = gridsOut[index]; + indicePairs(offset, 0, oldNum) = ix; + } + } + } +} + +template +__global__ void resetGridKernel(const Index *indicePairUnique, + tv::TensorView gridsOut, + int numAct) { + for (int ix : tv::KernelLoopX(numAct)) { + gridsOut[indicePairUnique[ix]] = -1; + } +} + +template +__global__ void resetGridSubMKernel( + const Index *indices, tv::TensorView gridsOut, + const tv::SimpleVector outSpatialShape, int numAct) { + int outSpatialShapeReg[NDim]; + for (int i = 0; i < NDim; ++i) { + outSpatialShapeReg[i] = outSpatialShape[i]; + } + Index spatialVolume = 1; + auto indsPtr = indices; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + spatialVolume *= outSpatialShape[i]; + } + Index index; + for (int ix : tv::KernelLoopX(numAct)) { + indsPtr = indices + ix * (NDim + 1); + index = tv::rowArrayIdx(indsPtr + 1, outSpatialShapeReg); + gridsOut[index + spatialVolume * indsPtr[0]] = -1; + } +} + +#endif diff --git a/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh b/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh new file mode 100644 index 0000000000000000000000000000000000000000..e3ec68b937b0507e3a119d63a49ad79e8f48eec7 --- /dev/null +++ 
b/mmcv/ops/csrc/common/cuda/spconv/reordering.cuh @@ -0,0 +1,160 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef REORDERING_CU_H_ +#define REORDERING_CU_H_ +#include + +template +__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, + int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) + buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] = + features[inds[ilp] + iy]; + } + } + } +} + +template +__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) + reinterpret_cast( + buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] = + reinterpret_cast(features)[inds[ilp] + iy]; + } + } + } +} + +template +__global__ void gatherVecBlockKernel(scalar_t *buffer, const scalar_t *features, + const Index *indices, int size, + int numPlanes) { + int ILPStrideY[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; + features += blockIdx.x * NumTLP; + buffer += blockIdx.x * NumTLP; + + for (int iy : tv::KernelLoopY(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + reinterpret_cast( + buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] = + reinterpret_cast( + features)[indices[iy + ILPStrideY[ilp]] * numPlanes + + threadIdx.x]; + } + } +} + +template +__global__ void scatterAddGenericKernel(scalar_t *outFeatures, + const scalar_t *buffer, + const Index *indices, int size, + int numPlanes) { + int ILPStrideX[NumILP]; + Index inds[NumILP]; +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x; + for (int ix : tv::KernelLoopX(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) { + if (ix + ILPStrideX[ilp] < size) + inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes; + } + for (int iy : tv::KernelLoopY(numPlanes)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + if (ix + ILPStrideX[ilp] < size) { + outFeatures[inds[ilp] + iy] += + buffer[(ix 
+ ILPStrideX[ilp]) * numPlanes + iy]; + } + } + } + } +} + +template +__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures, + const scalar_t *buffer, + const Index *indices, int size, + int numPlanes) { + int ILPStrideY[NumILP]; + constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t); +#pragma unroll + for (int ilp = 0; ilp < NumILP; ilp++) + ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y; + outFeatures += blockIdx.x * NumTLP; + buffer += blockIdx.x * NumTLP; + scalar_t buf[vecloadFactor]; + scalar_t buf2[vecloadFactor]; + Index idx; + for (int iy : tv::KernelLoopY(size)) { +#pragma unroll + for (int ilp = 0; ilp < NumILP; ++ilp) { + idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x; + reinterpret_cast(buf)[0] = + reinterpret_cast(outFeatures)[idx]; + reinterpret_cast(buf2)[0] = reinterpret_cast( + buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x]; +#pragma unroll + for (int i = 0; i < vecloadFactor; i++) { + buf[i] += buf2[i]; + } + reinterpret_cast(outFeatures)[idx] = + reinterpret_cast(buf)[0]; + } + } +} + +#endif diff --git a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh index 43aecb3a0d3585491584c54a6881645573baafbf..971b496e589d2210131351305cbaf0ed1a027cb1 100644 --- a/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh @@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel( int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b || c_idx >= c) return; - if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; + weight += bs_idx * n * 3 + pt_idx * 3; + points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + out += bs_idx * c * n + c_idx * n; - weight += bs_idx * n * 3 + pt_idx * 3; - points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - out += bs_idx * c * n + c_idx * n; - - out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + - weight[2] * points[idx[2]]; + out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] + + weight[2] * points[idx[2]]; + } } template @@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel( int bs_idx = blockIdx.z; int c_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - - if (bs_idx >= b || c_idx >= c || pt_idx >= n) return; - - grad_out += bs_idx * c * n + c_idx * n + pt_idx; - weight += bs_idx * n * 3 + pt_idx * 3; - grad_points += bs_idx * c * m + c_idx * m; - idx += bs_idx * n * 3 + pt_idx * 3; - - atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); - atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); - atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b || c_idx >= c) return; + + grad_out += bs_idx * c * n + c_idx * n + pt_idx; + weight += bs_idx * n * 3 + pt_idx * 3; + grad_points += bs_idx * c * m + c_idx * m; + idx += bs_idx * n * 3 + pt_idx * 3; + + atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]); + atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]); + atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]); + } } #endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh index 
824da4c5c02fbaf3b87730df910e0763269cd832..15434121b94033afb2fcb9945a83db15b92262d4 100644 --- a/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh @@ -19,48 +19,49 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m, // idx: (B, N, 3) int bs_idx = blockIdx.y; - int pt_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (bs_idx >= b || pt_idx >= n) return; + CUDA_1D_KERNEL_LOOP(pt_idx, n) { + if (bs_idx >= b) return; - unknown += bs_idx * n * 3 + pt_idx * 3; - known += bs_idx * m * 3; - dist2 += bs_idx * n * 3 + pt_idx * 3; - idx += bs_idx * n * 3 + pt_idx * 3; + unknown += bs_idx * n * 3 + pt_idx * 3; + known += bs_idx * m * 3; + dist2 += bs_idx * n * 3 + pt_idx * 3; + idx += bs_idx * n * 3 + pt_idx * 3; - T ux = unknown[0]; - T uy = unknown[1]; - T uz = unknown[2]; + T ux = unknown[0]; + T uy = unknown[1]; + T uz = unknown[2]; - double best1 = 1e40, best2 = 1e40, best3 = 1e40; - int besti1 = 0, besti2 = 0, besti3 = 0; - for (int k = 0; k < m; ++k) { - T x = known[k * 3 + 0]; - T y = known[k * 3 + 1]; - T z = known[k * 3 + 2]; - T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); - if (d < best1) { - best3 = best2; - besti3 = besti2; - best2 = best1; - besti2 = besti1; - best1 = d; - besti1 = k; - } else if (d < best2) { - best3 = best2; - besti3 = besti2; - best2 = d; - besti2 = k; - } else if (d < best3) { - best3 = d; - besti3 = k; + double best1 = 1e40, best2 = 1e40, best3 = 1e40; + int besti1 = 0, besti2 = 0, besti3 = 0; + for (int k = 0; k < m; ++k) { + T x = known[k * 3 + 0]; + T y = known[k * 3 + 1]; + T z = known[k * 3 + 2]; + T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z); + if (d < best1) { + best3 = best2; + besti3 = besti2; + best2 = best1; + besti2 = besti1; + best1 = d; + besti1 = k; + } else if (d < best2) { + best3 = best2; + besti3 = besti2; + best2 = d; + besti2 = k; + } else if (d < best3) { + best3 = d; + besti3 = k; + } } + dist2[0] = best1; + dist2[1] = best2; + dist2[2] = best3; + idx[0] = besti1; + idx[1] = besti2; + idx[2] = besti3; } - dist2[0] = best1; - dist2[1] = best2; - dist2[2] = best3; - idx[0] = besti1; - idx[1] = besti2; - idx[2] = besti3; } #endif // THREE_NN_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh index 62e118b35294b864b5374394c8ae84070b8c5afb..021b488d8d716c9e8132173bf04491d42b7b6fa2 100644 --- a/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh +++ b/mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh @@ -23,20 +23,20 @@ __global__ void dynamic_voxelize_kernel( // To save some computation auto points_offset = points + index * num_features; auto coors_offset = coors + index * NDim; - int c_x = floor((points_offset[0] - coors_x_min) / voxel_x); + int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x); if (c_x < 0 || c_x >= grid_x) { coors_offset[0] = -1; continue; } - int c_y = floor((points_offset[1] - coors_y_min) / voxel_y); + int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y); if (c_y < 0 || c_y >= grid_y) { coors_offset[0] = -1; coors_offset[1] = -1; continue; } - int c_z = floor((points_offset[2] - coors_z_min) / voxel_z); + int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z); if (c_z < 0 || c_z >= grid_z) { coors_offset[0] = -1; coors_offset[1] = -1; @@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor, CUDA_1D_KERNEL_LOOP(index, num_points) { auto coor_offset = coor + index * 
NDim; // skip invalid points - if ((index >= num_points) || (coor_offset[0] == -1)) return; + if (coor_offset[0] == -1) continue; int num = 0; int coor_x = coor_offset[0]; @@ -122,7 +122,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor, point_to_pointidx[index] = i; } else if (num >= max_points) { // out of boundary - return; + break; } } } @@ -166,4 +166,51 @@ __global__ void determin_voxel_num( } } +__global__ void nondeterministic_get_assign_pos( + const int nthreads, const int32_t* coors_map, int32_t* pts_id, + int32_t* coors_count, int32_t* reduce_count, int32_t* coors_order) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + int coors_idx = coors_map[thread_idx]; + if (coors_idx > -1) { + int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1); + pts_id[thread_idx] = coors_pts_pos; + if (coors_pts_pos == 0) { + coors_order[coors_idx] = atomicAdd(coors_count, 1); + } + } + } +} + +template +__global__ void nondeterministic_assign_point_voxel( + const int nthreads, const T* points, const int32_t* coors_map, + const int32_t* pts_id, const int32_t* coors_in, const int32_t* reduce_count, + const int32_t* coors_order, T* voxels, int32_t* coors, int32_t* pts_count, + const int max_voxels, const int max_points, const int num_features, + const int NDim) { + CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) { + int coors_idx = coors_map[thread_idx]; + int coors_pts_pos = pts_id[thread_idx]; + if (coors_idx > -1 && coors_pts_pos < max_points) { + int coors_pos = coors_order[coors_idx]; + if (coors_pos < max_voxels) { + auto voxels_offset = + voxels + (coors_pos * max_points + coors_pts_pos) * num_features; + auto points_offset = points + thread_idx * num_features; + for (int k = 0; k < num_features; k++) { + voxels_offset[k] = points_offset[k]; + } + if (coors_pts_pos == 0) { + pts_count[coors_pos] = min(reduce_count[coors_idx], max_points); + auto coors_offset = coors + coors_pos * NDim; + auto coors_in_offset = coors_in + coors_idx * NDim; + for (int k = 0; k < NDim; k++) { + coors_offset[k] = coors_in_offset[k]; + } + } + } + } + } +} + #endif // VOXELIZATION_CUDA_KERNEL_CUH diff --git a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..58e695a0153e59ca9d0c66040962c2e12d6226b6 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu @@ -0,0 +1,322 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include + +#include "common_mlu_helper.hpp" + +#define COORD_NUM 4 + +__nram__ char nmem_buf[MAX_NRAM_SIZE]; + +template +__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1, + void *nram_addition, const int32_t deal_num) { + __bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num); + __bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num); +} + +template <> +__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, + void *nram_src1, void *nram_addition, + const int32_t deal_num) { + __bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num); + __bang_active_reciphp((float *)nram_addition, (float *)nram_addition, + deal_num); + __bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num); + __bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num); +} + +template +__mlu_func__ void bboxOverlapsWorkflow( + T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1, + T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right, + T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious, + const int32_t offset, const int32_t mode, const int32_t batches_stride, + const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) { + int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim; + int32_t batch_start = taskId * task_batch_stride; + int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1 + ? task_batch_stride + : num_bbox1 - batch_start; + batch_per_task = batch_per_task > 0 ? batch_per_task : (0); + + if (aligned) { + int32_t num_loop_cpy = batch_per_task / batches_stride; + int32_t num_rem_cpy_batches = batch_per_task % batches_stride; + num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; + for (int32_t i = 0; i < num_loop_cpy; i++) { + int32_t index = batch_start + i * batches_stride; + int32_t handle_batches = index + batches_stride > num_bbox1 + ? 
num_rem_cpy_batches + : batches_stride; + int32_t b1 = index; + int32_t b2 = index; + + int32_t base1 = b1 * COORD_NUM; + __memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + + int32_t base2 = b2 * COORD_NUM; + __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + // get the width and height + __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); + __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); + __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); + __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); + + // right - left + offset ---> left + __bang_sub(vec_left, vec_right, vec_left, batches_stride); + __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + + // bottom - top + offset ---> right + __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); + __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + + // zero vector ---> bottom + __nramset(vec_bottom, batches_stride, 0.f); + + // width --> vec_left + __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); + T *width = vec_left; + // height --> vec_right + __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); + T *height = vec_right; + + // get the b1_area + // (b1_x2 - b1_x1 + offset) ---> vec_top + __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); + __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + + // (b1_y2 - b1_y1 + offset) ---> vec_bottom + __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); + __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + + // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) + // ---> vec_top; + __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); + T *b1_area = vec_top; + + // get the b2_area + // (b2_x2 - b2_x1 + offset) ---> b2_x1 + __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); + __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + + // (b2_y2 - b2_y1 + offset) ---> b2_y1 + __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); + __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + + // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) + // ---> b2_x1; + __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); + T *b2_area = vec_b2_x1; + + // inter_s = width * height + __bang_mul(height, width, height, batches_stride); + T *inter_s = height; + + // offset vector ---> vec_b2_y1 + __nramset(vec_b2_y1, batches_stride, T(offset)); + T *vec_offset = vec_b2_y1; + + if (mode == 0) { + __bang_add(b1_area, b1_area, b2_area, batches_stride); + __bang_sub(b1_area, b1_area, inter_s, batches_stride); + __bang_maxequal(b1_area, 
vec_offset, b1_area, batches_stride); + } else { + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } + T *base_s = b1_area; + + // ious = inter_s / base_s + computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); + __memcpy((T *)ious + index, width, handle_batches * sizeof(T), + NRAM2GDRAM); + } + } else { + int32_t num_loop_cpy = num_bbox2 / batches_stride; + int32_t num_rem_cpy_batches = num_bbox2 % batches_stride; + num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy; + for (int32_t i = 0; i < batch_per_task; i++) { + int32_t index1 = batch_start + i; + int32_t b1 = index1; + int32_t base1 = b1 * COORD_NUM; + + // set bbox1 and bbox2 to nram + __nramset(vec_b1_x1, batches_stride, bbox1[base1]); + __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]); + __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]); + __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]); + + for (int32_t j = 0; j < num_loop_cpy; j++) { + int32_t index2 = j * batches_stride; + int32_t handle_batches = index2 + batches_stride > num_bbox2 + ? num_rem_cpy_batches + : batches_stride; + int32_t b2 = index2; + int32_t base2 = b2 * COORD_NUM; + + // copy bbox2 to nram + __memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + __memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T), + COORD_NUM * sizeof(T), handle_batches - 1); + + // get the width and height + __bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride); + __bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride); + __bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride); + __bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride); + + // right - left + offset ---> left + __bang_sub(vec_left, vec_right, vec_left, batches_stride); + __bang_add_const(vec_left, vec_left, (T)offset, batches_stride); + // bottom - top + offset ---> right + __bang_sub(vec_right, vec_bottom, vec_top, batches_stride); + __bang_add_const(vec_right, vec_right, (T)offset, batches_stride); + + // zero vector ---> bottom + __nramset(vec_bottom, batches_stride, (T)0); + + // width --> vec_left + __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride); + T *width = vec_left; + // height --> vec_right + __bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride); + T *height = vec_right; + + // get the b1_area + // (b1_x2 - b1_x1 + offset) ---> vec_top + __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride); + __bang_add_const(vec_top, vec_top, (T)offset, batches_stride); + // (b1_y2 - b1_y1 + offset) ---> vec_bottom + __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride); + __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride); + // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset) + // ---> vec_top; + __bang_mul(vec_top, vec_top, vec_bottom, batches_stride); + T *b1_area = vec_top; + + // get the b2_area + // (b2_x2 - b2_x1 + offset) ---> b2_x1 + __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride); + __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride); + // (b2_y2 - b2_y1 + offset) ---> b2_y1 + __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride); + __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride); + // 
b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset) + // ---> b2_x1; + __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride); + T *b2_area = vec_b2_x1; + + // inter_s = width * height + __bang_mul(height, width, height, batches_stride); + T *inter_s = height; + + // offset vector ---> vec_b2_y1 + __nramset(vec_b2_y1, batches_stride, T(offset)); + T *vec_offset = vec_b2_y1; + + if (mode == 0) { + __bang_add(b1_area, b1_area, b2_area, batches_stride); + __bang_sub(b1_area, b1_area, inter_s, batches_stride); + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } else { + __bang_maxequal(b1_area, vec_offset, b1_area, batches_stride); + } + T *base_s = b1_area; + + // ious = inter_s / base_s + computeDiv(width, inter_s, base_s, vec_b2_x2, batches_stride); + int32_t gdram_offset = index1 * num_bbox2 + index2; + __memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T), + NRAM2GDRAM); + } + } + } +} + +template +__mlu_global__ void MLUUnion1KernelBBoxOverlaps( + const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1, + const int32_t num_bbox2, const int32_t mode, const bool aligned, + const int32_t offset) { + /* + * NRAM partition + * |-------------------------------------------------------------| + * | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 | + * |-------------------------------------------------------------| + * | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 | + * |-------------------------------------------------------------| + * | vec_left | vec_right | vec_top | vec_bottom | + * |-------------------------------------------------------------| + * + */ + const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE); + const int32_t split_nram_num = 12; + const int32_t nram_stride = + align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE; + + void *vec_b1_x1 = nmem_buf; + void *vec_b1_y1 = nmem_buf + nram_stride; + void *vec_b1_x2 = nmem_buf + 2 * nram_stride; + void *vec_b1_y2 = nmem_buf + 3 * nram_stride; + + void *vec_b2_x1 = nmem_buf + 4 * nram_stride; + void *vec_b2_y1 = nmem_buf + 5 * nram_stride; + void *vec_b2_x2 = nmem_buf + 6 * nram_stride; + void *vec_b2_y2 = nmem_buf + 7 * nram_stride; + + void *vec_left = nmem_buf + 8 * nram_stride; + void *vec_right = nmem_buf + 9 * nram_stride; + void *vec_top = nmem_buf + 10 * nram_stride; + void *vec_bottom = nmem_buf + 11 * nram_stride; + + const int32_t vec_length = nram_stride / sizeof(T); + bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2, + (T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1, + (T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left, + (T *)vec_right, (T *)vec_top, (T *)vec_bottom, + (T *)bbox1, (T *)bbox2, (T *)ious, offset, mode, + vec_length, num_bbox1, num_bbox2, aligned); +} + +void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *bbox1, const void *bbox2, void *ious, + const int32_t num_bbox1, const int32_t num_bbox2, + const int32_t mode, const bool aligned, + const int32_t offset) { + if (d_type == CNRT_FLOAT16) { + MLUUnion1KernelBBoxOverlaps<<>>( + bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); + } else { + MLUUnion1KernelBBoxOverlaps<<>>( + bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset); + } +} diff --git a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..669a9d78e0c48b6761e05ca933cb4689bbcbc272 
--- /dev/null +++ b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp @@ -0,0 +1,190 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef COMMON_MLU_HELPER_HPP_ +#define COMMON_MLU_HELPER_HPP_ + +#define NFU_ALIGN_SIZE 128 // Byte +#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc + +#ifdef __BANG_ARCH__ +#define MAX_NRAM_SIZE \ + (__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#define MAX_SRAM_SIZE \ + (__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc +#else +#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value +#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value +#endif + +#ifndef PAD_UP +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) +#endif + +#ifndef PAD_DOWN +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) +#endif + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +/*! + * @brief Converts int32 to float32 data type. + * + * @param[out] dst + * Pointer to NRAM that stores int32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores float32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
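+ * @note On MLU architectures below 300 there is no dedicated int-to-float
+ *       instruction, so the conversion below is emulated with bitwise
+ *       operations: the low 23 bits of the magnitude are packed into the
+ *       mantissa of 2^23 (0x4b000000) and 8388608.0 is subtracted afterwards,
+ *       e.g. (5 | 0x4b000000) reads as 8388613.0f, and 8388613.0f - 8388608.0f
+ *       gives 5.0f; the sign bit is extracted first and restored at the end.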
+ */ +__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_int2float((float *)dst, (int32_t *)src, src_count, 0); +#else + // get sign bit + const float move_23bit = 8388608.0; + // 0x80000000 = 1,000000000,0000000000000000000000000000 + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + // get 1 or 0 from sign bit + // judg is Odd + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_bor((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * sizeof(float), + NFU_ALIGN_SIZE); + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000001); + __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // minus xor, positive num invariant + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float)); + // convert int32 to float32 + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff); + __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x4b000000); + __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition, + src_count * sizeof(float), NFU_ALIGN_SIZE); + __bang_sub_const(dst, dst, move_23bit, src_count); + // add one + __bang_add(dst, dst, dst_addition, src_count); + // set sign for float32 + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xffffffff); + __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x00000001); + __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0x80000000); + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * 4, 128); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4); +#endif // __BANG_ARCH__ >= 300 +} + +/*! + * @brief Converts float32 to int32 data type with to_zero round mode. + * + * @param[out] dst + * Pointer to NRAM that stores float32 type data. + * @param[in,out] dst_addition + * Pointer to NRAM as the workspace of dst, which has the same size as dst. + * It allows empty pointer on MLU300 series. + * @param[in] src + * Pointer to NRAM that stores int32 type data. + * @param[in,out] src_addition + * Pointer to NRAM as the workspace of src, which has a size of 128 Bytes. + * It allows empty pointer on MLU300 series. + * @param[in] src_count + * The count of elements in src. 
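+ * @note On MLU architectures below 300 this conversion is likewise emulated:
+ *       the absolute value is added to 8388608.0 with truncation so that the
+ *       integer part lands in the low 23 mantissa bits (e.g. fabs(5.75f) +
+ *       8388608.0f truncates to 8388613.0f, whose low 23 bits encode 5),
+ *       those bits are masked out with 0x007fffff, and negative inputs are
+ *       converted back to two's complement form at the end.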
+ */ +__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src, + float *src_addition, const int src_count) { +#if __BANG_ARCH__ >= 300 + __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0); +#else + // sign ===> src_addition + // dst=-1.0 : when src[i] is a negative number + // dst=+1.0 : when src[i] is a positive number + const int floatDchar = sizeof(float) / sizeof(char); + __bang_active_sign((float *)dst, src, src_count); + // dst_addition = abs(src) + __bang_mul(dst_addition, src, (float *)dst, src_count); + // if dst_addition < 1.0 , then src_addition + 1, to fix add error. + __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f); + __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count); + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + 0xbf800000); + // set negative flag -1.0 = 0xbf80000 + __bang_cycle_eq( + (float *)dst, (float *)dst, (float *)src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // to mark all src in [x<-1.0] + __bang_active_abs(dst_addition, src, src_count); + __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f); + // mask shift move 23 + __bang_cycle_add_tz( + dst_addition, dst_addition, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); // right shift move 23bit + // two`s complement for negatibe + // dst=1.0 , when src <-1.0 + // dst=0.0 , when src >=-1.0 + __bang_sub(dst_addition, dst_addition, (float *)dst, src_count); + // to fix max value + // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0, + // means max value. + __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count); + __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst, + src_count * floatDchar); + // get low 23bit + __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), + (unsigned)0x007fffff); + // mask low 23bit is 1 + __bang_cycle_band((char *)dst_addition, (char *)dst_addition, + (char *)src_addition, src_count * floatDchar, + NFU_ALIGN_SIZE / sizeof(char)); + // set 9 high bit ===> dst + // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000 + // 1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000 + __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000); + __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count, + NFU_ALIGN_SIZE / sizeof(float)); + // src or dst_addition + __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition, + src_count * floatDchar); + __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count); + __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, + src_count * floatDchar); +#endif // __BANG_ARCH__ >= 300 +} + +#endif // COMMON_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7624379b68d6df41aae0253df26b9add61c7a76e --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu @@ -0,0 +1,888 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include + +#include "common_mlu_helper.hpp" + +#define PING 0 +#define PONG 1 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size, + const int32_t dst_stride = 0, + const int32_t src_stride = 0, + const int32_t count = 1) { + if (dst_stride == src_stride) { + __memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM); + } else { + __memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride, + src_stride, count - 1); + } +} + +template +__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t, + const int32_t c, const int32_t has_weight, + const int32_t partition_nc) { + if (has_weight && partition_nc && t >= 0 && t < c) { + __memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM); + } +} + +template +__mlu_func__ void storeOutput(T *dram_output, char *nram_output, + const int32_t size, const int32_t dst_stride = 0, + const int32_t src_stride = 0, + const int32_t count = 1) { + if (dst_stride == src_stride) { + __memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM); + } else { + __memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride, + src_stride, count - 1); + } +} + +template +__mlu_func__ void compute(T *input, const int32_t *target, const T *weight, + const int32_t has_weight, const int32_t partition_nc, + const int32_t deal_num, const int32_t n_seg, + const int32_t c, const int32_t c_seg, + const int32_t c_start_index, const float alpha, + const float gamma, T *compute_a, T *compute_b, + T *output) { + // set params + const int32_t c_num = + has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg; + const int32_t c_end_index = c_start_index + c_seg; + const int32_t half_epsilon = 0x0400; + const T epsilon_f = + sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon); + + // 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i + // = (1 - alpha) * p ^ gamma if t != c_i + __nramset((T *)output, deal_num, (T)(1 - alpha)); + __bang_active_sigmoid((T *)compute_b, (T *)input, deal_num); + for (int32_t i = 0; i < n_seg; ++i) { + const int32_t t = *((uint32_t *)target + i); + if (t >= c_start_index && t < c_end_index) { + const uint32_t index = i * c_num + t - c_start_index; + *((T *)input + index) = -1.0 * (*((T *)input + index)); + *((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f; + *((T *)output + index) = alpha; + } + } + if (sizeof(T) == sizeof(half)) { + __bang_half2float((float *)compute_a, (half *)compute_b, deal_num); + __bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num); + __bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma, + deal_num); + __bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num); + __bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num); + } else { + __bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num); + __bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num); + __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); + } + __bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num); + + // 1. 
max = max(0, -x) if t == c_i + // = max(0, x) if t != c_i + __nramset((T *)compute_b, deal_num, (T)0); + __bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num); + + // 2. -log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i + // = ln(e^(-max)+ e^(-max+x) + max if t != c_i + __bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num); + __bang_add((T *)input, (T *)compute_a, (T *)input, deal_num); + + __bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num); + __bang_active_exphp((T *)input, (T *)input, deal_num); + __bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num); + __bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num); + __bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num); + + // 3. output = alpha_t * p_t^r * [-log(p_t)] + __bang_mul((T *)output, (T *)output, (T *)input, deal_num); + + // 4. with weight + if (has_weight) { + for (int32_t i = 0; i < n_seg; ++i) { + int32_t t = *((int32_t *)target + i); + if (t >= 0 && t < c) { + t = partition_nc ? 0 : t; + __bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num, + *((T *)weight + t), c_num); + } + } + } +} + +template +__mlu_func__ void startPipeline( + const T *input, const int32_t *target, const T *weight, + char *nram_compute_a, char *nram_compute_b, char *nram_input, + char *nram_target, char *nram_weight, char *nram_output, + const int32_t has_weight, const int32_t partition_nc, + const int32_t pingpong_offset, const int32_t pingpong_weight_offset, + const int32_t c_offset_num, const int32_t n, const int32_t n_seg, + const int32_t c, const int32_t c_seg, const float alpha, const float gamma, + T *output) { + // with offset + input = (T *)((char *)input + c_offset_num * sizeof(T)); + output = (T *)((char *)output + c_offset_num * sizeof(T)); + + const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t c_num = has_weight ? c_seg_align_num : c_seg; + const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t load_size = c_seg * sizeof(T); + const int32_t dram_stride = c * sizeof(T); + const int32_t nram_stride = c_num * sizeof(T); + + if (has_weight && !partition_nc) { + loadInput(nram_weight, (T *)weight, load_size, nram_stride, dram_stride, + 1); + __asm__ volatile("sync;\n\t"); + } + const int32_t repeat = n / n_seg; + const int32_t remain = n % n_seg; + + /* + * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. + * The allocated memory space of NRAM is divided into two parts: + * PING and Pong. In a single time slice, PING is used to process + * IO stream and PONG is used for computation. Both of them are + * processed synchronously until finished. 
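+ *
+ * Ignoring the tail handling for `remain`, the control flow below realizes
+ * the following schedule (slice j always lives in the (j % 2) half of the
+ * double buffer):
+ *
+ *   load(0);                                 sync;
+ *   compute(0); load(1);                     sync;
+ *   for (i = 0; i < repeat - 2; ++i) {
+ *     store(i); load(i + 2); compute(i + 1); sync;
+ *   }
+ *   store(repeat - 2); compute(repeat - 1);  sync;
+ *   store(repeat - 1);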
+ * + * diagram of PINGPONG: + * |------|-----------------------------------------------------------------| + * | | space | + * |------|-----------------------------------------------------------------| + * | time | Ping | Pong | Ping | Pong | Ping | Pong | + * |------|-----------------------------------------------------------------| + * | 0 | L0 | | | | | | + * | 1 | C0 | L1 | | | | | + * | 2 | S0 | C1 | L2 | | | | + * | 3 | | S1 | C2 | L3 | | | + * | 4 | | | S2 | C3 | L4 | | + * | 5 | | | | S3 | C4 | L5 | + * | 6 | | | | | S4 | C5 | + * | 7 | | | | | | S5 | + * |------|-----------------------------------------------------------------| + */ + + // diagram of PINGPONG: L0 + if (repeat > 0) { + loadInput(nram_input, (T *)input, load_size, nram_stride, dram_stride, + n_seg); + loadInput(nram_target, (int32_t *)target, n_seg * sizeof(int32_t)); + loadWeight(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight, + partition_nc); + __asm__ volatile("sync;\n\t"); + } + + // diagram of PINGPONG: C0 and L1 + if (repeat > 1) { + compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight, + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)nram_output); + loadInput((char *)nram_input + pingpong_offset, (T *)input + c * n_seg, + load_size, nram_stride, dram_stride, n_seg); + loadInput((char *)nram_target + pingpong_offset, + (int32_t *)target + n_seg, n_seg * sizeof(int32_t)); + loadWeight((char *)nram_weight + pingpong_weight_offset, (T *)weight, + *((int32_t *)target + n_seg), c, has_weight, partition_nc); + __asm__ volatile("sync;\n\t"); + } + + for (int32_t i = 0; i < repeat - 2; ++i) { + storeOutput((T *)output + i * c * n_seg, + nram_output + (i % 2) * pingpong_offset, load_size, + dram_stride, nram_stride, n_seg); + loadInput((char *)nram_input + (i % 2) * pingpong_offset, + (T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride, + dram_stride, n_seg); + loadInput((char *)nram_target + (i % 2) * pingpong_offset, + (int32_t *)target + (i + 2) * n_seg, + n_seg * sizeof(int32_t)); + loadWeight((char *)nram_weight + (i % 2) * pingpong_weight_offset, + (T *)weight, *((int32_t *)target + (i + 2) * n_seg), c, + has_weight, partition_nc); + compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset), + (int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * ((i + 1) % 2) * pingpong_weight_offset), + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + ((i + 1) % 2) * pingpong_offset)); + __asm__ volatile("sync;\n\t"); + } + + if (repeat > 1) { + storeOutput((T *)output + (repeat - 2) * c * n_seg, + (char *)nram_output + (repeat % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, n_seg); + } + + if (remain > 0) { + loadInput((char *)nram_input + (repeat % 2) * pingpong_offset, + (T *)input + repeat * c * n_seg, load_size, nram_stride, + dram_stride, remain); + loadInput((char *)nram_target + (repeat % 2) * pingpong_offset, + (int32_t *)target + repeat * n_seg, + remain * sizeof(int32_t)); + loadWeight((char *)nram_weight + (repeat % 2) * pingpong_weight_offset, + (T *)weight, *((int32_t *)target + repeat * n_seg), c, + has_weight, partition_nc); + } + + if (repeat > 0) { + compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset), + (int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * ((repeat - 1) % 
2) * pingpong_weight_offset), + has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset)); + } + __asm__ volatile("sync;\n\t"); + + if (repeat > 0) { + storeOutput((T *)output + (repeat - 1) * c * n_seg, + (char *)nram_output + ((repeat - 1) % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, n_seg); + } + + if (remain > 0) { + int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T)); + compute((T *)(nram_input + (repeat % 2) * pingpong_offset), + (int32_t *)(nram_target + (repeat % 2) * pingpong_offset), + (T *)(nram_weight + + partition_nc * (repeat % 2) * pingpong_weight_offset), + has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num, + alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b, + (T *)(nram_output + (repeat % 2) * pingpong_offset)); + __asm__ volatile("sync;\n\t"); + + storeOutput((T *)output + repeat * c * n_seg, + (char *)nram_output + (repeat % 2) * pingpong_offset, + load_size, dram_stride, nram_stride, remain); + } + __asm__ volatile("sync;\n\t"); +} + +template +__mlu_func__ void focalLossSigmoidForwardBlock( + const T *input, const int32_t *target, const T *weight, const int32_t n, + const int32_t c, const float alpha, const float gamma, T *output) { + /* + * NRAM partition + * |-----------------------------------------------------------------------| + * | weight | + * |------------------------------- COMPUTE -------------------------------| + * | | | + * | computeA | computeB | + * | | | + * |------------- PING ------------------------------- PONG ---------------| + * | | | + * | input | input | + * | | | + * |-----------------------------------|-----------------------------------| + * | | | + * | output | output | + * | | | + * |-----------------------------------|-----------------------------------| + * | target | target | + * |-----------------------------------|-----------------------------------| + * + * split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output), + * PONG(input,output). + * split_target_num is 2: PING(target), PONG(target). + * weight is not NULL: + * The nram-size of weight is equal to c_align_size when partition input-N. + * The nram-size of weight is equal to NFU_ALIGN_SIZE when partition + * input-NC. 
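+ * threshold_c computed below is the largest channel count whose pipeline
+ * buffers still fit in NRAM: when c <= threshold_c the kernel only
+ * partitions the input over N, otherwise it additionally splits the channel
+ * dimension (the partition input-NC branch).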
+ */ + + // calculate threshold of c + const int32_t split_pipeline_num = 6; + const int32_t split_target_num = 2; + const int32_t has_weight = weight != NULL; + const int32_t threshold_c = + PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) / + (split_pipeline_num + has_weight), + NFU_ALIGN_SIZE) / + sizeof(T); + const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); + const int32_t c_align_size = c_align * sizeof(T); + + if (c <= threshold_c) { + // partition inputN + int32_t c_num = c; + int32_t reservered_align_size = + (split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE; + int32_t weight_size = 0; + if (has_weight) { + c_num = c_align; + reservered_align_size = split_target_num * NFU_ALIGN_SIZE; + weight_size = c_align_size; + } + + const int32_t remain_size = + MAX_NRAM_SIZE - weight_size - reservered_align_size; + const int32_t n_seg = + remain_size / (split_pipeline_num * c_num * sizeof(T) + + split_target_num * sizeof(int32_t)); + const int32_t split_pipeline_size = + PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE); + const int32_t compute_size = 2 * split_pipeline_size; + const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; + + char *nram_weight = (char *)nram_buffer; + char *nram_compute_a = nram_weight + has_weight * c_align_size; + char *nram_compute_b = nram_compute_a + split_pipeline_size; + char *nram_input = nram_compute_b + split_pipeline_size; + char *nram_output = nram_input + split_pipeline_size; + char *nram_target = nram_output + split_pipeline_size; + + startPipeline(input, target, weight, nram_compute_a, nram_compute_b, + nram_input, nram_target, nram_weight, nram_output, + has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c, + alpha, gamma, output); + } else { + // partition inputNC + const int32_t weight_size = has_weight * NFU_ALIGN_SIZE; + const int32_t remain_size = MAX_NRAM_SIZE - weight_size; + const int32_t split_pipeline_size = PAD_DOWN( + (remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num, + NFU_ALIGN_SIZE); + const int32_t c_seg = split_pipeline_size / sizeof(T); + const int32_t n_seg = 1; + const int32_t compute_size = 2 * split_pipeline_size; + const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2; + const int32_t pingpong_weight_offset = weight_size / 2; + + char *nram_weight = (char *)nram_buffer; + char *nram_compute_a = nram_weight + weight_size; + char *nram_compute_b = nram_compute_a + split_pipeline_size; + char *nram_input = nram_compute_b + split_pipeline_size; + char *nram_output = nram_input + split_pipeline_size; + char *nram_target = nram_output + split_pipeline_size; + + const int32_t loop_num = (c + c_seg - 1) / c_seg; + const int32_t partition_nc = 1; + for (int32_t i = 0; i < loop_num; ++i) { + const int32_t c_index = i * c_seg; + const int32_t c_seg_curr = i == (loop_num - 1) ? 
c - c_index : c_seg; + startPipeline(input, target, weight, nram_compute_a, nram_compute_b, + nram_input, nram_target, nram_weight, nram_output, + has_weight, partition_nc, pingpong_offset, + pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr, + alpha, gamma, output); + } + } +} + +template +__mlu_global__ void MLUUnion1KernelFocalLossSigmoidForward( + const void *input, const void *target, const void *weight, const int32_t N, + const int32_t C, const float alpha, const float gamma, void *output) { + const int32_t n_seg = N / taskDim + (taskId == taskDim - 1) * (N % taskDim); + const T *input_offset = (T *)input + N / taskDim * taskId * C; + const int32_t *target_offset = (int32_t *)target + N / taskDim * taskId; + T *output_offset = (T *)output + N / taskDim * taskId * C; + + focalLossSigmoidForwardBlock((T *)input_offset, (int32_t *)target_offset, + (T *)weight, n_seg, C, alpha, gamma, + (T *)output_offset); +} +} // namespace forward + +namespace backward { +template +__mlu_func__ void loadInput(char *nram_input, char *nram_target, + const T *gdram_input, const int32_t *gdram_target, + const int32_t deal_n, const int32_t total_c, + const bool pingping_flag, const bool has_weight, + const int32_t nram_offset, + const int32_t gdram_offset) { + if (pingping_flag == PONG) { + nram_input += nram_offset; + nram_target += nram_offset; + } + + __memcpy_async(nram_target, gdram_target + gdram_offset / total_c, + deal_n * sizeof(int32_t), GDRAM2NRAM); + + char *nram_input_load = nram_input; + int32_t compute_align_size = 2 * NFU_ALIGN_SIZE; + if (has_weight) { + if (sizeof(T) == sizeof(half)) { + int32_t compute_align_num = compute_align_size / sizeof(float); + int32_t align_c = PAD_UP(total_c, compute_align_num); + int32_t compute_size = deal_n * align_c * sizeof(float); + nram_input_load += compute_size / 2; + } + int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); + int32_t total_c_size = total_c * sizeof(T); + int32_t align_c_size = align_c * sizeof(T); + __memcpy_async(nram_input_load, gdram_input + gdram_offset, total_c_size, + GDRAM2NRAM, align_c_size, total_c_size, deal_n - 1); + } else { + if (sizeof(T) == sizeof(half)) { + int32_t compute_size = + PAD_UP(deal_n * total_c * sizeof(float), compute_align_size); + nram_input_load += compute_size / 2; + } + int32_t load_size = deal_n * total_c * sizeof(T); + __memcpy_async(nram_input_load, gdram_input + gdram_offset, load_size, + GDRAM2NRAM); + } +} + +template +__mlu_func__ void sigmoid(T *dst_data, const T *src_data, + const int32_t elem_count) { + __bang_mul_const(dst_data, (T *)src_data, T(-1), elem_count); + __bang_active_exphp(dst_data, dst_data, elem_count); + __bang_add_const(dst_data, dst_data, T(1), elem_count); + __bang_active_reciphp(dst_data, dst_data, elem_count); +} + +template +__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight, + const float *nram_flt_min, char *nram_pt, + char *nram_alpha_t, char *nram_temp, + char *nram_target, const float *nram_gamma, + char *nram_output, const float alpha, + const int32_t compute_num, const int32_t deal_n, + const int32_t total_c, const bool pingpong_flag, + const int32_t nram_offset, + const bool has_weight) { + if (pingpong_flag == PONG) { + nram_input += nram_offset; + nram_pt += nram_offset; + nram_alpha_t += nram_offset; + nram_temp += nram_offset; + nram_output += nram_offset; + nram_target += nram_offset; + } + + if (sizeof(T) == sizeof(half)) { + const int32_t compute_size = compute_num * sizeof(float); + char *nram_input_load = nram_input + 
compute_size / 2; + __bang_half2float((float *)nram_input, (half *)nram_input_load, + compute_num); + } + + // 0. alpha_t = alpha - 1 + __nramset((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0)); + + // 1. pt = 1 - sigmoid(x) + sigmoid((float *)nram_pt, (float *)nram_input, compute_num); + __bang_mul_const((float *)nram_pt, (float *)nram_pt, (float)(-1), + compute_num); + __bang_add_const((float *)nram_pt, (float *)nram_pt, (float)1, compute_num); + + // 2. pt = target[n] == c ? sigmoid(x) : 1 - sigmoid(x) + // alpha_t = target[n] == c ? alpha : alpha - 1 + const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(float); + for (int n = 0; n < deal_n; n++) { + const int32_t target_value = ((int32_t *)nram_target)[n]; + if (target_value >= total_c || target_value < 0) continue; + int32_t c_offset = 0; + if (has_weight) { + int32_t c_align_num = nfu_align_num; + if (sizeof(T) == sizeof(half)) { + c_align_num += nfu_align_num; + } + c_offset = PAD_UP(total_c, c_align_num); + } else { + c_offset = total_c; + } + int32_t idx = n * c_offset + target_value; + *((float *)nram_pt + idx) = 1.0 - *((float *)nram_pt + idx); + *((float *)nram_alpha_t + idx) = alpha; + } + + // 3. temp = -alpha_t * e^(gamma * log(max(1 - pt, FLT_MIN)) + __bang_mul_const((float *)nram_temp, (float *)nram_pt, (float)(-1), + compute_num); + __bang_add_const((float *)nram_temp, (float *)nram_temp, (float)(1), + compute_num); + __bang_cycle_maxequal((float *)nram_temp, (float *)nram_temp, + (float *)nram_flt_min, compute_num, nfu_align_num); + __bang_active_loghp((float *)nram_temp, (float *)nram_temp, compute_num); + __bang_cycle_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_gamma, + compute_num, nfu_align_num); + __bang_active_exphp((float *)nram_temp, (float *)nram_temp, compute_num); + __bang_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_alpha_t, + compute_num); + __bang_mul_const((float *)nram_temp, (float *)nram_temp, (float)(-1), + compute_num); + + // 4. output = 1 - pt - gamma * pt * log(max(pt, FLT_MIN)) + __bang_cycle_maxequal((float *)nram_output, (float *)nram_pt, + (float *)nram_flt_min, compute_num, nfu_align_num); + __bang_active_loghp((float *)nram_output, (float *)nram_output, compute_num); + __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_pt, + compute_num); + __bang_cycle_mul((float *)nram_output, (float *)nram_output, + (float *)nram_gamma, compute_num, nfu_align_num); + __bang_add((float *)nram_output, (float *)nram_output, (float *)nram_pt, + compute_num); + __bang_mul_const((float *)nram_output, (float *)nram_output, (float)(-1), + compute_num); + __bang_add_const((float *)nram_output, (float *)nram_output, (float)(1), + compute_num); + + // 5. 
output = output * temp + __bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_temp, + compute_num); + + if (sizeof(T) == sizeof(half)) { + __bang_float2half_rd((half *)nram_output, (float *)nram_output, + compute_num); + } + + if (has_weight) { + // with weight + for (int n = 0; n < deal_n; n++) { + int32_t c_align_num = nfu_align_num; + if (sizeof(T) == sizeof(half)) { + c_align_num += nfu_align_num; + } + int32_t align_c = PAD_UP(total_c, c_align_num); + int32_t target_value = ((int32_t *)nram_target)[n]; + T weight_value = nram_weight[target_value]; + __bang_mul_const((T *)nram_output + n * align_c, + (T *)nram_output + n * align_c, weight_value, align_c); + } + } +} + +template +__mlu_func__ void storeOutput(T *gdram_output, const char *nram_output, + const int32_t deal_n, const int32_t total_c, + const bool pingpong_flag, const bool has_weight, + const int32_t nram_offset, + const int32_t gdram_offset) { + if (pingpong_flag == PONG) { + nram_output += nram_offset; + } + const int32_t store_size = deal_n * total_c * sizeof(T); + if (has_weight) { + int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T)); + int32_t total_c_size = total_c * sizeof(T); + int32_t align_c_size = align_c * sizeof(T); + __memcpy_async(gdram_output + gdram_offset, nram_output, total_c_size, + NRAM2GDRAM, total_c_size, align_c_size, deal_n - 1); + } else { + __memcpy_async(gdram_output + gdram_offset, nram_output, store_size, + NRAM2GDRAM); + } +} + +template +__mlu_func__ void focalLossSigmoidBackwardBlock( + const T *input, const int32_t *target, const T *weight, const float gamma, + const float alpha, const int32_t total_n, const int32_t deal_n, + const int32_t total_c, T *output) { + // params per time slice + int32_t deal_num = deal_n * total_c; + int32_t deal_size = deal_num * sizeof(float); + int32_t compute_num = 0; + int32_t compute_size = 0; + int32_t compute_align_size = NFU_ALIGN_SIZE; + const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(T); + if (sizeof(T) == sizeof(half)) { + compute_align_size += NFU_ALIGN_SIZE; + } + const int32_t compute_align_num = compute_align_size / sizeof(float); + bool has_weight = false; + if (weight != NULL) { + has_weight = true; + int32_t align_c = PAD_UP(total_c, compute_align_num); + compute_num = deal_n * align_c; + compute_size = compute_num * sizeof(float); + } else { + compute_size = PAD_UP(deal_size, compute_align_size); + compute_num = compute_size / sizeof(float); + } + + // params per core + int32_t total_num = total_n * total_c; + int32_t num_per_core = PAD_DOWN(total_num / taskDim, deal_num); + int32_t loop_per_core = num_per_core / deal_num; + + /* NRAM partition: + * + * |-----------------ping pong--------------------| + * |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight| + * + * split_pipeline_num is 5: input, pt, alpha_t, temp, output. + * nram_reserved_line_num is 2: flt_min, gamma. 
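+ * When weight is not NULL, an extra region of PAD_UP(total_c * sizeof(T),
+ * NFU_ALIGN_SIZE) bytes is reserved at the end of NRAM for the weight vector,
+ * in addition to the two reserved lines above.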
+ */ + const int32_t split_pipeline_num = 5; + const int32_t nram_reserved_line_num = 2; + int32_t target_deal_size = deal_n * sizeof(int32_t); + int32_t target_deal_size_align = PAD_UP(target_deal_size, NFU_ALIGN_SIZE); + // nram PING/PONG offset + int32_t ping_pong_offset = + compute_size * split_pipeline_num + target_deal_size_align; + + // gdram addr + int32_t *base_addr_target = + (int32_t *)target + taskId * loop_per_core * deal_n; + T *base_addr_input = (T *)input + taskId * num_per_core; + T *base_addr_output = output + taskId * num_per_core; + + // nram addr + char *nram_input = (char *)nram_buffer; + char *nram_pt = nram_input + compute_size; + char *nram_alpha_t = nram_pt + compute_size; + char *nram_temp = nram_alpha_t + compute_size; + char *nram_output = nram_temp + compute_size; + char *nram_target = nram_output + compute_size; + float *nram_flt_min = NULL; + float *nram_gamma = NULL; + T *nram_weight = NULL; + + if (!has_weight) { + nram_flt_min = (float *)(nram_buffer + MAX_NRAM_SIZE - + nram_reserved_line_num * NFU_ALIGN_SIZE); + nram_gamma = nram_flt_min + nfu_align_num; + } else { + int32_t weight_space = PAD_UP(total_c * sizeof(T), NFU_ALIGN_SIZE); + nram_flt_min = + (float *)(nram_buffer + MAX_NRAM_SIZE - + nram_reserved_line_num * NFU_ALIGN_SIZE - weight_space); + nram_gamma = nram_flt_min + nfu_align_num; + nram_weight = (T *)(nram_gamma + nfu_align_num); + __memcpy_async(nram_weight, weight, total_c * sizeof(T), GDRAM2NRAM); + } + + // nram set gamma and FLT_MIN + __nramset(nram_gamma, nfu_align_num, gamma); + __nramset(nram_flt_min, nfu_align_num, FLT_MIN); + + /* + * Pipeline: The pipeline is processed in three stages: Load, Compute, Store. + * The allocated memory space of NRAM is divided into two parts: + * PING and Pong. In a single time slice, PING is used to process + * IO stream and PONG is used for computation. Both of them are + * processed synchronously until finished. 
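+ * In the code below one iteration handles one block of deal_n rows: L, C and
+ * S in the diagram correspond to loadInput, coreCompute and storeOutput, and
+ * PONG simply addresses the same NRAM buffers shifted by ping_pong_offset.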
+ * + * diagram of PINGPONG: + * |------|-----------------------------------------------------------------| + * | | space | + * |------|-----------------------------------------------------------------| + * | time | Ping | Pong | Ping | Pong | Ping | Pong | + * |------|-----------------------------------------------------------------| + * | 0 | L0 | | | | | | + * | 1 | C0 | L1 | | | | | + * | 2 | S0 | C1 | L2 | | | | + * | 3 | | S1 | C2 | L3 | | | + * | 4 | | | S2 | C3 | L4 | | + * | 5 | | | | S3 | C4 | L5 | + * | 6 | | | | | S4 | C5 | + * | 7 | | | | | | S5 | + * |------|-----------------------------------------------------------------| + */ + + // diagram of PINGPONG: L0 + if (loop_per_core > 0) { + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PING, has_weight, ping_pong_offset, 0); + __asm__ volatile("sync;"); + } + + // diagram of PINGPONG: C0 and L1 + if (loop_per_core > 1) { + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PONG, has_weight, ping_pong_offset, deal_num); + __asm__ volatile("sync;"); + } + + for (int i = 0; i < loop_per_core - 2; ++i) { + if (i % 2 == PING) { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, + has_weight, ping_pong_offset, i * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PONG, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PING, has_weight, ping_pong_offset, + (i + 2) * deal_num); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, i * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + deal_n, total_c, PONG, has_weight, ping_pong_offset, + (i + 2) * deal_num); + } + __asm__ volatile("sync;"); + } + + if (loop_per_core > 1) { + if ((loop_per_core - 2) % 2 == PING) { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PING, + has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PONG, ping_pong_offset, + has_weight); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + } + __asm__ volatile("sync;"); + } + + if (loop_per_core > 0) { + if (loop_per_core == 1) { + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + compute_num, deal_n, total_c, PING, ping_pong_offset, + has_weight); + __asm__ volatile("sync;"); + } + if ((loop_per_core - 1) % 2 == PING) { + storeOutput(base_addr_output, nram_output, 
deal_n, total_c, PING, + has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); + } else { + storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG, + has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num); + } + } + + // process the remaining data which N remainder per core is less than deal_n + int32_t rem_for_all = total_num - num_per_core * taskDim; + if (rem_for_all == 0) return; + int32_t rem_n_for_all = rem_for_all / total_c; + int32_t rem_n_per_core = (rem_n_for_all + taskDim - 1) / taskDim; + int32_t rem_num_per_core = rem_n_per_core * total_c; + int32_t rem_num_per_core_align = 0; + int32_t rem_core_num = rem_for_all / rem_num_per_core; + + int32_t rem_n_for_last = rem_n_for_all % rem_n_per_core; + int32_t rem_num_for_last = rem_n_for_last * total_c; + int32_t rem_num_for_last_align = 0; + + if (has_weight) { + int32_t align_c = PAD_UP(total_c, compute_align_num); + rem_num_per_core_align = rem_n_per_core * align_c; + rem_num_for_last_align = rem_n_for_last * align_c; + } else { + rem_num_per_core_align = PAD_UP(rem_num_per_core, compute_align_num); + rem_num_for_last_align = PAD_UP(rem_num_for_last, compute_align_num); + } + + int32_t rem_addr_base = num_per_core * taskDim; + int32_t rem_target_addr_base = loop_per_core * deal_n * taskDim; + base_addr_target = (int32_t *)target + rem_target_addr_base; + base_addr_input = (T *)input + rem_addr_base; + base_addr_output = output + rem_addr_base; + + if (taskId < rem_core_num) { + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + rem_n_per_core, total_c, PING, has_weight, ping_pong_offset, + taskId * rem_num_per_core); + __asm__ volatile("sync;"); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + rem_num_per_core_align, rem_n_per_core, total_c, PING, + ping_pong_offset, has_weight); + __asm__ volatile("sync;"); + storeOutput(base_addr_output, nram_output, rem_n_per_core, total_c, PING, + has_weight, ping_pong_offset, taskId * rem_num_per_core); + } else if (taskId == rem_core_num) { + if (rem_num_for_last == 0) return; + loadInput(nram_input, nram_target, base_addr_input, base_addr_target, + rem_n_for_last, total_c, PING, has_weight, ping_pong_offset, + taskId * rem_num_per_core); + __asm__ volatile("sync;"); + coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t, + nram_temp, nram_target, nram_gamma, nram_output, alpha, + rem_num_for_last_align, rem_n_for_last, total_c, PING, + ping_pong_offset, has_weight); + __asm__ volatile("sync;"); + storeOutput(base_addr_output, nram_output, rem_n_for_last, total_c, PING, + has_weight, ping_pong_offset, taskId * rem_num_per_core); + } else { + return; + } +} + +template +__mlu_global__ void MLUUnion1KernelFocalLossSigmoidBackward( + const void *input, const void *target, const void *weight, + const float gamma, const float alpha, const int32_t total_n, + const int32_t deal_n, const int32_t total_c, void *output) { + focalLossSigmoidBackwardBlock((T *)input, (int32_t *)target, (T *)weight, + gamma, alpha, total_n, deal_n, total_c, + (T *)output); +} +} // namespace backward + +void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const int32_t N, + const int32_t C, const float alpha, + const float gamma, void *output) { + if (d_type == CNRT_FLOAT16) { + forward::MLUUnion1KernelFocalLossSigmoidForward< + 
half><<>>(input, target, weight, N, C, alpha, + gamma, output); + } else { + forward::MLUUnion1KernelFocalLossSigmoidForward< + float><<>>(input, target, weight, N, C, alpha, + gamma, output); + } +} + +void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, + const cnrtDataType_t d_type, + const void *input, const void *target, + const void *weight, const float gamma, + const float alpha, const int32_t dim_n, + const int32_t deal_n, const int32_t dim_c, + void *output) { + if (d_type == CNRT_FLOAT16) { + backward::MLUUnion1KernelFocalLossSigmoidBackward< + half><<>>(input, target, weight, gamma, alpha, + dim_n, deal_n, dim_c, output); + } else { + backward::MLUUnion1KernelFocalLossSigmoidBackward< + float><<>>(input, target, weight, gamma, alpha, + dim_n, deal_n, dim_c, output); + } +} diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7cb16bb100355d49f3d1ad004a5e82998f258994 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu @@ -0,0 +1,1161 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define NMS_SIZE (64) +#define COORD_DIM (4) +#define MEMORY_CORE (0x80) +#define INFO_NUM (5) // 5 means x1, x2, y1, y2 and score +#define REDUCE_NUM \ + (7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input) + +#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024) +#define SIZE_SRAM_BUF (MAX_SRAM_SIZE) + +__nram__ int8_t nram_buffer[SIZE_NRAM_BUF]; +__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF]; + +__mlu_func__ void pvLock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_lock(0, 0); + } +#endif +} + +__mlu_func__ void pvUnlock() { +#if __BANG_ARCH__ == 270 + if (coreId != MEMORY_CORE) { + __bang_unlock(0, 0); + } +#endif +} + +enum Addr { SRAM, GDRAM }; + +template +__mlu_func__ void nms_detection( + uint32_t *output_box_num, const int output_mode, const int input_layout, + OUT_DT *output_data, const Addr dst, IN_DT *input_data_score, + const IN_DT *input_data_box, const Addr src, IN_DT *buffer, + const int buffer_size, IN_DT *sram, const int core_limit, + const int input_box_num, const int input_stride, const int output_stride, + const int keepNum, const float thresh_iou, const float thresh_score, + const float offset, const int algo) { + // global value, it is stored in sram with a offset from the begin. + const int flag_offset_size = 28; + int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size); + loop_end_flag[0] = 0; + // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 + const int nms_buffer_count1 = 9; + // temp nram buffer to store selected target. 
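+ // nram_save_limit_count below is its capacity, i.e. how many selected
+ // results can be staged in NRAM at a time.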
+ const int nram_save_limit_count = 256; + float div_thresh_iou = 1.0 / thresh_iou; + + // input data ptr + IN_DT *input_score_ptr; + const IN_DT *input_x1_ptr; + const IN_DT *input_y1_ptr; + const IN_DT *input_x2_ptr; + const IN_DT *input_y2_ptr; + input_score_ptr = input_data_score; + input_x1_ptr = input_data_box; + if (input_layout == 0) { + // [boxes_num, 4] + input_y1_ptr = input_x1_ptr + 1; + input_x2_ptr = input_x1_ptr + 2; + input_y2_ptr = input_x1_ptr + 3; + } else if (input_layout == 1) { + // [4, boxes_num] + input_y1_ptr = input_x1_ptr + input_stride; + input_x2_ptr = input_y1_ptr + input_stride; + input_y2_ptr = input_x2_ptr + input_stride; + } + + // nram data ptr + IN_DT *x1; + IN_DT *y1; + IN_DT *x2; + IN_DT *y2; + IN_DT *score; + IN_DT *inter_x1; + IN_DT *inter_y1; + IN_DT *inter_x2; + IN_DT *inter_y2; + IN_DT *max_box; // the max score, x1, y1, x2, y2 + IN_DT *x1_mask; + IN_DT *y1_mask; + IN_DT *x2_mask; + IN_DT *y2_mask; + OUT_DT *nram_save; + + int limit = 0; // find limit when GDRAM or SRAM + int len_core = 0; // the length deal by every core + int max_seg_pad = 0; // the max length every repeat + int repeat = 0; + int remain = 0; + int remain_pad = 0; + int input_offset = 0; // offset of input_data for current core + int nram_save_count = 0; + // mask for collect x1, y1, x2, y2. each mask has 128 elements + const int mask_size = 128; + const int total_mask_size = 512; + + if (output_mode == 0) { + limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT) - + total_mask_size * sizeof(IN_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } else { + limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) - + total_mask_size * sizeof(IN_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } + + if (core_limit == 1) { + len_core = input_box_num; + input_offset = 0; + } else { + int avg_core = input_box_num / core_limit; + int rem = input_box_num % core_limit; + len_core = avg_core + (taskId < rem ? 1 : 0); + input_offset = avg_core * taskId + (taskId <= rem ? 
taskId : rem); + } + max_seg_pad = PAD_DOWN(limit, NMS_SIZE); + repeat = len_core / max_seg_pad; + remain = len_core % max_seg_pad; + remain_pad = PAD_UP(remain, NMS_SIZE); + + // if datatype is half, we should convert it to float when compute the IoU + int max_seg_iou_compute = + PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); + int repeat_iou_compute = len_core / max_seg_iou_compute; + int remain_iou_compute = len_core % max_seg_iou_compute; + int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + // initial the address point + score = buffer; + x1 = score + max_seg_pad; + y1 = x1 + max_seg_pad; + x2 = y1 + max_seg_pad; + y2 = x2 + max_seg_pad; + inter_x1 = y2 + max_seg_pad; + inter_y1 = inter_x1 + max_seg_pad; + inter_x2 = inter_y1 + max_seg_pad; + inter_y2 = inter_x2 + max_seg_pad; + x1_mask = inter_y2 + max_seg_pad; + y1_mask = x1_mask + mask_size; + x2_mask = y1_mask + mask_size; + y2_mask = x2_mask + mask_size; + max_box = y2_mask + mask_size; // the max score, x1, y1, x2, y2 + // offset two line from max_box + nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE); + + // set mask for __bang_collect instruction + if (input_layout == 0) { + __nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0); + for (int idx = 0; idx < mask_size; idx++) { + int index = (idx % COORD_DIM) * mask_size + idx; + x1_mask[index] = (IN_DT)1.0; + } + } + + for (int keep = 0; keep < keepNum; keep++) { // loop until the max_score <= 0 + if (core_limit != 1) { + __sync_cluster(); // sync before current loop + } + + /******find max start******/ + int max_index = 0; // the max score index + int global_max_index = 0; // for U1 + float max_area = 0; // the max score area + max_box[0] = 0; // init 0 + + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad; + // check seg_len exceeds the limit of fp16 or not. 65536 is the largest + // num that half data type could express. + if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { + // seg length exceeds the max num for fp16 datatype! + return; + } + i == repeat ? 
cpy_len = remain : cpy_len = max_seg_pad; + /******nms load start******/ + mluMemcpyDirection_t load_dir = SRAM2NRAM; + if (src == SRAM) { + load_dir = SRAM2NRAM; + } else { + load_dir = GDRAM2NRAM; + } + __nramset(score, seg_len, (IN_DT)0); + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + + /******nms load end******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + + if (sizeof(IN_DT) == sizeof(half)) { + max_index = ((uint16_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = ((uint32_t *)inter_x1)[1] + input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + + int stride = 1; + if (input_layout == 0) { + stride = input_stride; + } else if (input_layout == 1) { + stride = 1; + } + + if (core_limit == 1) { + max_box[1] = input_x1_ptr[max_index * stride]; + max_box[2] = input_y1_ptr[max_index * stride]; + max_box[3] = input_x2_ptr[max_index * stride]; + max_box[4] = input_y2_ptr[max_index * stride]; + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + input_score_ptr[max_index] = 0; + global_max_index = max_index; + ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; + } else if (core_limit == 4) { + // find the max with sram + // the max box's x1, y1, x2, y2 on every core + if (coreId != MEMORY_CORE) { + max_box[1] = input_x1_ptr[max_index * stride]; + max_box[2] = input_y1_ptr[max_index * stride]; + max_box[3] = input_x2_ptr[max_index * stride]; + max_box[4] = input_y2_ptr[max_index * stride]; + } + ((uint32_t *)(max_box + INFO_NUM))[0] = max_index; + // copy every core's box info to sram, form: score---x1---y1---x2---y2--- + for (int i = 0; i < INFO_NUM; i++) { + __memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT), + NRAM2SRAM); + } + // copy every core's max_index to sram, use 2 half to store max_index + __memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM, + sizeof(uint32_t), + NRAM2SRAM); // int32_t datatype + __sync_cluster(); + + // copy score from sram to nram and find the max + __nramset(inter_x1, NMS_SIZE, (IN_DT)0); + __memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_core = 0; + if (sizeof(IN_DT) == sizeof(half)) { + max_core = ((uint16_t *)max_box)[1]; + } else if (sizeof(IN_DT) == sizeof(float)) { + max_core = ((uint32_t *)max_box)[1]; + } + + // copy the max box from SRAM to NRAM + __memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // x1 + __memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // y1 + __memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // x2 + __memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT), + SRAM2NRAM); // y2 + __memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core, + sizeof(uint32_t), SRAM2NRAM); + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + 
offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0]; + input_score_ptr[global_max_index] = 0; + } + // by now, we get: max_score|max_index|max_box|max_area + /******find max end******/ + + /******nms store start******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + __memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM), + 1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t), + 1 * sizeof(uint32_t), 0); + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, + INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), + NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), + 4); + } + } + nram_save_count++; + (*output_box_num)++; + } + + // store to sram/gdram + if (*output_box_num != 0) { + mluMemcpyDirection_t store_dir = NRAM2GDRAM; + if (dst == SRAM) { + store_dir = NRAM2SRAM; + } else { // dst == GDRAM + store_dir = NRAM2GDRAM; + } + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == keepNum - 1) { + if (nram_save_count != 0) { + if (coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_data, nram_save, + nram_save_count * sizeof(uint32_t), store_dir); + pvUnlock(); + output_data += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_data, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir); + pvUnlock(); + output_data += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT), + store_dir, output_stride * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_data += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst + + // if the max score <= 0, end + if (core_limit == 1) { + if (float(max_box[0]) <= thresh_score) { + break; + } + } else { + if (float(max_box[0]) <= thresh_score) { + if (coreId == 0) { + loop_end_flag[0] = 1; + } + } + __sync_cluster(); + if (loop_end_flag[0] == 1) { + break; + } + } + /******nms store end******/ + + // To solve half data accuracy, we convert half to float to calculate IoU. + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = 0; // the length every nms compute + int cpy_len = 0; // the length every nms memcpy + i == repeat_iou_compute ? seg_len = remain_pad_iou_compute + : seg_len = max_seg_iou_compute; + i == repeat_iou_compute ? 
cpy_len = remain_iou_compute + : cpy_len = max_seg_iou_compute; + + /******nms load start******/ + mluMemcpyDirection_t load_dir = SRAM2NRAM; + if (src == SRAM) { + load_dir = SRAM2NRAM; + } else { + load_dir = GDRAM2NRAM; + } + + __nramset((float *)score, seg_len, 0.0f); + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __nramset(x1, seg_len, half(0)); + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } + + if (input_layout == 0) { + // the following number 4 means x1, y1, x2, y2 + __memcpy( + inter_x1, + input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM, + cpy_len * COORD_DIM * sizeof(IN_DT), load_dir, + cpy_len * COORD_DIM * sizeof(IN_DT), + cpy_len * COORD_DIM * sizeof(IN_DT), 0); + // here use collect instruction to transpose the [n, 4] shape into [4, + // n] shape to avoid + // discrete memory accessing. + for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) { + // the following number 32 means 32 elements will be selected out by + // once operation + __bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + x1_mask, mask_size); + __bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + y1_mask, mask_size); + __bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + x2_mask, mask_size); + __bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size, + y2_mask, mask_size); + } + } else if (input_layout == 1) { + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(y1 + dt_offset, + input_y1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(x2 + dt_offset, + input_x2_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + __memcpy(y2 + dt_offset, + input_y2_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + } + /******nms load end******/ + + /******nms compute start******/ + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, + seg_len); + } + // 1、 compute IOU + // get the area_I + __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, 
seg_len); + } + __bang_active_relu((float *)inter_x1, (float *)inter_x1, + seg_len); // inter_w + __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 + __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + __bang_active_relu((float *)inter_y1, (float *)inter_y1, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thres, set the score to zero, abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); + /******nms compute end******/ + + // update the score + mluMemcpyDirection_t update_dir = NRAM2SRAM; + if (dst == SRAM) { + update_dir = NRAM2SRAM; + } else { + update_dir = NRAM2GDRAM; + } + if (sizeof(IN_DT) == sizeof(half)) { + __bang_float2half_rd((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT), + cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } // for repeat + } // for keepNum +} + +__mlu_global__ void MLUUnion1KernelNMS( + const void *input_boxes, const void *input_confidence, + const int input_num_boxes, const int input_stride, + const int max_output_size, const float iou_threshold, + const float confidence_threshold, const int mode, const int input_layout, + void *workspace, void *result_num, void *output, + const cnrtDataType_t data_type_input, const float offset, const int algo) { + if (data_type_input == CNRT_FLOAT16) { + __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half), + GDRAM2GDRAM); + } else if (data_type_input == CNRT_FLOAT32) { + __memcpy(workspace, input_confidence, input_num_boxes * sizeof(float), + GDRAM2GDRAM); + } else { + } + + int output_stride = max_output_size; + uint32_t result_box_num = 0; + if (mode == 0) { + uint32_t *out_data = (uint32_t *)output; + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *boxes_data = (half *)input_boxes; + half *confi_data = (half *)workspace; + half *buffer = (half *)nram_buffer; + half *sram 
= (half *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + case CNRT_FLOAT32: { + float *boxes_data = (float *)input_boxes; + float *confi_data = (float *)workspace; + float *buffer = (float *)nram_buffer; + float *sram = (float *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + } + } else { + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *boxes_data = (half *)input_boxes; + half *confi_data = (half *)workspace; + half *out_data = (half *)output; + half *buffer = (half *)nram_buffer; + half *sram = (half *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + case CNRT_FLOAT32: { + float *boxes_data = (float *)input_boxes; + float *confi_data = (float *)workspace; + float *out_data = (float *)output; + float *buffer = (float *)nram_buffer; + float *sram = (float *)sram_buffer; + + nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM, + confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF, + sram, taskDim, input_num_boxes, input_stride, + output_stride, max_output_size, iou_threshold, + confidence_threshold, offset, algo); + ((uint32_t *)result_num)[0] = result_box_num; + }; break; + } + } +} + +template +__mlu_func__ void nms_detection_ux( + int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram, + IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, + const int input_layout, const int input_num_boxes, const int input_stride, + const int max_output_size, const float thresh_iou, const float thresh_score, + const float offset, const int output_mode, const int algo) { + loop_end_flag[0] = 0; + IN_DT *sram = (IN_DT *)sram_buffer; + + // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2 + int nms_buffer_count1 = 9; + // temp nram buffer to store selected target. 
+ int nram_save_limit_count = 256; + float div_thresh_iou = 1.0 / thresh_iou; + + // input data ptr + IN_DT *input_score_ptr; + const IN_DT *input_x1_ptr; + const IN_DT *input_y1_ptr; + const IN_DT *input_x2_ptr; + const IN_DT *input_y2_ptr; + input_score_ptr = score_data; + input_x1_ptr = boxes_data; + input_y1_ptr = input_x1_ptr + input_stride; + input_x2_ptr = input_y1_ptr + input_stride; + input_y2_ptr = input_x2_ptr + input_stride; + + int limit = 0; // find limit when GDRAM or SRAM + int max_seg_pad = 0; // the max length every repeat + int repeat = 0; + int remain = 0; + int remain_pad = 0; + int nram_save_count = 0; + + if (output_mode == 0) { + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * sizeof(OUT_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } else { + limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) - + nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) / + (nms_buffer_count1 * sizeof(IN_DT)); + } + + // data split + int avg_cluster = input_num_boxes / clusterDim; + int rem_cluster = input_num_boxes % clusterDim; + int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0); + int cluster_offset = avg_cluster * clusterId + + (clusterId <= rem_cluster ? clusterId : rem_cluster); + + int avg_core = len_cluster / coreDim; + int rem_core = len_cluster % coreDim; + int len_core = avg_core + (coreId < rem_core ? 1 : 0); + int core_offset = + avg_core * coreId + (coreId <= rem_core ? coreId : rem_core); + int input_offset = cluster_offset + core_offset; + + max_seg_pad = PAD_DOWN(limit, NMS_SIZE); + + // core 0 of each cluster calculate the max score index + int max_index_avg_core = input_num_boxes / clusterDim; + int max_index_rem_core = input_num_boxes % clusterDim; + int max_index_len_core = + max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0); + int max_index_input_offset = + max_index_avg_core * clusterId + + (clusterId <= max_index_rem_core ? clusterId : max_index_rem_core); + repeat = max_index_len_core / max_seg_pad; + remain = max_index_len_core % max_seg_pad; + remain_pad = PAD_UP(remain, NMS_SIZE); + + // if datatype is fp16, we should cvt to fp32 when compute iou + int max_seg_iou_compute = + PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE); + int repeat_iou_compute = len_core / max_seg_iou_compute; + int remain_iou_compute = len_core % max_seg_iou_compute; + int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE); + + // init the nram ptr + IN_DT *score = (IN_DT *)nram_buffer; + IN_DT *x1 = score + max_seg_pad; + IN_DT *y1 = x1 + max_seg_pad; + IN_DT *x2 = y1 + max_seg_pad; + IN_DT *y2 = x2 + max_seg_pad; + IN_DT *inter_x1 = y2 + max_seg_pad; + IN_DT *inter_y1 = inter_x1 + max_seg_pad; + IN_DT *inter_x2 = inter_y1 + max_seg_pad; + IN_DT *inter_y2 = inter_x2 + max_seg_pad; + IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2 + OUT_DT *nram_save = + (OUT_DT *)((char *)max_box + + NFU_ALIGN_SIZE); // offset two line from max_box + + mluMemcpyDirection_t input_load_dir = SRAM2NRAM; + mluMemcpyDirection_t input_store_dir = NRAM2SRAM; + input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM; + input_store_dir = (input_ram == SRAM) ? 
NRAM2SRAM : NRAM2GDRAM; + + for (int keep = 0; keep < max_output_size; + keep++) { // loop until the max_score <= 0 + __sync_all(); + + /******FIND MAX START******/ + int max_index = 0; + int global_max_index = 0; // for Ux + float max_area = 0; // the max socre area + max_box[0] = 0; // init 0 + + if (coreId == 0) { + for (int i = 0; i <= repeat; i++) { + if (i == repeat && remain == 0) { + break; + } + + int seg_len = (i == repeat) + ? remain_pad + : max_seg_pad; // the length every nms compute + // check seg_len exceeds the limit of fp16 or not. 65536 is the largest + // num + // that fp16 could express. + if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) { + return; + } + int cpy_len = (i == repeat) + ? remain + : max_seg_pad; // the length every nms memcpy + + /******NMS LOAD START******/ + __bang_write_zero(score, seg_len); + __memcpy(score, + input_score_ptr + max_index_input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + + /******NMS LOAD END******/ + + __bang_max(inter_x1, score, seg_len); + if (inter_x1[0] > max_box[0]) { + max_box[0] = inter_x1[0]; + if (sizeof(IN_DT) == sizeof(half)) { + max_index = + ((uint16_t *)inter_x1)[1] + max_index_input_offset + + i * max_seg_pad; // offset start from head of input_data + } else if (sizeof(IN_DT) == sizeof(float)) { + max_index = + ((uint32_t *)inter_x1)[1] + max_index_input_offset + + i * max_seg_pad; // offset start from head of input_data + } + } + } // for repeat + + // the max box's x1, y1, x2, y2 on every cluster + max_box[1] = input_x1_ptr[max_index]; + max_box[2] = input_y1_ptr[max_index]; + max_box[3] = input_x2_ptr[max_index]; + max_box[4] = input_y2_ptr[max_index]; + ((uint32_t *)(max_box + 5))[0] = max_index; + // copy max box info to sram + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_all(); + // copy all partial max to the sram of cluster 0 + if (clusterId != 0) { + __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT), + SRAM2SRAM, 0); + } + __sync_all(); + + // reduce between clusters to get the global max box + if (clusterId == 0) { + if (coreId == 0) { + __bang_write_zero(inter_x1, NMS_SIZE); + __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_cluster = (sizeof(IN_DT) == sizeof(half)) + ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + __memcpy(max_box, sram + max_cluster * REDUCE_NUM, + REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); + } + __sync_cluster(); + if (coreId == 0x80 && clusterDim > 1) { + // broadcast global max box to each cluster's sram + for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) { + __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM, + cluster_idx); + } + } + __sync_cluster(); + } + __sync_all(); + + // copy the global max box to max_box + __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM); + if (algo == 0 || offset == 0.0) { + max_area = ((float)max_box[3] - (float)max_box[1]) * + ((float)max_box[4] - (float)max_box[2]); + } else { + max_area = ((float)max_box[3] - (float)max_box[1] + offset) * + ((float)max_box[4] - (float)max_box[2] + offset); + } + global_max_index = ((uint32_t *)(max_box + 5))[0]; + if (coreId != 0x80) { + input_score_ptr[global_max_index] = 0; + } + // by now, we get: max_score|max_index|max_box|max_area + /******FIND MAX END******/ + + /******NMS STORE START******/ + // store to nram + if (float(max_box[0]) > thresh_score) { + OUT_DT *save_ptr; + int save_offset = 0; + int save_str_num = 0; + save_ptr = nram_save; + save_offset = nram_save_count; + save_str_num = nram_save_limit_count; + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0]; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + __memcpy(save_ptr + save_offset * INFO_NUM, max_box, + INFO_NUM * sizeof(IN_DT), NRAM2NRAM, + INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0); + } else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2--- + __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), + NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), + 4); + } + } + nram_save_count++; + output_box_num++; + } + + // store to sram/gdram + if (output_box_num != 0) { + if ((nram_save_count == nram_save_limit_count) || + (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) { + if (nram_save_count != 0) { + if (clusterId == 0 && coreId == 0) { + if (output_mode == 0) { // index1, index2, ... + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * sizeof(uint32_t), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count; + } else if (output_mode == 1) { // score, x1, y1, x2, y2 + pvLock(); + __memcpy(output_dram, nram_save, + nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM); + pvUnlock(); + output_dram += nram_save_count * INFO_NUM; + } else if (output_mode == + 2) { // score---, x1---, y1---, x2---, y2--- + pvLock(); + __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT), + NRAM2GDRAM, max_output_size * sizeof(IN_DT), + nram_save_limit_count * sizeof(IN_DT), 4); + pvUnlock(); + output_dram += nram_save_count; + } + nram_save_count = 0; + } + } + } // if move data nram->sram/gdram + } // if dst + + if (float(max_box[0]) <= thresh_score) { + if (clusterId == 0 && coreId == 0) { + loop_end_flag[0] = 1; // dram + } + } + __sync_all(); + if (loop_end_flag[0] == 1) { + break; + } + /******NMS STORE END******/ + + // To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU. + for (int i = 0; i <= repeat_iou_compute; i++) { + if (i == repeat_iou_compute && remain_iou_compute == 0) { + break; + } + int seg_len = (i == repeat_iou_compute) ? 
remain_pad_iou_compute + : max_seg_iou_compute; + int cpy_len = + (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute; + + /******NMS LOAD START******/ + __nramset((float *)score, seg_len, 0.0f); + int dt_offset = 0; + if (sizeof(IN_DT) == sizeof(float)) { + __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + dt_offset = 0; + } else if (sizeof(IN_DT) == sizeof(half)) { + __nramset(x1, seg_len, half(0)); + __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), input_load_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + __bang_half2float((float *)score, (half *)x1, seg_len); + dt_offset = max_seg_iou_compute; + } + + __memcpy(x1 + dt_offset, + input_x1_ptr + input_offset + i * max_seg_iou_compute, + cpy_len * sizeof(IN_DT), input_load_dir, + max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3); + /******NMS LOAD END******/ + + /******NMS COMPUTE START******/ + if (sizeof(IN_DT) == sizeof(half)) { + __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, + seg_len); + __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, + seg_len); + } + // 1、 compute IOU + // get the area_I + __nramset((float *)inter_y1, seg_len, float(max_box[1])); // max_x1 + __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1, + seg_len); // inter_x1 + __nramset((float *)inter_y2, seg_len, float(max_box[3])); // max_x2 + __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2, + seg_len); // inter_x2 + __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len); + } + __bang_active_relu((float *)inter_x1, (float *)inter_x1, + seg_len); // inter_w + __nramset((float *)inter_x2, seg_len, float(max_box[2])); // max_y1 + __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2, + seg_len); // inter_y1 + __nramset((float *)inter_x2, seg_len, float(max_box[4])); // max_y2 + __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2, + seg_len); // inter_y2 + __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1, + seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + } + __bang_active_relu((float *)inter_y1, (float *)inter_y1, + seg_len); // inter_h + __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1, + seg_len); // area_I + // get the area of input_box: area = (x2 - x1) * (y2 - y1); + __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len); + __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len); + if (algo == 1 && offset != 0.0) { + __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len); + __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len); + } + __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2, + seg_len); // area + // get the area_U: area + max_area - area_I + __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area), + seg_len); + __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1, + seg_len); // area_U + // 2、 select the box + // if IOU greater than thres, set the score to zero, 
abort it: area_U > + // area_I * (1 / thresh)? + if (thresh_iou > 0.0) { + __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou, + seg_len); + } else { + __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou, + seg_len); + } + __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, + seg_len); + __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len); + /******NMS COMPUTE END******/ + + if (sizeof(IN_DT) == 2) { + __bang_float2half_rd((half *)score, (float *)score, seg_len); + } + pvLock(); + __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score, + cpy_len * sizeof(IN_DT), input_store_dir, + cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0); + pvUnlock(); + } // for repeat + } // for max_output_size +} + +__mlu_global__ void MLUUionXKernelNMS( + const void *input_boxes, const void *input_confidence, + const int input_num_boxes, const int input_layout, const int input_stride, + const int max_output_size, const float iou_threshold, + const float confidence_threshold, const float offset, + const cnrtDataType_t data_type_input, const int output_mode, const int algo, + void *workspace, void *result_num, void *output) { + int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; + int32_t *loop_end_flag = + (int32_t *)((char *)workspace + + INFO_NUM * input_num_boxes * input_dwidth); + int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; + int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; + + int cluster_score_size = input_num_boxes * input_dwidth; + int cluster_boxes_size = input_num_boxes * 4 * input_dwidth; + char *sram_score = (char *)sram_buffer + reduce_sram_size; + char *sram_boxes = + (char *)sram_buffer + reduce_sram_size + cluster_score_size; + Addr input_ram = GDRAM; + if ((cluster_score_size + cluster_boxes_size) < availbale_sram_size) { + input_ram = SRAM; + __memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM); + __memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM); + } else { + __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM); + } + __sync_cluster(); + uint32_t output_box_num = 0; + if (output_mode == 0) { + uint32_t *output_dram = (uint32_t *)output; + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *score_data; + half *boxes_data; + score_data = + (input_ram == SRAM) ? (half *)sram_score : (half *)workspace; + boxes_data = + (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + case CNRT_FLOAT32: { + float *score_data; + float *boxes_data; + score_data = + (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = + (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + } + } else { + switch (data_type_input) { + default: { return; } + case CNRT_FLOAT16: { + half *output_dram = (half *)output; + half *score_data; + half *boxes_data; + score_data = + (input_ram == SRAM) ? 
(half *)sram_score : (half *)workspace; + boxes_data = + (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + case CNRT_FLOAT32: { + float *output_dram = (float *)output; + float *score_data; + float *boxes_data; + score_data = + (input_ram == SRAM) ? (float *)sram_score : (float *)workspace; + boxes_data = + (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes; + nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data, + boxes_data, input_ram, input_layout, input_num_boxes, + input_stride, max_output_size, iou_threshold, + confidence_threshold, offset, output_mode, algo); + ((uint32_t *)result_num)[0] = output_box_num; + }; break; + } + } +} + +void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t data_type_input, const void *boxes_ptr, + const void *scores_ptr, const int input_num_boxes, + const int input_stride, const int max_output_boxes, + const float iou_threshold, const float offset, + void *workspace_ptr, void *output_size_ptr, void *output_ptr) { + switch (k_type) { + default: { return; } + case CNRT_FUNC_TYPE_BLOCK: + case CNRT_FUNC_TYPE_UNION1: { + MLUUnion1KernelNMS<<>>( + boxes_ptr, scores_ptr, input_num_boxes, input_stride, + max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, + /*output_mode=*/0, + /*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr, + data_type_input, offset, /*algo=*/1); + }; break; + case CNRT_FUNC_TYPE_UNION2: + case CNRT_FUNC_TYPE_UNION4: + case CNRT_FUNC_TYPE_UNION8: + case CNRT_FUNC_TYPE_UNION16: { + MLUUionXKernelNMS<<>>( + boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1, + input_stride, max_output_boxes, iou_threshold, + /*confidence_threshold=*/0.0, offset, data_type_input, + /*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr, + output_ptr); + }; break; + } +} diff --git a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..13b4af19f669aa0b63758e899a06395b39e455aa --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu @@ -0,0 +1,615 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" +#include "psamask_utils.hpp" + +#define COMPUTE_COUNT_ALIGN 64 + +__nram__ char buf[MAX_NRAM_SIZE]; + +template +__mlu_func__ void swap(T &a, T &b) { + T tmp = a; + a = b; + b = tmp; +} + +template +__mlu_func__ void storeDataFromNramToDram(T *dst, const T *src, + const PositionInCore &position, + const Shape &shape_full) { + int n_offset = shape_full.h * shape_full.w * shape_full.c; + int h_offset = shape_full.w * shape_full.c; + int w_offset = shape_full.c; + int n_seg = position.n_end - position.n_start; + int h_seg = position.h_end - position.h_start; + int w_seg = position.w_end - position.w_start; + int size = h_seg * w_seg * shape_full.c; + + __memcpy(dst + position.n_start * n_offset + position.h_start * h_offset + + position.w_start * w_offset, + src, size * sizeof(T), NRAM2GDRAM, n_offset * sizeof(T), + size * sizeof(T), n_seg - 1); +} + +template +__mlu_func__ void loadDataFromDramToNram(T *dst, const T *src, + const PositionInCore &position, + const Shape &shape_full) { + int n_offset = shape_full.h * shape_full.w * shape_full.c; + int h_offset = shape_full.w * shape_full.c; + int w_offset = shape_full.c; + int n_seg = position.n_end - position.n_start; + int h_seg = position.h_end - position.h_start; + int w_seg = position.w_end - position.w_start; + int size = h_seg * w_seg * shape_full.c; + + __memcpy(dst, + src + position.n_start * n_offset + position.h_start * h_offset + + position.w_start * w_offset, + size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T), + n_seg - 1); +} + +// transpose the data from A*B*C*(D*E) to A*D*E*(B*C) +template +__mlu_func__ void transposeData(T *dst, T *src, const Shape &shape_seg) { + int align_c = CEIL_ALIGN(shape_seg.c, COMPUTE_COUNT_ALIGN / sizeof(T)); + int align_hw = + CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); + for (int i = 0; i < shape_seg.n; ++i) { + __bang_transpose(dst, src, align_hw, align_c); + dst += align_hw * align_c; + src += align_hw * align_c; + } +} + +template +__mlu_func__ void psamaskCollectForward( + const T *x_dram, T *y_dram, const PositionInCore &position, + const Shape &x_full, const Shape &y_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + T *x_nram = (T *)buf; + T *y_nram = + x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c, + COMPUTE_COUNT_ALIGN / sizeof(T)); + loadDataFromDramToNram(x_nram, x_dram, position, x_full); + + // fill zeros to output + int elem_count = + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c, + NFU_ALIGN_SIZE / sizeof(T)); + __nramset(y_nram, elem_count, (T)0); + + int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c; + int y_h_offset = shape_seg.w * shape_seg.c; + int y_w_offset = shape_seg.c; + int x_n_offset = shape_seg.h * shape_seg.w * x_full.c; + int y_c_offset = 1; + int x_h_offset = shape_seg.w * x_full.c; + int x_w_offset = x_full.c; + int x_c_offset = 1; + int x_start = 0; + int y_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int y_offset = y_start; + int x_offset = x_start; + y_offset += hidx * y_h_offset + widx * y_w_offset; + x_offset += hidx * x_h_offset + widx * x_w_offset; + + const int hstart = half_h_mask - h_abs > 0 
? half_h_mask - h_abs : 0; + const int hend = x_full.h + half_h_mask - h_abs < h_mask + ? x_full.h + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = x_full.w + half_w_mask - w_abs < w_mask + ? x_full.w + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed + y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart + + w_abs - half_w_mask) * + y_c_offset; + x_offset += (hstart * w_mask + wstart) * x_c_offset; + int count = wend - wstart; + __memcpy(y_nram + y_offset, x_nram + x_offset, count * sizeof(T), + NRAM2NRAM, y_c_offset * x_full.w * sizeof(T), + x_c_offset * w_mask * sizeof(T), hend - hstart - 1); + } + } + y_start += y_n_offset; + x_start += x_n_offset; + } + storeDataFromNramToDram(y_dram, y_nram, position, y_full); +} + +template +__mlu_func__ void psamaskDistributeForward( + const T *x_dram, T *y_dram, const PositionInCore &position, + const Shape &x_full, const Shape &y_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + T *x_nram = (T *)buf; + T *y_nram_temp = + x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c, + COMPUTE_COUNT_ALIGN / sizeof(T)); + loadDataFromDramToNram(x_nram, x_dram, position, x_full); + + // fill zeros to output + int align_c = CEIL_ALIGN(y_full.c, COMPUTE_COUNT_ALIGN / sizeof(T)); + int align_hw = + CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); + int elem_count = + CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T)); + __nramset(y_nram_temp, elem_count, (T)0); + + int y_n_offset = align_hw * align_c; + int y_h_offset = shape_seg.w * align_c; + int y_w_offset = align_c; + int y_c_offset = 1; + int x_n_offset = shape_seg.h * shape_seg.w * x_full.c; + int x_h_offset = shape_seg.w * x_full.c; + int x_w_offset = x_full.c; + int x_c_offset = 1; + int h_feature = y_full.h; + int w_feature = y_full.w; + + int y_start = 0; + int x_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int y_offset = y_start; + int x_offset = x_start; + y_offset += hidx * y_h_offset + widx * y_w_offset; + x_offset += hidx * x_h_offset + widx * x_w_offset; + const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; + const int hend = h_feature + half_h_mask - h_abs < h_mask + ? h_feature + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = w_feature + half_w_mask - w_abs < w_mask + ? 
w_feature + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed + y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart + + w_abs - half_w_mask) * + y_c_offset; + x_offset += (hstart * w_mask + wstart) * x_c_offset; + int count = wend - wstart; + __memcpy(y_nram_temp + y_offset, x_nram + x_offset, count * sizeof(T), + NRAM2NRAM, y_c_offset * w_feature * sizeof(T), + x_c_offset * w_mask * sizeof(T), hend - hstart - 1); + } + } + y_start += y_n_offset; + x_start += x_n_offset; + } + // transpose y + T *y_nram = y_nram_temp + shape_seg.n * align_hw * align_c; + Shape y_seg{shape_seg.n, shape_seg.h, shape_seg.w, y_full.c}; + transposeData(y_nram, y_nram_temp, y_seg); + swap(align_c, align_hw); + // store y from nram to dram + int y_n_offset_full = y_full.h * y_full.w * y_full.c; + int y_w_offset_full = y_full.c; + int y_c_offset_full = 1; + + int y_dram_start = + position.n_start * y_n_offset_full + + (position.h_start * y_full.w + position.w_start) * y_c_offset_full; + int y_nram_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + int y_dram_offset = y_dram_start + nidx * y_n_offset_full; + int y_nram_offset = y_nram_start + nidx * align_hw * align_c; + __memcpy(y_dram + y_dram_offset, y_nram + y_nram_offset, + shape_seg.h * shape_seg.w * sizeof(T), NRAM2GDRAM, + y_w_offset_full * sizeof(T), align_c * sizeof(T), + h_feature * w_feature - 1); + } +} + +template +__mlu_func__ void psamaskCollectBackward( + const T *dy_dram, T *dx_dram, const PositionInCore &position, + const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + T *dy_nram = (T *)buf; + T *dx_nram = + dy_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * dy_full.c, + COMPUTE_COUNT_ALIGN / sizeof(T)); + loadDataFromDramToNram(dy_nram, dy_dram, position, dy_full); + + // fill zeros to output + int elem_count = + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c, + NFU_ALIGN_SIZE / sizeof(T)); + __nramset(dx_nram, elem_count, (T)0); + + int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c; + int dy_h_offset = shape_seg.w * dy_full.c; + int dy_w_offset = dy_full.c; + int dy_c_offset = 1; + int dx_n_offset = shape_seg.h * shape_seg.w * dx_full.c; + int dx_h_offset = shape_seg.w * dx_full.c; + int dx_w_offset = dx_full.c; + int dx_c_offset = 1; + int h_feature = dy_full.h; + int w_feature = dy_full.w; + + int dy_start = 0; + int dx_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int dy_offset = dy_start; + int dx_offset = dx_start; + dy_offset += hidx * dy_h_offset + widx * dy_w_offset; + dx_offset += hidx * dx_h_offset + widx * dx_w_offset; + + const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; + const int hend = h_feature + half_h_mask - h_abs < h_mask + ? h_feature + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = w_feature + half_w_mask - w_abs < w_mask + ? 
w_feature + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with + // feature-indexed + dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart + + w_abs - half_w_mask) * + dy_c_offset; + dx_offset += (hstart * w_mask + wstart) * dx_c_offset; + int count = wend - wstart; + __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T), + NRAM2NRAM, dx_c_offset * w_mask * sizeof(T), + dy_c_offset * w_feature * sizeof(T), hend - hstart - 1); + } + } + dy_start += dy_n_offset; + dx_start += dx_n_offset; + } + storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full); +} + +template +__mlu_func__ void psamaskDistributeBackward( + const T *dy_dram, T *dx_dram, const PositionInCore &position, + const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg, + const int h_mask, const int w_mask, const int half_h_mask, + const int half_w_mask) { + // load dy from dram to nram + T *dy_nram_temp = (T *)buf; + int dy_n_offset_full = dy_full.h * dy_full.w * dy_full.c; + int dy_c_offset_full = 1; + int h_feature = dy_full.h; + int w_feature = dy_full.w; + int align_c = + CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T)); + int align_hw = + CEIL_ALIGN(h_feature * w_feature, COMPUTE_COUNT_ALIGN / sizeof(T)); + + int dy_dram_start = + position.n_start * dy_n_offset_full + + (position.h_start * w_feature + position.w_start) * dy_c_offset_full; + int dy_nram_start = 0; + for (int i = 0; i < shape_seg.n; ++i) { + int dy_nram_offset = dy_nram_start + i * (align_hw * align_c); + int dy_dram_offset = dy_dram_start + i * dy_n_offset_full; + __memcpy(dy_nram_temp + dy_nram_offset, dy_dram + dy_dram_offset, + shape_seg.h * shape_seg.w * sizeof(T), GDRAM2NRAM, + align_c * sizeof(T), dy_full.c * sizeof(T), + h_feature * w_feature - 1); + } + T *dy_nram = dy_nram_temp + shape_seg.n * align_hw * align_c; + Shape dy_seg{shape_seg.n, h_feature, w_feature, shape_seg.h * shape_seg.w}; + transposeData(dy_nram, dy_nram_temp, dy_seg); + swap(align_c, align_hw); + + // fill zeros to dx + T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c; + int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c; + __nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0); + + int dy_n_offset_seg = align_hw * align_c; + int dy_h_offset_seg = shape_seg.w * align_c; + int dy_w_offset_seg = align_c; + int dy_c_offset_seg = 1; + int dx_n_offset_seg = shape_seg.h * shape_seg.w * shape_seg.c; + int dx_h_offset_seg = shape_seg.w * shape_seg.c; + int dx_w_offset_seg = shape_seg.c; + int dx_c_offset_seg = 1; + + int dy_start = 0; + int dx_start = 0; + for (int nidx = 0; nidx < shape_seg.n; ++nidx) { + for (int hidx = 0; hidx < shape_seg.h; ++hidx) { + for (int widx = 0; widx < shape_seg.w; ++widx) { + int h_abs = hidx + position.h_start; + int w_abs = widx + position.w_start; + int dy_offset = dy_start; + int dx_offset = dx_start; + dy_offset += hidx * dy_h_offset_seg + widx * dy_w_offset_seg; + dx_offset += hidx * dx_h_offset_seg + widx * dx_w_offset_seg; + const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0; + const int hend = h_feature + half_h_mask - h_abs < h_mask + ? h_feature + half_h_mask - h_abs + : h_mask; + const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0; + const int wend = w_feature + half_w_mask - w_abs < w_mask + ? 
w_feature + half_w_mask - w_abs + : w_mask; + // (h, w ) with mask-indexed + // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with + // feature-indexed + dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart + + w_abs - half_w_mask) * + dy_c_offset_seg; + dx_offset += (hstart * w_mask + wstart) * dx_c_offset_seg; + int count = wend - wstart; + __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T), + NRAM2NRAM, w_mask * dx_c_offset_seg * sizeof(T), + w_feature * dy_c_offset_seg * sizeof(T), hend - hstart - 1); + } + } + dy_start += dy_n_offset_seg; + dx_start += dx_n_offset_seg; + } + storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full); +} + +template +__mlu_func__ void psamaskBase(const T *input_dram, T *output_dram, + const Shape &input_full, const Shape &output_full, + LimitParam &limit, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, + const bool is_forward, const int h_mask, + const int w_mask, const int half_h_mask, + const int half_w_mask, const int n_per_core, + const int h_per_core, const int n_per_cluster, + const int h_per_cluster) { + PositionInCore position_full; + PositionInCore position_seg; + position_full.w_start = 0; + position_full.w_end = output_full.w; + int n_num_in_cluster = n_per_cluster; + int h_num_in_cluster = h_per_cluster; + + switch (cluster_partition) { + case PARTITION_N: { + position_full.h_start = 0; + position_full.h_end = input_full.h; + position_full.n_start = taskIdY * n_per_cluster; + int cluster_need = (input_full.n + n_per_cluster - 1) / n_per_cluster; + if (taskIdY >= cluster_need) return; + int n_remainder = input_full.n - (cluster_need - 1) * n_per_cluster; + n_num_in_cluster = + (taskIdY == cluster_need - 1) ? n_remainder : n_per_cluster; + position_full.n_end = position_full.n_start + n_num_in_cluster; + }; break; + case PARTITION_H: { + position_full.n_start = 0; + position_full.n_end = input_full.n; + position_full.h_start = taskIdY * h_per_cluster; + int cluster_need = (input_full.h + h_per_cluster - 1) / h_per_cluster; + if (taskIdY >= cluster_need) return; + int h_remainder = input_full.h - (cluster_need - 1) * h_per_cluster; + h_num_in_cluster = + (taskIdY == cluster_need - 1) ? h_remainder : h_per_cluster; + position_full.h_end = position_full.h_start + h_num_in_cluster; + }; break; + } + switch (core_partition) { + case PARTITION_N: { + position_full.n_start += taskIdX * n_per_core; + int core_need = (n_num_in_cluster + n_per_core - 1) / n_per_core; + if (taskIdX >= core_need) return; + int n_remainder = n_num_in_cluster - (core_need - 1) * n_per_core; + position_full.n_end = + position_full.n_start + + ((taskIdX == core_need - 1) ? n_remainder : n_per_core); + }; break; + case PARTITION_H: { + position_full.h_start += taskIdX * h_per_core; + int core_need = (h_num_in_cluster + h_per_core - 1) / h_per_core; + if (taskIdX >= core_need) return; + int h_remainder = h_num_in_cluster - (core_need - 1) * h_per_core; + position_full.h_end = + position_full.h_start + + ((taskIdX == core_need - 1) ? h_remainder : h_per_core); + }; break; + } + // the count of n ,h and w need to be processed in the current core + int shape_core_n = position_full.n_end - position_full.n_start; + int shape_core_h = position_full.h_end - position_full.h_start; + int shape_core_w = input_full.w; + + limit.n = limit.n < shape_core_n ? limit.n : shape_core_n; + limit.h = limit.h < shape_core_h ? limit.h : shape_core_h; + limit.w = limit.w < shape_core_w ? 
limit.w : shape_core_w; + + // load the data to nram according to the limit + for (int nidx = position_full.n_start; nidx < position_full.n_end; + nidx += limit.n) { + position_seg.n_start = nidx; + position_seg.n_end = + position_seg.n_start + (position_full.n_end - nidx < limit.n + ? position_full.n_end - nidx + : limit.n); + for (int hidx = position_full.h_start; hidx < position_full.h_end; + hidx += limit.h) { + position_seg.h_start = hidx; + position_seg.h_end = + position_seg.h_start + (position_full.h_end - hidx < limit.h + ? position_full.h_end - hidx + : limit.h); + for (int widx = position_full.w_start; widx < position_full.w_end; + widx += limit.w) { + position_seg.w_start = widx; + position_seg.w_end = + position_seg.w_start + (position_full.w_end - widx < limit.w + ? position_full.w_end - widx + : limit.w); + + // record the segment of output except the size of channel + // channel segments of output and input are the same + Shape shape_seg; + shape_seg.n = position_seg.n_end - position_seg.n_start; + shape_seg.h = position_seg.h_end - position_seg.h_start; + shape_seg.w = position_seg.w_end - position_seg.w_start; + shape_seg.c = output_full.c; + + switch (psa_type) { + case COLLECT: { + if (is_forward) { + psamaskCollectForward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, h_mask, + w_mask, half_h_mask, half_w_mask); + } else { + psamaskCollectBackward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, h_mask, + w_mask, half_h_mask, half_w_mask); + } + } break; + case DISTRIBUTE: { + if (is_forward) { + psamaskDistributeForward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, + h_mask, w_mask, half_h_mask, + half_w_mask); + } else { + psamaskDistributeBackward(input_dram, output_dram, position_seg, + input_full, output_full, shape_seg, + h_mask, w_mask, half_h_mask, + half_w_mask); + } + } break; + } + } + } + } +} + +template +__mlu_global__ void MLUUnion1KernelPsamaskForward( + const T *x, T *y, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int x_c, const int y_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int limit_w_seg) { + if (coreId == 0x80) { + return; + } + Shape x_full, y_full; + x_full.n = batch; + x_full.h = h_feature; + x_full.w = w_feature; + x_full.c = x_c; + y_full.n = batch; + y_full.h = h_feature; + y_full.w = w_feature; + y_full.c = y_c; + + LimitParam limit; + limit.n = limit_n_seg; + limit.h = limit_h_seg; + limit.w = limit_w_seg; + + psamaskBase(x, y, x_full, y_full, limit, psa_type, core_partition, + cluster_partition, true, h_mask, w_mask, half_h_mask, half_w_mask, + n_per_core, h_per_core, n_per_cluster, h_per_cluster); +} + +template +__mlu_global__ void MLUUnion1KernelPsamaskBackward( + const T *dy, T *dx, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int dx_c, const int dy_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int 
limit_w_seg) { + if (coreId == 0x80) { + return; + } + Shape dy_full, dx_full; + dx_full.n = batch; + dx_full.h = h_feature; + dx_full.w = w_feature; + dx_full.c = dx_c; + dy_full.n = batch; + dy_full.h = h_feature; + dy_full.w = w_feature; + dy_full.c = dy_c; + + LimitParam limit; + limit.n = limit_n_seg; + limit.h = limit_h_seg; + limit.w = limit_w_seg; + + psamaskBase(dy, dx, dy_full, dx_full, limit, psa_type, core_partition, + cluster_partition, false, h_mask, w_mask, half_h_mask, + half_w_mask, n_per_core, h_per_core, n_per_cluster, + h_per_cluster); +} + +void KernelPsamaskForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *x, void *y, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int x_c, const int y_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int limit_w_seg) { + MLUUnion1KernelPsamaskForward<<>>( + static_cast(x), static_cast(y), psa_type, + core_partition, cluster_partition, batch, h_feature, w_feature, h_mask, + w_mask, x_c, y_c, half_h_mask, half_w_mask, n_per_core, h_per_core, + n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg); +} + +void KernelPsamaskBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *dy, void *dx, const PsamaskType psa_type, + const DimPartitionType core_partition, + const DimPartitionType cluster_partition, const int batch, + const int h_feature, const int w_feature, const int h_mask, + const int w_mask, const int dx_c, const int dy_c, const int half_h_mask, + const int half_w_mask, const int n_per_core, const int h_per_core, + const int n_per_cluster, const int h_per_cluster, const int limit_n_seg, + const int limit_h_seg, const int limit_w_seg) { + MLUUnion1KernelPsamaskBackward<<>>( + static_cast(dy), static_cast(dx), psa_type, + core_partition, cluster_partition, batch, h_feature, w_feature, h_mask, + w_mask, dx_c, dy_c, half_h_mask, half_w_mask, n_per_core, h_per_core, + n_per_cluster, h_per_cluster, limit_n_seg, limit_h_seg, limit_w_seg); +} diff --git a/mmcv/ops/csrc/common/mlu/psamask_utils.hpp b/mmcv/ops/csrc/common/mlu/psamask_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..30ec388494615842528b74da0661e169b08a545e --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/psamask_utils.hpp @@ -0,0 +1,55 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#ifndef PSAMASK_UTILS_HPP_ +#define PSAMASK_UTILS_HPP_ + +typedef enum { + COLLECT = 0, + DISTRIBUTE = 1, +} PsamaskType; + +typedef enum { + PARTITION_N = 0, + PARTITION_H = 1, +} DimPartitionType; + +struct PartitionSeg { + int h_per_cluster; + int n_per_cluster; + int h_per_core; + int n_per_core; + DimPartitionType cluster_partition; + DimPartitionType core_partition; +}; + +struct Shape { + int n; + int h; + int w; + int c; +}; + +struct LimitParam { + int n; + int h; + int w; +}; + +struct PositionInCore { + int n_start; + int n_end; + int h_start; + int h_end; + int w_start; + int w_end; +}; +#endif // PSAMASK_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..f62554d0effd9e67ba5068b1b57d7e7131c696ea --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu @@ -0,0 +1,493 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define ROI_OFFSET 5 + +__nram__ char buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void bilinearInterpolate(const int input_height, + const int input_width, T y, T x, T *w1, + T *w2, T *w3, T *w4, int *x_low, + int *x_high, int *y_low, int *y_high, + bool *empty) { + // deal with cases that inverse elements are of feature map boundary + if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { + *empty = true; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low_ = int(y); + int x_low_ = int(x); + + if (y_low_ >= input_height - 1) { + *y_high = y_low_ = input_height - 1; + y = (T)y_low_; + } else { + *y_high = y_low_ + 1; + } + + if (x_low_ >= input_width - 1) { + *x_high = x_low_ = input_width - 1; + x = T(x_low_); + } else { + *x_high = x_low_ + 1; + } + + *y_low = y_low_; + *x_low = x_low_; + + T ly = y - y_low_; + T lx = x - x_low_; + T hy = 1.0 - ly; + T hx = 1.0 - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + return; +} + +template +__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core, + T *nram_out, const int roi_bin_grid_h, + const int roi_bin_grid_w, const T roi_start_h, + const T roi_start_w, const int ph, + const int pw, const T bin_size_h, + const T bin_size_w, const float count, + const int input_height, const int input_width, + const int channels, const int cyc_num, + const int max_elements) { + int cyc_channel = max_elements; + + for (int i = 0; i < cyc_num; i++) { + int real_channel = + (i == cyc_num - 1) ? 
channels - i * cyc_channel : cyc_channel; + int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T)); + __bang_write_zero(nram_out, align_channel); + uint32_t real_size = real_channel * sizeof(T); + + int iy, ix; + for (iy = 0; iy < roi_bin_grid_h; iy++) { + // 1. compute the coordinates of the y axis in the current roi_bin_grid_h + T y = roi_start_h + ph * bin_size_h + + (T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h); + for (ix = 0; ix < roi_bin_grid_w; ix++) { + // 2. compute the coordinates of the x axis in the current + // roi_bin_grid_w + T x = roi_start_w + pw * bin_size_w + + (T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w); + + // 3. compute the four weights (w1, w2, w3 and w4), the height (y_low + // and y_high) and weight (x_low and x_high) of input feature map in + // the current roi bin grid, and the flag (empty) which shows if x, y + // are out of input feature map ranges + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bool empty = false; + + bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4, + &x_low, &x_high, &y_low, &y_high, &empty); + + // 4. compute interpolation of the current roi bin grid + // tmp_cyc1, temp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values + // to compute the interpolation, and then reused to compute + // the argmax_x and argmax_y. + T *tmp_cyc1 = nram_in + cyc_channel; + T *tmp_cyc2 = nram_in + cyc_channel * 2; + T *tmp_cyc3 = nram_in + cyc_channel * 3; + T *tmp_cyc4 = nram_in + cyc_channel * 4; + + if (empty) { // exits abnormal values + __bang_write_zero(nram_in, align_channel); + } else { + __bang_write_zero(nram_in, align_channel); + uint32_t offset1 = (y_low * input_width + x_low) * channels; + uint32_t offset2 = (y_low * input_width + x_high) * channels; + uint32_t offset3 = (y_high * input_width + x_low) * channels; + uint32_t offset4 = (y_high * input_width + x_high) * channels; + T *input1 = (T *)input_core + offset1 + i * cyc_channel; + T *input2 = (T *)input_core + offset2 + i * cyc_channel; + T *input3 = (T *)input_core + offset3 + i * cyc_channel; + T *input4 = (T *)input_core + offset4 + i * cyc_channel; + + // load the four pixels (p1, p2, p3 and p4) of input feature map to + // compute interpolation + __memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM); + __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM); + + // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4 + __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel); + __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel); + __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel); + __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel); + + __bang_add(nram_in, tmp_cyc1, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc2, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc3, nram_in, align_channel); + __bang_add(nram_in, tmp_cyc4, nram_in, align_channel); + } + // 5. compute sum value and corresponding coordinates of x axis and y + // axis. Update the sum value. 
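+          // At this point nram_in holds w1*p1 + w2*p2 + w3*p3 + w4*p4 for the
+          // current sample; accumulating it into nram_out across all
+          // roi_bin_grid_h * roi_bin_grid_w samples and scaling by 1/count
+          // below gives the averaged value of this output bin.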
+ __bang_add(nram_out, nram_in, nram_out, align_channel); + } // loop_roi_grid_w + } // loop_roi_grid_h + T count_value = (T)(1.0 / count); + __bang_mul_const(nram_out, nram_out, count_value, align_channel); + __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM); + } // loop_cyc_num +} + +template +__mlu_func__ void roialignForwardAvg( + T *input, T *rois, T *output, const bool aligned, const int channels, + const int pooled_height, const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, const T spatial_scale, + const int num_rois) { + // find limit for channel, the nram space is divided to 6 parts that are + // input, 4 weights to compute the interpolation (w1, w2, w3, w4), output + + // max_elements : 300 : float datatype : 27296, half datatype : 54592 + // max_elements : 200 : float datatype : 16384, half datatype : 32768 + int max_elements = (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T); + int cyc_num = channels / max_elements + (int)(channels % max_elements != 0); + T offset = aligned ? (T)0.5 : (T)0.0; + int task_num = num_rois * pooled_height * pooled_width; + T *nram_out = (T *)buffer; + T *nram_in = nram_out + max_elements; + if (task_num < taskDim) { + if (taskId >= task_num) { + return; + } + } + + for (int bin_idx = taskId; bin_idx < task_num; bin_idx = bin_idx + taskDim) { + if (bin_idx >= task_num) { + return; + } + + // (n,ph.pw) is a c in the pooled output + int pw = bin_idx % pooled_width; + int ph = (bin_idx / pooled_width) % pooled_height; + int n = bin_idx / pooled_width / pooled_height; + + T *roi_id_tmp = rois + n * ROI_OFFSET; + // 1. compute width and height of roi region. + int batch_idx = (int)roi_id_tmp[0]; + T roi_x1 = roi_id_tmp[1]; + T roi_y1 = roi_id_tmp[2]; + T roi_x2 = roi_id_tmp[3]; + T roi_y2 = roi_id_tmp[4]; + T roi_start_w = roi_x1 * spatial_scale - offset; + T roi_start_h = roi_y1 * spatial_scale - offset; + T roi_end_w = roi_x2 * spatial_scale - offset; + T roi_end_h = roi_y2 * spatial_scale - offset; + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + + if (!aligned) { + roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0); + roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0); + } + + // 2. compute float-type width and height of roi bin region. + T bin_size_w = (T)roi_width / (T)pooled_width; + T bin_size_h = (T)roi_height / (T)pooled_height; + + // 3. compute int-type width and height of roi bin region. + int roi_bin_grid_h, roi_bin_grid_w; + roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : int(ceilf(roi_height / pooled_height)); + roi_bin_grid_w = (sampling_ratio > 0) + ? sampling_ratio + : int(ceilf(roi_width / pooled_width)); + float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1 + ? roi_bin_grid_h * roi_bin_grid_w + : 1.0); + T *input_core = input + batch_idx * channels * input_width * input_height; + T *output_core = output + bin_idx * channels; + // 4. compute avg value and corresponding coordinates of x axis and y axis. 
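+    // computeChannel averages the roi_bin_grid_h * roi_bin_grid_w bilinear
+    // samples of this bin for every channel, splitting the channel dimension
+    // into cyc_num chunks of at most max_elements so each chunk fits in NRAM.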
+ computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h, + roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h, + bin_size_w, count, input_height, input_width, channels, + cyc_num, max_elements); + } +} + +__mlu_global__ void MLUUnion1KernelRoiAlignAvg( + const void *input, const void *rois, const int channels, const bool aligned, + const int pooled_height, const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, const float spatial_scale, + const int num_rois, const cnrtDataType_t data_type, void *output) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + + switch (data_type) { + case CNRT_FLOAT16: { + roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned, + channels, pooled_height, pooled_width, input_height, + input_width, sampling_ratio, + (half)spatial_scale, num_rois); + }; break; + case CNRT_FLOAT32: { + roialignForwardAvg((float *)input, (float *)rois, (float *)output, + aligned, channels, pooled_height, pooled_width, + input_height, input_width, sampling_ratio, + (float)spatial_scale, num_rois); + }; break; + default: + break; + } + + return; +} +} // namespace forward + +namespace backward { +__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y, + float x, float *w1, float *w2, + float *w3, float *w4, int *x_low, + int *x_high, int *y_low, + int *y_high) { + if (y < -1.0 || y > height || x < -1.0 || x > width) { + *w1 = 0.0, *w2 = 0.0, *w3 = 0.0, *w4 = 0.0; + *x_low = -1, *x_high = -1, *y_low = -1, *y_high = -1; + return; + } + if (y <= 0) { + y = 0; + } + if (x <= 0) { + x = 0; + } + *y_low = (int)y; + *x_low = (int)x; + if (*y_low >= height - 1) { + *y_high = height - 1, *y_low = height - 1; + y = (float)(*y_low); + } else { + *y_high = *y_low + 1; + } + if (*x_low >= width - 1) { + *x_high = width - 1, *x_low = width - 1; + x = (float)(*x_low); + } else { + *x_high = *x_low + 1; + } + float ly = y - *y_low, lx = x - *x_low; + float hy = 1.0 - ly, hx = 1.0 - lx; + *w1 = hy * hx, *w2 = hy * lx, *w3 = ly * hx, *w4 = ly * lx; + return; +} + +template +__mlu_func__ void unionRoiAlignBp( + T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi, + const int wi, const int c, const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, const bool aligned) { + int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T)); + int deal_all = boxes_num * hi * wi; + int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim); + for (int i = 0; i < deal_this_core; ++i) { + int bhw_id = i * taskDim + taskId; + int box_id = bhw_id / (hi * wi); + int ih = (bhw_id / wi) % hi; + int iw = bhw_id % wi; + T *box = boxes + box_id * 5; + int image_id = (int)box[0]; + T *image_offset = grads_image + image_id * ho * wo * c; + T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c; + + float offset = aligned ? 0.5 : 0.0; + float x1 = box[1] * spatial_scale - offset; + float y1 = box[2] * spatial_scale - offset; + float x2 = box[3] * spatial_scale - offset; + float y2 = box[4] * spatial_scale - offset; + float roi_width = x2 - x1; + float roi_height = y2 - y1; + if (!aligned) { + roi_width = (roi_width > 1.0) ? roi_width : 1.0; + roi_height = (roi_height > 1.0) ? roi_height : 1.0; + } + float bin_size_h = roi_height / hi; + float bin_size_w = roi_width / wi; + + int roi_grid_h = + (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi); + int roi_grid_w = + (sampling_ratio > 0) ? 
sampling_ratio : std::ceil(roi_width / wi); + const T count = roi_grid_h * roi_grid_w; + if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) { + for (int iy = 0; iy < roi_grid_h; ++iy) { + const float y = + y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; + for (int ix = 0; ix < roi_grid_w; ++ix) { + const float x = + x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; + float w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, + &x_high, &y_low, &y_high); + if (x_low >= 0 && y_low >= 0) { + __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_low * wo * c + x_low * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_low * wo * c + x_high * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_high * wo * c + x_low * c, + (T *)buffer + c_align, c); + __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4, + c_align); + __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align, + 1 / count, c_align); + __bang_atomic_add((T *)buffer + c_align, + image_offset + y_high * wo * c + x_high * c, + (T *)buffer + c_align, c); + } // x_low && y_low + } // ix + } // iy + } else { + for (int iy = 0; iy < roi_grid_h; ++iy) { + const float y = + y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h; + for (int ix = 0; ix < roi_grid_w; ++ix) { + const float x = + x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w; + float w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low, + &x_high, &y_low, &y_high); + if (x_low >= 0 && y_low >= 0) { + int deal_once = + PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T); + int c_repeat = c / deal_once + (int)(c % deal_once != 0); + for (int i = 0; i < c_repeat; ++i) { + int deal_c = deal_once; + int align_c = deal_once; + if (i == c_repeat - 1) { + deal_c = c - i * deal_once; + align_c = c_align - i * deal_once; + } + __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T), + GDRAM2NRAM); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_low * wo * c + x_low * c + i * deal_once, + (T *)buffer + align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_low * wo * c + x_high * c + i * deal_once, + (T *)buffer + align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_high * wo * c + x_low * c + i * deal_once, + (T *)buffer + 
align_c, deal_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4, + align_c); + __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c, + 1 / count, align_c); + __bang_atomic_add( + (T *)buffer + align_c, + image_offset + y_high * wo * c + x_high * c + i * deal_once, + (T *)buffer + align_c, deal_c); + } // for c_repeat + } // x_low >= 0 && y_low >= 0 + } // ix + } // iy + } // if c + } // i +} + +__mlu_global__ void MLUUnion1KernelRoiAlignBackward( + const void *grads, const void *boxes, void *grads_image, + const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi, + const int c, const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, const bool aligned) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (dtype) { + case CNRT_FLOAT16: { + unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image, + boxes_num, hi, wi, c, no, ho, wo, spatial_scale, + sampling_ratio, aligned); + }; break; + case CNRT_FLOAT32: { + unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image, + boxes_num, hi, wi, c, no, ho, wo, spatial_scale, + sampling_ratio, aligned); + }; break; + default: { return; } + } +} +} // namespace backward + +void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t d_type, + const void *input, const void *rois, const int channels, + const bool aligned, const int pooled_height, + const int pooled_width, const int input_height, + const int input_width, const int sampling_ratio, + const float spatial_scale, const int num_rois, + void *output) { + forward::MLUUnion1KernelRoiAlignAvg<<>>( + input, rois, channels, aligned, pooled_height, pooled_width, input_height, + input_width, sampling_ratio, spatial_scale, num_rois, d_type, output); +} + +void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, const cnrtDataType_t dtype, + const void *grads, const void *boxes, + void *grads_image, const int boxes_num, + const int hi, const int wi, const int c, + const int no, const int ho, const int wo, + const float spatial_scale, const int sampling_ratio, + const bool aligned) { + backward::MLUUnion1KernelRoiAlignBackward<<>>( + grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo, + spatial_scale, sampling_ratio, aligned); +} diff --git a/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7f05b525a0b278e7593db76faee8fa782df4bc38 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_align_rotated_mlu_kernel.mlu @@ -0,0 +1,472 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * OR IMPLIED, INCLUDING BUvoid NOKType LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENvoid SHALL THE AUTHORS OR COPYRIGHKType HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORvoid OR OTHERWISE, ARISING FROM, OUKType OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" +#include "roi_align_rotated_utils.hpp" + +#define ROI_OFFSET 6 +#define SAMPLING_NUM 4 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +template +__mlu_func__ void swap(T &a, T &b) { + T tmp = a; + a = b; + b = tmp; +} + +template +__mlu_func__ void bilinearInterpolate(const int input_height, + const int input_width, T x, T y, + const T zero_sign, T *w1, T *w2, T *w3, + T *w4, int *x_low, int *x_high, + int *y_low, int *y_high, bool *empty) { + // deal with case that the point is out of feature map boundary + if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) { + *empty = true; + return; + } + + if (y <= 0) y = (T)0; + if (x <= 0) x = (T)0; + + *y_low = int(y); + *x_low = int(x); + + if (*y_low >= input_height - 1) { + *y_high = *y_low = input_height - 1; + y = (T)(*y_low); + } else { + *y_high = *y_low + 1; + } + + if (*x_low >= input_width - 1) { + *x_high = *x_low = input_width - 1; + x = T(*x_low); + } else { + *x_high = *x_low + 1; + } + T ly = y - *y_low; + T lx = x - *x_low; + T hy = 1.0 - ly; + T hx = 1.0 - lx; + *w1 = hy * hx * zero_sign; + *w2 = hy * lx * zero_sign; + *w3 = ly * hx * zero_sign; + *w4 = ly * lx * zero_sign; +} + +template +__mlu_func__ void getRoiBinInfo(const T *rois_dram, const int bin_i, + const RoiAlignRotatedParams ¶ms, + int *batch_idx, int *roi_n, int *pw, int *ph, + T *roi_center_x, T *roi_center_y, T *roi_width, + T *roi_height, T *theta) { + T offset = params.aligned ? (T)0.5 : (T)0.0; + *pw = bin_i % params.pooled_width; + *ph = (bin_i / params.pooled_width) % params.pooled_height; + *roi_n = bin_i / params.pooled_width / params.pooled_height; + const T *roi_info = rois_dram + (*roi_n) * ROI_OFFSET; + *batch_idx = (int)roi_info[0]; + *roi_center_x = roi_info[1] * (T)params.spatial_scale - offset; + *roi_center_y = roi_info[2] * (T)params.spatial_scale - offset; + *roi_width = roi_info[3] * (T)params.spatial_scale; + *roi_height = roi_info[4] * (T)params.spatial_scale; + *theta = roi_info[5]; + if (params.clockwise) { + *theta = -(*theta); + } + if (!params.aligned) { + *roi_width = *roi_width > (T)1.0 ? *roi_width : (T)1.0; + *roi_height = *roi_height > (T)1.0 ? *roi_height : (T)1.0; + } +} + +template +__mlu_func__ void roiAlignRotatedForward(const T *input_dram, + const T *rois_dram, const int batch, + const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams ¶ms, + T *output_dram) { + int align_base_128 = NFU_ALIGN_SIZE / sizeof(T); + int channel_max_cap = MAX_NRAM_SIZE / sizeof(T) / (2 * SAMPLING_NUM + 1); + channel_max_cap = channel_max_cap / align_base_128 * align_base_128; + int channel_align = channel < channel_max_cap ? 
channel : channel_max_cap; + channel_align = CEIL_ALIGN(channel_align, align_base_128); + + T *nram_out = (T *)nram_buffer; + T *nram_ping = nram_out + channel_align; + T *nram_pong = nram_ping + channel_align * SAMPLING_NUM; + + int bin_first = taskId; + int bin_end = rois_num * params.pooled_height * params.pooled_width; + + for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) { + T roi_center_x, roi_center_y, roi_width, roi_height, theta; + int batch_idx, roi_n, pw, ph; + getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph, + &roi_center_x, &roi_center_y, &roi_width, &roi_height, + &theta); + T bin_size_h = roi_height / params.pooled_height; + T bin_size_w = roi_width / params.pooled_width; + + int roi_bin_grid_h = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_height / params.pooled_height); + int roi_bin_grid_w = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_width / params.pooled_width); + T roi_start_y = -roi_height / 2; + T roi_start_x = -roi_width / 2; + const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1 + ? roi_bin_grid_h * roi_bin_grid_w + : 1; + T cos_theta = std::cos(theta); + T sin_theta = std::sin(theta); + T zero_sign = 1.0f / bin_dim; + + bool is_first_sample = true; + int src_offset = 0; + int dst_offset = 0; + int c_rem, c_slice, c_slice_align, pongc_slice, pongc_slice_align; + for (int c_offset = 0; c_offset < channel; c_offset += channel_align) { + __nramset(nram_out, channel_align, (T)0); + c_rem = channel - c_offset; + c_slice = channel_align > c_rem ? c_rem : channel_align; + c_slice_align = CEIL_ALIGN(c_slice, align_base_128); + is_first_sample = true; + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T yy = roi_start_y + ph * bin_size_h + + T(iy + 0.5) * bin_size_h / roi_bin_grid_h; + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T xx = roi_start_x + pw * bin_size_w + + T(ix + 0.5) * bin_size_w / roi_bin_grid_w; + int sample_i = iy * roi_bin_grid_w + ix; + + T y = yy * cos_theta - xx * sin_theta + roi_center_y; + T x = yy * sin_theta + xx * cos_theta + roi_center_x; + T w1, w2, w3, w4; + bool empty = false; + int x_low, x_high, y_low, y_high; + bilinearInterpolate(height, width, x, y, zero_sign, &w1, &w2, &w3, + &w4, &x_low, &x_high, &y_low, &y_high, &empty); + int sample_wdim = x_high - x_low + 1; + /******************************************************* + | ping | pong | + |------|-----|-----|-----|-----|-----|-----|-----|-----| + |output| p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 | + |------|-----|-----|-----|-----|-----|-----|-----|-----| + ********************************************************/ + if (is_first_sample && !empty) { + // load input data from dram to nram + __nramset(nram_ping, SAMPLING_NUM * c_slice_align, (T)0); + for (int h = y_low; h <= y_high; ++h) { + src_offset = + (batch_idx * height * width + h * width + x_low) * channel + + c_offset; + dst_offset = (h - y_low) * SAMPLING_NUM * c_slice_align / 2; + if (c_slice_align == channel) { + __memcpy(nram_ping + dst_offset, input_dram + src_offset, + sample_wdim * channel * sizeof(T), GDRAM2NRAM); + } else { + __memcpy(nram_ping + dst_offset, input_dram + src_offset, + c_slice * sizeof(T), GDRAM2NRAM, + c_slice_align * sizeof(T), channel * sizeof(T), + sample_wdim - 1); + } + } + } + // load next input data to nram + if (sample_i + 1 < bin_dim) { + int p_iy = (sample_i + 1) / roi_bin_grid_w; + int p_ix = (sample_i + 1) % roi_bin_grid_w; + const T p_yy = roi_start_y + ph * bin_size_h + 
+ T(p_iy + 0.5) * bin_size_h / roi_bin_grid_h; + const T p_xx = roi_start_x + pw * bin_size_w + + T(p_ix + 0.5) * bin_size_w / roi_bin_grid_w; + T p_y = p_yy * cos_theta - p_xx * sin_theta + roi_center_y; + T p_x = p_yy * sin_theta + p_xx * cos_theta + roi_center_x; + T p_w1, p_w2, p_w3, p_w4; + bool p_empty = false; + int p_x_low, p_x_high, p_y_low, p_y_high; + bilinearInterpolate(height, width, p_x, p_y, zero_sign, &p_w1, + &p_w2, &p_w3, &p_w4, &p_x_low, &p_x_high, + &p_y_low, &p_y_high, &p_empty); + int p_sample_wdim = p_x_high - p_x_low + 1; + pongc_slice = c_slice; + pongc_slice_align = c_slice_align; + if (!p_empty) { + __nramset(nram_pong, SAMPLING_NUM * pongc_slice_align, (T)0); + for (int h = p_y_low; h <= p_y_high; ++h) { + src_offset = + (batch_idx * height * width + h * width + p_x_low) * + channel + + c_offset; + dst_offset = + (h - p_y_low) * SAMPLING_NUM * pongc_slice_align / 2; + if (pongc_slice_align == channel) { + __memcpy_async( + nram_pong + dst_offset, input_dram + src_offset, + p_sample_wdim * channel * sizeof(T), GDRAM2NRAM); + } else { + __memcpy_async(nram_pong + dst_offset, + input_dram + src_offset, + pongc_slice * sizeof(T), GDRAM2NRAM, + pongc_slice_align * sizeof(T), + channel * sizeof(T), p_sample_wdim - 1); + } + } + } + } + T *tmp_sum = nram_ping + 3 * c_slice_align; + if (empty) { + __nramset(tmp_sum, c_slice_align, T(0)); + } else { + __bang_mul_const(nram_ping, nram_ping, w1, c_slice_align); + __bang_mul_const(nram_ping + c_slice_align, + nram_ping + c_slice_align, w2, c_slice_align); + __bang_mul_const(nram_ping + 2 * c_slice_align, + nram_ping + 2 * c_slice_align, w3, c_slice_align); + __bang_mul_const(nram_ping + 3 * c_slice_align, + nram_ping + 3 * c_slice_align, w4, c_slice_align); + __bang_sumpool(tmp_sum, nram_ping, c_slice_align, 1, SAMPLING_NUM, + 1, SAMPLING_NUM, 1, 1); + } + __bang_add(nram_out, nram_out, tmp_sum, c_slice_align); + swap(nram_ping, nram_pong); + + __asm__ volatile("sync;"); + is_first_sample = false; + } + } + // store the result to dram + int output_offset = + ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * + channel + + c_offset; + __memcpy(output_dram + output_offset, nram_out, c_slice * sizeof(T), + NRAM2GDRAM); + } + } +} + +template +__mlu_func__ void roiAlignRotatedBackward(const T *top_grad_dram, + const T *rois_dram, const int batch, + const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams ¶ms, + T *bottom_grad_dram) { + int align_base_128 = NFU_ALIGN_SIZE / sizeof(T); + int channel_align = CEIL_ALIGN(channel, align_base_128); + + unsigned int max_element = MAX_NRAM_SIZE / sizeof(T); + int c_limit = max_element >> 2; + c_limit = c_limit > channel_align ? 
channel_align : c_limit; + + T *nram_ping = (T *)nram_buffer; + T *nram_pong = nram_ping + 2 * c_limit; + T *nram_output = nullptr; + + int bin_first = taskId; + int bin_end = rois_num * params.pooled_height * params.pooled_width; + bool is_first_bin = true; + T roi_center_x, roi_center_y, roi_width, roi_height, theta; + int batch_idx, roi_n, pw, ph; + T pong_roi_center_x, pong_roi_center_y, pong_roi_width, pong_roi_height, + pong_theta; + int pong_batch_idx, pong_roi_n, pong_pw, pong_ph; + for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) { + getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph, + &roi_center_x, &roi_center_y, &roi_width, &roi_height, + &theta); + T bin_size_h = roi_height / params.pooled_height; + T bin_size_w = roi_width / params.pooled_width; + + int roi_bin_grid_h = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_height / params.pooled_height); + int roi_bin_grid_w = + (params.sample_ratio > 0) + ? params.sample_ratio + : __float2int_up((float)roi_width / params.pooled_width); + T roi_start_y = -roi_height / 2; + T roi_start_x = -roi_width / 2; + const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1 + ? roi_bin_grid_h * roi_bin_grid_w + : 1; + T cos_theta = std::cos(theta); + T sin_theta = std::sin(theta); + T zero_sign = 1.0f / bin_dim; + + int c_rem, c_slice, pongc_slice, c_offset; + c_rem = channel; + c_offset = 0; + /**************************************** + | ping | pong | + |---------|---------|---------|---------| + | input | output | input | output | + |---------|---------|---------|---------| + *****************************************/ + if (is_first_bin) { + // load the first top_grad to nram + c_slice = c_limit < c_rem ? c_limit : c_rem; + int top_grad_offset = + ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * + channel; + __memcpy(nram_ping, top_grad_dram + top_grad_offset, c_slice * sizeof(T), + GDRAM2NRAM); + } + nram_output = nram_ping + c_limit; + while (c_rem > 0) { + c_slice = c_slice < c_rem ? c_slice : c_rem; + // load the next top_grad to nram + if (c_rem - c_slice > 0) { + // load the rest channels to nram + pongc_slice = (c_rem - c_slice > c_slice) ? c_slice : c_rem - c_slice; + int top_grad_offset = + ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) * + channel + + c_offset + c_slice; + __memcpy_async(nram_pong, top_grad_dram + top_grad_offset, + pongc_slice * sizeof(T), GDRAM2NRAM); + } else if (bin_i + taskDim < bin_end) { + // load next bin's data to nram + getRoiBinInfo(rois_dram, bin_i + taskDim, params, &pong_batch_idx, + &pong_roi_n, &pong_pw, &pong_ph, &pong_roi_center_x, + &pong_roi_center_y, &pong_roi_width, &pong_roi_height, + &pong_theta); + pongc_slice = c_limit < channel ? 
c_limit : channel; + int top_grad_offset = ((pong_roi_n * params.pooled_height + pong_ph) * + params.pooled_width + + pong_pw) * + channel; + __memcpy_async(nram_pong, top_grad_dram + top_grad_offset, + c_slice * sizeof(T), GDRAM2NRAM); + } + // comput the output in a single bin + + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T yy = roi_start_y + ph * bin_size_h + + T(iy + 0.5) * bin_size_h / roi_bin_grid_h; + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T xx = roi_start_x + pw * bin_size_w + + T(ix + 0.5) * bin_size_w / roi_bin_grid_w; + T y = yy * cos_theta - xx * sin_theta + roi_center_y; + T x = yy * sin_theta + xx * cos_theta + roi_center_x; + T w1, w2, w3, w4; + bool empty = false; + int x_low, x_high, y_low, y_high; + bilinearInterpolate(height, width, x, y, zero_sign, &w1, &w2, &w3, + &w4, &x_low, &x_high, &y_low, &y_high, &empty); + if (empty) { + continue; + } else { + __bang_mul_const(nram_output, nram_ping, w1, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_low * width * channel + x_low * channel + c_offset, + (T *)nram_output, c_slice); + __bang_mul_const(nram_output, nram_ping, w2, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_low * width * channel + x_high * channel + c_offset, + (T *)nram_output, c_slice); + __bang_mul_const(nram_output, nram_ping, w3, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_high * width * channel + x_low * channel + c_offset, + (T *)nram_output, c_slice); + __bang_mul_const(nram_output, nram_ping, w4, c_limit); + __bang_atomic_add( + (T *)nram_output, + bottom_grad_dram + batch_idx * height * width * channel + + y_high * width * channel + x_high * channel + c_offset, + (T *)nram_output, c_slice); + } + } + } + swap(nram_ping, nram_pong); + c_rem -= c_slice; + c_offset += c_slice; + __asm__ volatile("sync;"); + } + is_first_bin = false; + } +} + +__mlu_global__ void MLUUnion1KernelRoiAlignRotatedForward( + const void *features, const void *rois, void *output, const int batch, + const int height, const int width, const int channel, const int rois_num, + const RoiAlignRotatedParams rroiAlignParams, + const cnrtDataType_t data_type) { + if (0x80 == coreId) { + return; + } + + if (data_type == CNRT_FLOAT32) { + roiAlignRotatedForward((float *)features, (float *)rois, batch, height, + width, channel, rois_num, rroiAlignParams, + (float *)output); + } else { + roiAlignRotatedForward((half *)features, (half *)rois, batch, height, width, + channel, rois_num, rroiAlignParams, (half *)output); + } +} + +__mlu_global__ void MLUUnion1KernelRoiAlignRotatedBackward( + const void *top_grad, const void *rois, void *bottom_grad, const int batch, + const int height, const int width, const int channel, const int rois_num, + const RoiAlignRotatedParams rroiAlignParams, + const cnrtDataType_t data_type) { + if (0x80 == coreId) { + return; + } + + if (data_type == CNRT_FLOAT32) { + roiAlignRotatedBackward((float *)top_grad, (float *)rois, batch, height, + width, channel, rois_num, rroiAlignParams, + (float *)bottom_grad); + } else { + roiAlignRotatedBackward((half *)top_grad, (half *)rois, batch, height, + width, channel, rois_num, rroiAlignParams, + (half *)bottom_grad); + } +} + +void KernelRoiAlignRotatedForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t d_type, const void *features, const void *rois, + 
void *output, const int batch, const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams roiAlignRotatedParams) { + MLUUnion1KernelRoiAlignRotatedForward<<>>( + features, rois, output, batch, height, width, channel, rois_num, + roiAlignRotatedParams, d_type); +} + +void KernelRoiAlignRotatedBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const cnrtDataType_t d_type, const void *top_grad, const void *rois, + void *bottom_grad, const int batch, const int height, const int width, + const int channel, const int rois_num, + const RoiAlignRotatedParams roiAlignRotatedParams) { + MLUUnion1KernelRoiAlignRotatedBackward<<>>( + top_grad, rois, bottom_grad, batch, height, width, channel, rois_num, + roiAlignRotatedParams, d_type); +} diff --git a/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp b/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..cd0ec02484fef395db7d401976d64f9c5ca59622 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_align_rotated_utils.hpp @@ -0,0 +1,24 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef ROI_ALIGN_ROTATED_UTILS_HPP_ +#define ROI_ALIGN_ROTATED_UTILS_HPP_ + +struct RoiAlignRotatedParams { + int pooled_height; + int pooled_width; + int sample_ratio; + float spatial_scale; + bool aligned; + bool clockwise; +}; + +#endif // ROI_ALIGN_ROTATED_UTILS_HPP_ diff --git a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7186cdfac3e93677ed2727234a71def607fcd79b --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu @@ -0,0 +1,749 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" + +#define ALIGN_SIZE 64 +#define PIPELINE_COMMON_NUM 2 +#define PIPELINE_PINGPONG_NUM 10 + +__nram__ char nram_buffer[MAX_NRAM_SIZE]; + +namespace forward { +template +__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height, + int width, int channels, int p_height, + int p_width, T spatial_scale, int *bin_x1, + int *bin_y1, int *bin_x2, int *bin_y2, + int *bin_wdim, int *bin_hdim, int *bin_dims, + T **input_base, bool *is_empty) { + int pw = bin_i % p_width; + int ph = (bin_i / p_width) % p_height; + int roi_n = bin_i / p_width / p_height; + + /*roi*/ + const T *roi_info = rois_v + roi_n * 5; // {{batch, x1, y1, x2, y2},,,} + int batch_index = (int)roi_info[0]; + int roi_x1 = round(roi_info[1] * spatial_scale); + int roi_y1 = round(roi_info[2] * spatial_scale); + int roi_x2 = round(roi_info[3] * spatial_scale); + int roi_y2 = round(roi_info[4] * spatial_scale); + int roi_w = roi_x2 - roi_x1 + 1 > 1 ? roi_x2 - roi_x1 + 1 : 1; + int roi_h = roi_y2 - roi_y1 + 1 > 1 ? roi_y2 - roi_y1 + 1 : 1; + + /*bin*/ + T bin_w = (T)roi_w / (T)p_width; + T bin_h = (T)roi_h / (T)p_height; + + *bin_x1 = (int)floor((T)pw * bin_w) + roi_x1; + *bin_x1 = *bin_x1 > 0 ? *bin_x1 : 0; + *bin_x1 = *bin_x1 < width ? *bin_x1 : width; + + *bin_y1 = (int)floor((T)ph * bin_h) + roi_y1; + *bin_y1 = *bin_y1 > 0 ? *bin_y1 : 0; + *bin_y1 = *bin_y1 < height ? *bin_y1 : height; + + *bin_x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1; + *bin_x2 = *bin_x2 > 0 ? *bin_x2 : 0; + *bin_x2 = *bin_x2 < width ? *bin_x2 : width; + + *bin_y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1; + *bin_y2 = *bin_y2 > 0 ? *bin_y2 : 0; + *bin_y2 = *bin_y2 < height ? *bin_y2 : height; + + *input_base = input_v + batch_index * height * width * channels; + *bin_wdim = *bin_x2 - *bin_x1; + *bin_hdim = *bin_y2 - *bin_y1; + *bin_dims = (*bin_hdim) * (*bin_wdim); + *is_empty = (*bin_y2 <= *bin_y1) || (*bin_x2 <= *bin_x1); +} + +template +__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch, + int channels, int height, int width, + int p_height, int p_width, int rois_num, + T spatial_scale, T *output_v, int *argmax) { + /* + * NRAM partition + * |---------------------------------------------------| + * | ping | + * |---------------------------------------------------| + * | pong | + * |---------------------------------------------------| + * | out | + * |---------------------------------------------------| + * | argmax | + * |---------------------------------------------------| + * | a | + * |---------------------------------------------------| + * | b | + * |---------------------------------------------------| + */ + uint32_t is_half = sizeof(T) == sizeof(half) ? 
true : false; + uint32_t t_size = sizeof(T); + uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float); + uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half); + + uint32_t channels_align = PAD_UP(channels, float_div); + uint32_t nram_limit = PAD_DOWN( + (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div); + + // nram PING/PONG, output, argamx, a, b + float *nram_ping = (float *)nram_buffer; + float *nram_pong = (float *)nram_buffer + nram_limit; + float *nram_out = (float *)nram_buffer + 2 * nram_limit; + float *nram_argmax = nram_out + channels_align; + float *nram_a = nram_out + 2 * channels_align; + float *nram_b = nram_out + 3 * channels_align; + + uint32_t c_bins_num = rois_num * p_height * p_width; + uint32_t task_bins = c_bins_num / taskDim; + uint32_t rem_bins = c_bins_num % taskDim; + if (taskId < rem_bins) { + task_bins += 1; + } + int bin_first = + (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId); + int bins_loop = bin_first + task_bins; + + T *input_base = NULL; + T *output_base = output_v + bin_first * channels; + int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL; + int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims; + int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims; + bool is_empty = false; + bool pong_is_empty = false; + bool is_first_bin = true; + uint32_t src_offset = 0; + uint32_t dst_offset = 0; + uint32_t nram_offset = 0; + uint32_t half_offset = + is_half ? (nram_limit / 2 / half_div * half_div) * 2 : 0; + float *nram_tmp = NULL; + + uint32_t c_slice = 0; + uint32_t c_slice_align = 0; + uint32_t pongc_slice = 0; + uint32_t pongc_slice_align = 0; + for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) { + getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels, + p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1, + &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims, + &input_base, &is_empty); + uint32_t c_rem = channels; + c_slice = nram_limit / bin_dims / float_div * float_div; + + if (is_first_bin && !is_empty) { + c_slice = c_slice > c_rem ? c_rem : c_slice; + c_slice_align = PAD_UP(c_slice, float_div); + for (int h = bin_y1; h < bin_y2; h++) { + src_offset = (h * width + bin_x1) * channels; + nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset; + if (c_slice_align == channels) { + __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, + bin_wdim * c_slice * t_size, GDRAM2NRAM); + } else { + __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset, + c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size, + channels * t_size, bin_wdim - 1); + } + } + } + uint32_t c_offset = 0; + while (c_rem > 0) { + c_slice = c_slice > c_rem ? c_rem : c_slice; + c_slice_align = PAD_UP(c_slice, float_div); + + /*__memcpy_async*/ + if (c_rem - c_slice > 0 && !is_empty) { + pongc_slice = c_rem - c_slice > c_slice ? 
c_slice : c_rem - c_slice; + pongc_slice_align = PAD_UP(pongc_slice, float_div); + for (int h = bin_y1; h < bin_y2; h++) { + src_offset = (h * width + bin_x1) * channels + c_offset; + nram_offset = + (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset; + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset + c_slice, + pongc_slice * t_size, GDRAM2NRAM, + pongc_slice_align * t_size, channels * t_size, + bin_wdim - 1); + } + } else if (bin_i + 1 < bins_loop) { + getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width, + channels, p_height, p_width, (T)spatial_scale, &pbin_x1, + &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim, + &pbin_dims, &input_base, &pong_is_empty); + pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div); + pongc_slice = pongc_slice > channels ? channels : pongc_slice; + pongc_slice_align = PAD_UP(pongc_slice, float_div); + if (!pong_is_empty) { + for (int h = pbin_y1; h < pbin_y2; h++) { + src_offset = (h * width + pbin_x1) * channels; + nram_offset = + (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset; + if (pongc_slice_align == channels) { + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset, + pbin_wdim * pongc_slice * t_size, GDRAM2NRAM); + } else { + __memcpy_async((T *)nram_pong + nram_offset, + (T *)input_base + src_offset, pongc_slice * t_size, + GDRAM2NRAM, pongc_slice_align * t_size, + channels * t_size, pbin_wdim - 1); + } + } + } + } + + if (is_empty) { + __nramset((T *)nram_out, c_slice_align, (T)0); + __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, + c_slice * t_size, NRAM2GDRAM); + if (NULL != argmax) { + __nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1)); + __memcpy((int32_t *)argmax_base + dst_offset + c_offset, + (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM); + } + } else { + if (is_half) { + uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div); + __bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset, + bin_align64); + } + __bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align, + bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1); + if (is_half) { + uint32_t c_align64 = PAD_UP(c_slice_align, half_div); + __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64); + } + __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out, + c_slice * t_size, NRAM2GDRAM); + if (NULL != argmax) { + /*compute max_index*/ + __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping, + c_slice_align, bin_hdim, bin_wdim, bin_hdim, + bin_wdim, 1, 1); + convertInt2Float((float *)nram_argmax, (float *)nram_a, + (int32_t *)nram_out, (float *)nram_b, c_slice_align); + + /*compute input_h*/ + for (int i = 0; i < c_slice; i++) { + nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim); + } + __bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1, + c_slice_align); + __bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width, + c_slice_align); + + /*compute input_w*/ + __bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim, + c_slice_align); + __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a, + c_slice_align); + __bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1, + c_slice_align); + __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a, + c_slice_align); + convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a, + (float *)nram_out, (float *)nram_b, c_slice_align); + __memcpy((int32_t *)argmax_base + dst_offset + 
c_offset, + (int32_t *)nram_argmax, c_slice * sizeof(int32_t), + NRAM2GDRAM); + } + } + nram_tmp = nram_ping; + nram_ping = nram_pong; + nram_pong = nram_tmp; + c_offset += c_slice; + c_rem -= c_slice; + __asm__ volatile("sync;"); + } + dst_offset += channels; + is_first_bin = false; + } +} + +__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type, + const void *input_data, + const void *input_rois, int batch, + int channels, int height, int width, + int pooled_height, int pooled_width, + int rois_num, float spatial_scale, + void *output_data, int *argmax) { + switch (data_type) { + case CNRT_FLOAT16: { + MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels, + height, width, pooled_height, pooled_width, rois_num, + (half)spatial_scale, (half *)output_data, argmax); + }; break; + case CNRT_FLOAT32: { + MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch, + channels, height, width, pooled_height, pooled_width, + rois_num, (float)spatial_scale, (float *)output_data, + argmax); + }; break; + default: { + break; + } + } +} +} // namespace forward + +namespace backward { +// Convert index of argmax from global grads_image to local bin in RoI. Vector +// operations do not support int type, so conversion from int to float is +// performed here. +__mlu_func__ void convertIndex( + int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1, + int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int, + int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w, + int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w, + float *nram_atomic_add, float *nram_grads_image, int width, int height, + int wstart, int hstart, int w_compute, int h_compute, int align_c, + int channels, int loop_flag, int loop_id, int true_limit) { + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); + + // This step uses scalar division, because the above vector division causes + // rounding accuracy problem. + for (int i = 0; i < channels; ++i) { + *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width; + } + + // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width' + // operation. 
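+  // Overall the global argmax index is split into h = argmax / width and
+  // w = argmax - h * width, then mapped to the bin-local offset
+  // (h - hstart) * w_compute + (w - wstart) that __bang_maxpool_bp expects.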
+ convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1, + (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2, + align_c); + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2, + align_c); + + // Perform 'temp_result - hstart' operation + __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart, + align_c); + + // Perform 'temp_result1 - temp_result2 * width' operation + __bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width, + align_c); + convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1, + (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c); + __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, + (float *)nram_argmax_fp_w, align_c); + + // Perform 'temp_result - wstart' operation + __bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart, + align_c); + + // Perform 'temp_result = h * w_compute + w' operation + __bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + w_compute, align_c); + __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (float *)nram_argmax_fp_w, align_c); + + if (loop_flag == 1) { + __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h, + (loop_id * true_limit), align_c); + } + convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1, + (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2, + align_c); +} + +template +__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads, + const int32_t *argmax, T *grads_image, + int channels, int height, int width, + int pooled_height, int pooled_width, + int rois_num, const T spatial_scale, + int high_precision) { + // Calculate the number of rois processed by each core + int bin_num = rois_num * pooled_height * pooled_width; + int loop = + (bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim); + int tid = taskId * loop; + if (bin_num % taskDim != 0) { + if (tid >= bin_num) { + return; + } else { + // last part is (bin_num - tid). + loop = bin_num - tid < loop ? bin_num - tid : loop; + } + } + int align_c = PAD_UP(channels, ALIGN_SIZE); + // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM. + int data_size = + PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c - + (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) / + 2), + ALIGN_SIZE); + int hw_limit = data_size / align_c; + float *nram_grads = (float *)nram_buffer; + for (int idx = tid; idx < tid + loop; ++idx) { + // (n, ph, pw) is a C in the pooled output + int pw = idx % pooled_width; + int ph = (idx / pooled_width) % pooled_height; + int n = idx / pooled_width / pooled_height; + + const T *offset_rois = (const T *)(rois + n * 5); + int roi_batch_ind = int(offset_rois[0]); + // Calculate the roi region on feature maps + int roi_start_w = round(offset_rois[1] * spatial_scale); + int roi_start_h = round(offset_rois[2] * spatial_scale); + int roi_end_w = round(offset_rois[3] * spatial_scale); + int roi_end_h = round(offset_rois[4] * spatial_scale); + // Force malformed rois to 1x1 + int roi_width = + roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1; + int roi_height = + roi_end_h - roi_start_h + 1 > 1 ? 
roi_end_h - roi_start_h + 1 : 1; + T bin_size_h = (T)roi_height / (T)pooled_height; + T bin_size_w = (T)roi_width / (T)pooled_width; + + // The corresponding bin region + int hstart = int(floor((T)ph * bin_size_h)); + int wstart = int(floor((T)pw * bin_size_w)); + int hend = int(ceil((T)(ph + 1) * bin_size_h)); + int wend = int(ceil((T)(pw + 1) * bin_size_w)); + + // Add roi offsets and clip to input boundaries, min(max(A, B), C); + hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0; + hstart = hstart < height ? hstart : height; + hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0; + hend = hend < height ? hend : height; + wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0; + wstart = wstart < width ? wstart : width; + wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0; + wend = wend < width ? wend : width; + + bool is_empty = (hend <= hstart) || (wend <= wstart); + if (!is_empty) { + int h_compute = hend - hstart; + int w_compute = wend - wstart; + int true_limit = + hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute; + int loop_int = (h_compute * w_compute) / true_limit; + int rem = (h_compute * w_compute) % true_limit; + int32_t *nram_argmax = (int32_t *)nram_grads + align_c; + int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c; + int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; + int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; + int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; + int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; + int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; + int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; + int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; + float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c; + float *nram_grads_image = (float *)nram_atomic_add + align_c; + if (true_limit == h_compute * w_compute) { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |---------------------------------------------------| + * | argmax_temp | + * |---------------------------------------------------| + * | atomic_add | + * |---------------------------------------------------| + * | grads_image | + * |---------------------------------------------------| + */ + + // Load the data from GDRAM to NRAM. + __memcpy((T *)nram_grads + align_c * high_precision, + (const T *)grads + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + align_c * high_precision, + align_c); + } + + __memcpy((int32_t *)nram_argmax, + (const int32_t *)argmax + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(int32_t), GDRAM2NRAM); + + // Perform pooling operation on NRAM. 
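+        // convertIndex turns the global argmax indices into bin-local ones, and
+        // __bang_maxpool_bp then routes each channel's gradient to the position
+        // inside the h_compute x w_compute bin that produced the maximum.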
+ convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart, + hstart, w_compute, h_compute, align_c, channels, 0, 0, 0); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, align_c, h_compute, + w_compute, h_compute, w_compute, h_compute, + w_compute); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, + h_compute * w_compute * align_c); + } + + // Store the result on NRAM back to GDRAM. + for (int hc = 0; hc < h_compute; ++hc) { + for (int wc = 0; wc < w_compute; ++wc) { + T *dst = (T *)nram_atomic_add; + int grad_image_offset = (roi_batch_ind * height * width + + (hc + hstart) * width + wc + wstart) * + channels; + T *src1 = (T *)grads_image + grad_image_offset; + int nram_grads_image_offset = (hc * w_compute + wc) * align_c; + T *src2 = (T *)nram_grads_image + nram_grads_image_offset; + __bang_atomic_add(dst, src1, src2, channels); + } + } + } else if (true_limit > 0) { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |--------------------ping_pong----------------------| + * | argmax_temp | argmax_temp | + * |------------------------|--------------------------| + * | atomic_add | atomic_add | + * |------------------------|--------------------------| + * | grads_image | grads_image | + * |---------------------------------------------------| + */ + + // Load the data from GDRAM to NRAM. + __memcpy((T *)nram_grads + align_c * high_precision, + (const T *)grads + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + align_c * high_precision, + align_c); + } + __memcpy((int32_t *)nram_argmax, + (const int32_t *)argmax + (n * pooled_height * pooled_width + + ph * pooled_width + pw) * + channels, + channels * sizeof(int32_t), GDRAM2NRAM); + + int ping_pong = 0; + int ping_pong_offset = + (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2; + for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { + int size = (loop_id == loop_int) ? rem : true_limit; + if (size == 0) { + break; + } + // Perform pooling operation on NRAM. + nram_argmax_fp = + (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset; + nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c; + nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c; + nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c; + nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c; + nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c; + nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c; + nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c; + nram_atomic_add = (float *)nram_argmax_fp_w + align_c; + nram_grads_image = (float *)nram_atomic_add + align_c; + int loop_id_1 = loop_id; + int size_1 = ((loop_id_1) == loop_int) ? 
rem : true_limit; + if (size_1 == 0) { + break; + } + convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart, + hstart, w_compute, h_compute, align_c, channels, 1, + loop_id_1, true_limit); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, align_c, size_1, 1, + size_1, 1, size_1, 1); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, size_1 * align_c); + } + + // Store the result on NRAM back to GDRAM. + for (int index_size = 0; index_size < size; ++index_size) { + int h = (loop_id * true_limit + index_size) / w_compute; + int w = (loop_id * true_limit + index_size) % w_compute; + T *dst = (T *)nram_atomic_add; + T *grads_image_n = + (T *)grads_image + roi_batch_ind * height * width * channels; + T *src1 = (T *)grads_image_n + + ((h + hstart) * width + (w + wstart)) * channels; + T *src2 = (T *)nram_grads_image + index_size * align_c; + __bang_atomic_add(dst, src1, src2, channels); + } + ping_pong = 1 - ping_pong; + } + } else { + /* + * NRAM partition + * |---------------------------------------------------| + * | grads | + * |---------------------------------------------------| + * | argmax | + * |--------------------ping_pong----------------------| + * | argmax_temp | argmax_temp | + * |------------------------|--------------------------| + * | atomic_add | atomic_add | + * |------------------------|--------------------------| + * | grads_image | grads_image | + * |---------------------------------------------------| + */ + + int c_limit = + PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) / + (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2), + ALIGN_SIZE); + int loop_int = channels / c_limit; + int rem = channels % c_limit; + int ping_pong = 0; + int ping_pong_offset = + (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2; + for (int loop_id = 0; loop_id <= loop_int; ++loop_id) { + int size = (loop_id == loop_int) ? rem : c_limit; + if (size == 0) { + break; + } + nram_argmax_fp = + (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset; + nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit; + nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit; + nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit; + nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit; + nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit; + nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit; + nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit; + nram_atomic_add = (float *)nram_argmax_fp_w + c_limit; + nram_grads_image = (float *)nram_atomic_add + c_limit; + + // This pipeline loads the data from GDRAM to NRAM. 
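          // Hedged reference for the GDRAM offset used in the copies below:
          // the gradient of this output bin starts at
          //   grads[(n * pooled_height * pooled_width + ph * pooled_width + pw) * channels
          //         + loop_id * c_limit]
          // and only `size` channels (c_limit, or the remainder on the last
          // iteration) are staged per loop, alternating between the two
          // ping-pong NRAM regions.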
+ __memcpy((T *)nram_grads + c_limit * high_precision, + (const T *)grads + + n * pooled_height * pooled_width * channels + + ph * pooled_width * channels + pw * channels + + loop_id * c_limit, + size * sizeof(T), GDRAM2NRAM); + if (high_precision) { + __bang_half2float((float *)nram_grads, + (half *)nram_grads + c_limit * high_precision, + c_limit); + } + __memcpy((int32_t *)nram_argmax, + (const int32_t *)argmax + + n * pooled_height * pooled_width * channels + + ph * pooled_width * channels + pw * channels + + loop_id * c_limit, + size * sizeof(int32_t), GDRAM2NRAM); + + for (int hc = 0; hc < h_compute; ++hc) { + for (int wc = 0; wc < w_compute; ++wc) { + // This pipeline performs pooling operation on NRAM. + convertIndex( + nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1, + nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h, + nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w, + nram_atomic_add, nram_grads_image, width, height, wstart + wc, + hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0); + __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads, + (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1, + 1, 1); + if (high_precision) { + __bang_float2half_rd((half *)nram_grads_image, + (float *)nram_grads_image, c_limit); + } + // This pipeline stores the result on NRAM back to GDRAM. + T *dst = (T *)nram_atomic_add; + T *grads_image_n = + (T *)grads_image + roi_batch_ind * height * width * channels; + T *src1 = (T *)grads_image_n + + ((hc + hstart) * width + (wc + wstart)) * channels + + loop_id * c_limit; + T *src2 = (T *)nram_grads_image; + __bang_atomic_add(dst, src1, src2, size); + } + } + ping_pong = 1 - ping_pong; + } + } + } + } +} + +__mlu_global__ void MLUKernelRoiPoolBackward( + const void *grads, const void *rois, const int *argmax, void *grads_image, + int rois_num, int pooled_height, int pooled_width, int channels, int no, + int height, int width, const float spatial_scale, + const cnrtDataType_t k_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (k_dtype) { + case CNRT_FLOAT16: { + // Using the float type '__bang_max_pool_bp' instruction to increase the + // bit width. 
+ const int high_precision = 1; + MLUUnion1Roipool((const half *)rois, (const half *)grads, + (const int32_t *)argmax, (half *)grads_image, channels, + height, width, pooled_height, pooled_width, rois_num, + (const half)spatial_scale, high_precision); + }; break; + case CNRT_FLOAT32: { + const int high_precision = 0; + MLUUnion1Roipool((const float *)rois, (const float *)grads, + (const int32_t *)argmax, (float *)grads_image, channels, + height, width, pooled_height, pooled_width, rois_num, + (const float)spatial_scale, high_precision); + }; break; + default: { + break; + } + } +} +} // namespace backward + +void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, cnrtDataType_t data_type, + const void *input_data, const void *input_rois, + const int batch, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const int rois_num, + const float spatial_scale, void *output_data, + int *argmax) { + forward::MLUKernelRoiPool<<>>( + data_type, input_data, input_rois, batch, channels, height, width, + pooled_height, pooled_width, rois_num, spatial_scale, output_data, + argmax); +} + +void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, + cnrtQueue_t queue, cnrtDataType_t k_dtype, + const void *grad_output_ptr, const void *rois_ptr, + const int *argmax_ptr, void *grad_input_ptr, + const int box_num, const int pooled_height, + const int pooled_width, const int channels, + const int batch, const int height, const int width, + const float spatial_scale) { + backward::MLUKernelRoiPoolBackward<<>>( + grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num, + pooled_height, pooled_width, channels, batch, height, width, + spatial_scale, k_dtype); +} diff --git a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu new file mode 100644 index 0000000000000000000000000000000000000000..7cb6df0e5d531afa6c2d548a6f3f7b8a8110da28 --- /dev/null +++ b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu @@ -0,0 +1,307 @@ +/************************************************************************* + * Copyright (C) 2022 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ *************************************************************************/ +#include "common_mlu_helper.hpp" + +__nram__ char data_nram[MAX_NRAM_SIZE]; + +template +__mlu_func__ void mluMultiKernelTinShift( + const T *input, const int *shifts, T *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel) { + for (int cur_channel_index = taskId; + cur_channel_index < batch_size * channel_size; + cur_channel_index += taskDim) { + int n_index = cur_channel_index / channel_size; + int group_id = cur_channel_index % channel_size / group_channel; + int t_shift = shifts[n_index * group_size + group_id]; + int index = cur_channel_index % channel_size * hw_size + + n_index * time_size * channel_size * hw_size; + __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __asm__ volatile("sync;"); + if (abs(t_shift) >= time_size) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } else { + if (t_shift > 0) { + __memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index, + hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), + channel_size * hw_size * sizeof(T), time_size - 1 - t_shift); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } else { + __memcpy(data_nram, input + (index - t_shift * channel_size * hw_size), + hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T), + channel_size * hw_size * sizeof(T), time_size - 1 + t_shift); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + time_size - 1); + } + } + __asm__ volatile("sync;"); + } +} + +template +__mlu_func__ void mluHwSplit(const T *input, const int t_shift, + const int time_size, const int hw_size, + const int channel_size, const int index, + const int cur_sequence_index, + const int max_length_per_core, T *output) { + for (int cur_index = index; cur_index < index + hw_size; + cur_index += max_length_per_core) { + int memcpy_size = max_length_per_core; + if (cur_index + max_length_per_core > index + hw_size) { + memcpy_size = index + hw_size - cur_index; + } + if (cur_sequence_index - t_shift < 0 || + cur_sequence_index - t_shift >= time_size) { + __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), + NRAM2GDRAM); + } else { + __memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size, + memcpy_size * sizeof(T), GDRAM2NRAM); + __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T), + NRAM2GDRAM); + } + __asm__ volatile("sync;"); + } +} + +template +__mlu_func__ void mluMultiKernelTinShiftSplitSequence( + const T *input, const int *shifts, T *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const int max_number_hw_per_core, const int max_length_per_core) { + const int tmp_max_number_hw_per_core = + max_number_hw_per_core > 0 ? max_number_hw_per_core : 1; + const int loop_time = time_size / tmp_max_number_hw_per_core + + ((time_size % tmp_max_number_hw_per_core) > 0 ? 
1 : 0); + int segmentime_size = tmp_max_number_hw_per_core; + int res_segment = time_size % tmp_max_number_hw_per_core; + + for (int cur_segment_index = taskId; + cur_segment_index < loop_time * batch_size * channel_size; + cur_segment_index += taskDim) { + int n_index = cur_segment_index / loop_time / channel_size; + int group_id = cur_segment_index / loop_time % channel_size / group_channel; + int t_shift = shifts[n_index * group_size + group_id]; + int index = n_index * time_size * channel_size * hw_size + + (cur_segment_index / loop_time % channel_size) * hw_size + + cur_segment_index % loop_time * segmentime_size * hw_size * + channel_size; + char *dst_gdram2nram = data_nram; + const T *src_gdram2nram = input + index; + int count_gdram2nram = -1; + int count_nram2gdram = -1; + int next_sequence_index = + index / hw_size / channel_size % time_size + segmentime_size; + int cur_sequence_index = index / hw_size / channel_size % time_size; + __nramset(data_nram, MAX_NRAM_SIZE, (char)0); + __asm__ volatile("sync;"); + if (max_number_hw_per_core == 0) { + mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index, + cur_sequence_index, max_length_per_core, output); + continue; + } + if (abs(t_shift) >= time_size) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + res_segment - 1); + } else { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + } + continue; + } + if (t_shift == 0) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index; + count_gdram2nram = res_segment - 1; + count_nram2gdram = res_segment - 1; + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index; + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } + } else if (t_shift > 0) { + int first_index_cur_channel = + n_index * time_size * channel_size * hw_size + + (cur_segment_index / loop_time % channel_size) * hw_size; + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + dst_gdram2nram = data_nram; + src_gdram2nram = + input + + (index - t_shift * channel_size * hw_size < first_index_cur_channel + ? 
first_index_cur_channel + : index - t_shift * channel_size * hw_size); + count_gdram2nram = res_segment - 1; + count_nram2gdram = res_segment - 1; + if (cur_sequence_index < t_shift && t_shift < next_sequence_index) { + dst_gdram2nram = + data_nram + t_shift % segmentime_size * hw_size * sizeof(T); + count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1; + } + } else { + if (t_shift >= next_sequence_index) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + continue; + } else if (cur_sequence_index < t_shift && + t_shift < next_sequence_index) { + dst_gdram2nram = + data_nram + t_shift % segmentime_size * hw_size * sizeof(T); + src_gdram2nram = input + first_index_cur_channel; + count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1; + count_nram2gdram = segmentime_size - 1; + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index - t_shift * channel_size * hw_size; + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } + } + } else { + int offset_index = time_size + t_shift; + if (cur_sequence_index >= offset_index) { + if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + res_segment - 1); + continue; + } else { + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + segmentime_size - 1); + continue; + } + } else { + dst_gdram2nram = data_nram; + src_gdram2nram = input + index - t_shift * channel_size * hw_size; + if (cur_sequence_index - t_shift + segmentime_size < time_size) { + count_gdram2nram = segmentime_size - 1; + count_nram2gdram = segmentime_size - 1; + } else { + count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1; + count_nram2gdram = + (segmentime_size - 1) < (time_size - cur_sequence_index - 1) + ? 
(segmentime_size - 1) + : (time_size - cur_sequence_index - 1); + } + } + } + __memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM, + hw_size * sizeof(T), channel_size * hw_size * sizeof(T), + count_gdram2nram); + __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM, + channel_size * hw_size * sizeof(T), hw_size * sizeof(T), + count_nram2gdram); + __asm__ volatile("sync;"); + } +} + +__mlu_entry__ void MLUUnion1KernelTinShift( + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const cnrtDataType_t data_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (data_dtype) { + case CNRT_FLOAT16: { + mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output, + batch_size, time_size, channel_size, hw_size, + group_size, group_channel); + }; break; + case CNRT_FLOAT32: { + mluMultiKernelTinShift((float *)input, (const int *)shifts, + (float *)output, batch_size, time_size, + channel_size, hw_size, group_size, group_channel); + }; break; + default: { return; } + } +} + +__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence( + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const int max_number_hw_per_core, const int max_length_per_core, + const cnrtDataType_t data_dtype) { + // make sure that memcore is not used + if (coreId == 0x80) { + return; + } + switch (data_dtype) { + case CNRT_FLOAT16: { + mluMultiKernelTinShiftSplitSequence( + (half *)input, (const int *)shifts, (half *)output, batch_size, + time_size, channel_size, hw_size, group_size, group_channel, + max_number_hw_per_core, max_length_per_core); + }; break; + case CNRT_FLOAT32: { + mluMultiKernelTinShiftSplitSequence( + (float *)input, (const int *)shifts, (float *)output, batch_size, + time_size, channel_size, hw_size, group_size, group_channel, + max_number_hw_per_core, max_length_per_core); + }; break; + default: { return; } + } +} + +void KernelTinShiftForward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *input, const void *shifts, void *output, const int batch_size, + const int time_size, const int channel_size, const int hw_size, + const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core) { + if (channel_per_core >= 1) { + MLUUnion1KernelTinShift<<>>( + input, shifts, output, batch_size, time_size, channel_size, hw_size, + group_size, group_channel, data_dtype); + } else { + MLUUnion1KernelTinShiftSplitSequence<<>>( + input, shifts, output, batch_size, time_size, channel_size, hw_size, + group_size, group_channel, max_number_hw_per_core, max_length_per_core, + data_dtype); + } +} + +void KernelTinShiftBackward( + cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue, + const void *grad_output, const void *shifts, void *grad_input, + const int batch_size, const int time_size, const int channel_size, + const int hw_size, const int group_size, const int group_channel, + const cnrtDataType_t data_dtype, const int channel_per_core, + const int max_number_hw_per_core, const int max_length_per_core) { + if (channel_per_core >= 1) { + MLUUnion1KernelTinShift<<>>( + grad_output, shifts, grad_input, 
batch_size, time_size, channel_size, + hw_size, group_size, group_channel, data_dtype); + } else { + MLUUnion1KernelTinShiftSplitSequence<<>>( + grad_output, shifts, grad_input, batch_size, time_size, channel_size, + hw_size, group_size, group_channel, max_number_hw_per_core, + max_length_per_core, data_dtype); + } +} diff --git a/mmcv/ops/csrc/common/mps/MPSDevice.h b/mmcv/ops/csrc/common/mps/MPSDevice.h new file mode 100644 index 0000000000000000000000000000000000000000..e1d9d49618d7aea6a30b42630350c5a7b77ea0ac --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSDevice.h @@ -0,0 +1,64 @@ +// Copyright © 2022 Apple Inc. + +// This file is modify from: +// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h + +#pragma once +#include +#include +#include + +#ifdef __OBJC__ +#include +#include +#include +typedef id MTLDevice_t; +#else +typedef void* MTLDevice; +typedef void* MTLDevice_t; +#endif + +using namespace std; + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSDevice +// +// MPSDevice is a singleton class that returns the default device +//----------------------------------------------------------------- + +class TORCH_API MPSDevice { + public: + /** + * MPSDevice should not be cloneable. + */ + MPSDevice(MPSDevice& other) = delete; + /** + * MPSDevice should not be assignable. + */ + void operator=(const MPSDevice&) = delete; + /** + * Gets single instance of the Device. + */ + static MPSDevice* getInstance(); + /** + * Returns the single device. + */ + MTLDevice_t device() { return _mtl_device; } + + ~MPSDevice(); + + private: + static MPSDevice* _device; + MTLDevice_t _mtl_device; + MPSDevice(); +}; + +TORCH_API bool is_available(); + +TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false); + +} // namespace mps +} // namespace at diff --git a/mmcv/ops/csrc/common/mps/MPSLibrary.h b/mmcv/ops/csrc/common/mps/MPSLibrary.h new file mode 100644 index 0000000000000000000000000000000000000000..41c33fba8cbdd43cc5b3285603c11c6f9eee617b --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSLibrary.h @@ -0,0 +1,61 @@ +#ifndef _MPS_LIBRARY_H_ +#define _MPS_LIBRARY_H_ + +#include +#include + +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLComputePipelineState_t; +typedef id MTLLibrary_t; +#else +typedef void* MTLComputePipelineState; +typedef void* MTLComputePipelineState_t; +typedef void* MTLLibrary; +typedef void* MTLLibrary_t; +#endif + +class MPSLibrary { + public: + // disable constructor for singleton + static MPSLibrary* createFromUrl(const std::string& library_url); + static MPSLibrary* createFromSource(const std::string& source); + ~MPSLibrary(); + + MTLLibrary_t library() { return _library; } + + MTLComputePipelineState_t getComputePipelineState( + const std::string& function_name); + + private: + MTLLibrary_t _library; + std::unordered_map _pso_map; +}; + +class MPSLibraryManager { + public: + // disable constructor for singleton + MPSLibraryManager(const MPSLibraryManager&) = delete; + MPSLibraryManager& operator=(const MPSLibraryManager&) = delete; + MPSLibraryManager(MPSLibraryManager&&) = delete; + MPSLibraryManager& operator=(MPSLibraryManager&&) = delete; + + static MPSLibraryManager* getInstance(); + + bool hasLibrary(const std::string& name); + + MPSLibrary* getLibrary(const std::string& library_url); + + MPSLibrary* createLibraryFromSouce(const std::string& name, + const std::string& sources); + + ~MPSLibraryManager(); + + 
private: + MPSLibraryManager(); + std::unordered_map> _library_map; +}; +#endif diff --git a/mmcv/ops/csrc/common/mps/MPSLibrary.mm b/mmcv/ops/csrc/common/mps/MPSLibrary.mm new file mode 100644 index 0000000000000000000000000000000000000000..1a3d635ca95666e110a94b33315d94af16888b7c --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSLibrary.mm @@ -0,0 +1,110 @@ +#include "MPSLibrary.h" +#include +#include "MPSDevice.h" + +static std::unique_ptr mps_library_manager; +static c10::once_flag mpsdev_init; + +MPSLibraryManager* MPSLibraryManager::getInstance() { + c10::call_once(mpsdev_init, [] { + mps_library_manager = std::unique_ptr(new MPSLibraryManager()); + }); + return mps_library_manager.get(); +} + +MPSLibraryManager::~MPSLibraryManager() {} + +MPSLibraryManager::MPSLibraryManager() {} + +bool MPSLibraryManager::hasLibrary(const std::string& name) { + return _library_map.find(name) != _library_map.end(); +} + +MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) { + if (_library_map.find(library_url) != _library_map.end()) { + return _library_map[library_url].get(); + } + _library_map.emplace(std::make_pair( + library_url, std::unique_ptr(MPSLibrary::createFromUrl(library_url)))); + return _library_map[library_url].get(); +} + +MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name, + const std::string& source) { + NSString* ns_name = [NSString stringWithCString:name.c_str()]; + if (_library_map.find(name) != _library_map.end()) { + NSLog(@"Library %@ already exist.", ns_name); + return nullptr; + } + + _library_map.emplace( + std::make_pair(name, std::unique_ptr(MPSLibrary::createFromSource(source)))); + return _library_map[name].get(); +} + +MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* utl_str = [NSString stringWithCString:library_url.c_str()]; + NSURL* metal_url = [NSURL fileURLWithPath:utl_str]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) { + MPSLibrary* library = new MPSLibrary(); + @autoreleasepool { + NSError* error = nil; + + // load library and func + NSString* code_str = [NSString stringWithCString:sources.c_str()]; + library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str + options:nil + error:&error]; + if (library->_library == nil) { + NSLog(@"Failed to find library, error %@.", error); + exit(1); + } + } + + return library; +} + +MPSLibrary::~MPSLibrary() { + [_library release]; + _library = nil; +} + +MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) { + if (_pso_map.find(function_name) != _pso_map.end()) { + return _pso_map[function_name]; + } + + MTLComputePipelineState_t pso; + @autoreleasepool { + NSError* error = nil; + + // create function + NSString* function_name_str = [NSString stringWithCString:function_name.c_str()]; + id func = [_library newFunctionWithName:function_name_str]; + if (func == nil) { + NSLog(@"Failed to created pipeline state object, error %@.", error); + exit(1); + } + // create pipeline + pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func + error:&error]; + 
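    // Cache the pipeline state so later lookups of the same kernel name reuse
    // the compiled object instead of rebuilding it. A minimal (hypothetical)
    // call site, assuming the library was registered earlier:
    //   MPSLibrary *lib = MPSLibraryManager::getInstance()->getLibrary(library_url);
    //   MTLComputePipelineState_t pso = lib->getComputePipelineState("roi_align_forward");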
_pso_map.emplace(std::make_pair(function_name, pso)); + } + return _pso_map[function_name]; +} diff --git a/mmcv/ops/csrc/common/mps/MPSStream.h b/mmcv/ops/csrc/common/mps/MPSStream.h new file mode 100644 index 0000000000000000000000000000000000000000..54cd388494c8bbac636db44dd5c8afd1915357c6 --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSStream.h @@ -0,0 +1,132 @@ +// Copyright © 2022 Apple Inc. + +// This file is modify from: +// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h + +#pragma once + +#include +#include + +#include +#include +#include +#include "MPSDevice.h" + +#ifdef __OBJC__ +#include +#include +#include +#include +typedef id MTLCommandQueue_t; +typedef id MTLCommandBuffer_t; +typedef id MTLSharedEvent_t; +typedef id MTLDevice_t; +#else +typedef void* MTLCommandQueue_t; +typedef void* MTLCommandQueue; +typedef void* MTLCommandBuffer_t; +typedef void* MTLCommandBuffer; +typedef void* MTLSharedEvent_t; +typedef void* dispatch_queue_t; +typedef void* MTLDevice_t; +#define nil NULL; +#endif + +namespace at { +namespace mps { + +//----------------------------------------------------------------- +// MPSStream +//----------------------------------------------------------------- + +class TORCH_API MPSStream { + public: + enum Unchecked { UNCHECKED }; + /// Construct a MPSStream from a Stream. This construction is checked, + /// and will raise an error if the Stream is not, in fact, a MPS stream. + explicit MPSStream(Stream stream); + + ~MPSStream(); + MTLCommandQueue_t commandQueue() const { return _commandQueue; }; + dispatch_queue_t queue() const { return _serialQueue; } + + MTLCommandBuffer_t commandBuffer(); + void commit(bool flush); + void commitAndWait(); + void synchronize(); + + void flush(); + + /// Get the MPS device index that this stream is associated with. + c10::DeviceIndex device_index() const { return _stream.device_index(); } + + MTLCommandQueue_t stream() const { return _commandQueue; }; + + MTLDevice_t device() const { return [_commandQueue device]; } + + /// Explicit conversion to Stream. + Stream unwrap() const { return _stream; } + + private: + Stream _stream; + MTLCommandQueue_t _commandQueue = nil; + MTLCommandBuffer_t _commandBuffer = nil; + void _flush(bool commitAndWait) const; + + dispatch_queue_t _serialQueue = nullptr; +}; + +/** + * Get the current MPS stream + */ +TORCH_API MPSStream* getCurrentMPSStream(); + +/** + * Get the default MPS stream + */ +TORCH_API MPSStream* getDefaultMPSStream(); + +//----------------------------------------------------------------- +// MPSStreamImpl +//----------------------------------------------------------------- + +class TORCH_API MPSStreamImpl { + public: + /** + * Gets single instance of the MPSStream. 
+ */ + static MPSStream* getInstance(); + + private: + static MPSStream* _stream; + MPSStreamImpl(); +}; + +//----------------------------------------------------------------- +// MPSEvent +//----------------------------------------------------------------- + +struct TORCH_API MPSEvent { + MPSEvent(); + // MPSEvent(id device); + + ~MPSEvent(); + MTLSharedEvent_t event() const { return _event; } + + void recordEvent(MPSStream* stream); + void waitForEvent(MPSStream* queue); // waits on the cpu + bool queryEvent(); + uint64_t getCurrentValue() { return _currentValue; } + void setCurrentValue(uint64_t currValue) { _currentValue = currValue; } + + private: + bool _isRecorded = false; + uint64_t _currentValue = 0; + MTLSharedEvent_t _event; +}; + +typedef MPSEvent* mpsEvent_t; + +} // namespace mps +} // namespace at diff --git a/mmcv/ops/csrc/common/mps/MPSUtils.h b/mmcv/ops/csrc/common/mps/MPSUtils.h new file mode 100644 index 0000000000000000000000000000000000000000..2a4ce6d7978d566e88dd22ee4f9722df914ff0de --- /dev/null +++ b/mmcv/ops/csrc/common/mps/MPSUtils.h @@ -0,0 +1,51 @@ +#ifndef _MPS_UTILS_H_ +#define _MPS_UTILS_H_ +#include +#ifdef __OBJC__ +#include +#include +#include + +typedef id MTLBuffer_t; +typedef id MTLComputeCommandEncoder_t; +#else +typedef void* MTLBuffer; +typedef void* MTLBuffer_t; +typedef void* MTLComputeCommandEncoder; +typedef void* MTLComputeCommandEncoder_t; +#endif + +// utils +static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) { + return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data()); +} + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t); + +template , at::Tensor>::value, bool> = true> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index]; +} + +template , at::Tensor>::value, bool>> +void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) { + [encoder setBytes:&t length:sizeof(t) atIndex:index]; +} + +inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {} + +template +void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) { + setMTLArg(encoder, index, std::forward(t)); + setMTLArgsImpl(encoder, index + 1, std::forward(args)...); +} + +template +void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... 
args) { + [encoder setComputePipelineState:pso]; + setMTLArgsImpl(encoder, 0, std::forward(args)...); +} +#endif diff --git a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp index c7f9f35b7b0af6bc91052a1d038809e46c74effd..f68e8740561ef833c09e1ba9f999922f5d04bce5 100644 --- a/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp +++ b/mmcv/ops/csrc/common/pytorch_cpp_helper.hpp @@ -1,22 +1,25 @@ #ifndef PYTORCH_CPP_HELPER #define PYTORCH_CPP_HELPER -#include +#include #include using namespace at; -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) - #define CHECK_CUDA(x) \ TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_MLU(x) \ + TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor") #define CHECK_CPU(x) \ - TORCH_CHECK(!x.device().is_cuda(), #x " must be a CPU tensor") + TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_CUDA_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) +#define CHECK_MLU_INPUT(x) \ + CHECK_MLU(x); \ + CHECK_CONTIGUOUS(x) #define CHECK_CPU_INPUT(x) \ CHECK_CPU(x); \ CHECK_CONTIGUOUS(x) diff --git a/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp new file mode 100644 index 0000000000000000000000000000000000000000..72dbe5880bfed2bcebaf6b20c6f169639e34fa38 --- /dev/null +++ b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp @@ -0,0 +1,28 @@ +/************************************************************************* + * Copyright (C) 2021 Cambricon. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + *************************************************************************/ +#ifndef PYTORCH_MLU_HELPER_HPP_ +#define PYTORCH_MLU_HELPER_HPP_ + +#ifdef MMCV_WITH_MLU +#include "aten.h" + +#define NFU_ALIGN_SIZE 128 + +#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y)) + +#define PAD_DOWN(x, y) (((x) / (y)) * (y)) + +#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y)) + +#endif + +#endif // PYTORCH_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h b/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h new file mode 100644 index 0000000000000000000000000000000000000000..f23ff4482324c51012865c42f2a5f9e59d54848a --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/paramsgrid.h @@ -0,0 +1,70 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
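// paramsGrid builds the Cartesian product of its argument vectors as a vector
// of tuples, which is handy for sweeping operator configurations in tests.
// Minimal usage sketch (hypothetical values):
//   auto grid = paramsGrid(std::vector<int>{1, 2}, std::vector<float>{0.5f, 1.0f});
//   // grid.size() == 4 and grid[0] == std::make_tuple(1, 0.5f)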
+ +#ifndef PARAMS_GRID_H_ +#define PARAMS_GRID_H_ +#include +#include + +namespace detail { +template +int getTotalSize(std::vector arg) { + return arg.size(); +} + +template +int getTotalSize(std::vector arg, std::vector... args) { + return arg.size() * getTotalSize(args...); +} + +template +int getSize(std::vector arg) { + return arg.size(); +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg) { + std::get(src) = arg[counter[Idx]]; +} + +template +void assigner(TT &src, std::vector counter, std::vector &arg, + std::vector &... args) { + std::get(src) = arg[counter[Idx]]; + assigner(src, counter, args...); +} +} // namespace detail + +template +std::vector> paramsGrid(std::vector... args) { + int length = detail::getTotalSize(args...); + std::vector sizes = {detail::getSize(args)...}; + int size = sizes.size(); + + std::vector> params(length); + std::vector counter(size); + for (int i = 0; i < length; ++i) { + detail::assigner<0>(params[i], counter, args...); + counter[size - 1] += 1; + for (int c = size - 1; c >= 0; --c) { + if (counter[c] == sizes[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return params; +} + +#endif diff --git a/mmcv/ops/csrc/common/utils/spconv/prettyprint.h b/mmcv/ops/csrc/common/utils/spconv/prettyprint.h new file mode 100644 index 0000000000000000000000000000000000000000..0a6bdc3361dc1ada31fdebef87989672c9aeb51c --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/prettyprint.h @@ -0,0 +1,493 @@ +// Copyright Louis Delacroix 2010 - 2014. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +// A pretty printing library for C++ +// +// Usage: +// Include this header, and operator<< will "just work". + +#ifndef H_PRETTY_PRINT +#define H_PRETTY_PRINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pretty_print { +namespace detail { +// SFINAE type trait to detect whether T::const_iterator exists. + +struct sfinae_base { + using yes = char; + using no = yes[2]; +}; + +template +struct has_const_iterator : private sfinae_base { + private: + template + static yes &test(typename C::const_iterator *); + template + static no &test(...); + + public: + static const bool value = sizeof(test(nullptr)) == sizeof(yes); + using type = T; +}; + +template +struct has_begin_end : private sfinae_base { + private: + template + static yes & + f(typename std::enable_if< + std::is_same(&C::begin)), + typename C::const_iterator (C::*)() const>::value>::type *); + + template + static no &f(...); + + template + static yes &g(typename std::enable_if< + std::is_same(&C::end)), + typename C::const_iterator (C::*)() const>::value, + void>::type *); + + template + static no &g(...); + + public: + static bool const beg_value = sizeof(f(nullptr)) == sizeof(yes); + static bool const end_value = sizeof(g(nullptr)) == sizeof(yes); +}; + +} // namespace detail + +// Holds the delimiter values for a specific character type + +template +struct delimiters_values { + using char_type = TChar; + const char_type *prefix; + const char_type *delimiter; + const char_type *postfix; +}; + +// Defines the delimiter values for a specific container and character type + +template +struct delimiters { + using type = delimiters_values; + static const type values; +}; + +// Functor to print containers. You can use this directly if you want +// to specify a non-default delimiters type. 
The printing logic can +// be customized by specializing the nested template. + +template , + typename TDelimiters = delimiters> +struct print_container_helper { + using delimiters_type = TDelimiters; + using ostream_type = std::basic_ostream; + + template + struct printer { + static void print_body(const U &c, ostream_type &stream) { + using std::begin; + using std::end; + + auto it = begin(c); + const auto the_end = end(c); + + if (it != the_end) { + for (;;) { + stream << *it; + + if (++it == the_end) break; + + if (delimiters_type::values.delimiter != NULL) + stream << delimiters_type::values.delimiter; + } + } + } + }; + + print_container_helper(const T &container) : container_(container) {} + + inline void operator()(ostream_type &stream) const { + if (delimiters_type::values.prefix != NULL) + stream << delimiters_type::values.prefix; + + printer::print_body(container_, stream); + + if (delimiters_type::values.postfix != NULL) + stream << delimiters_type::values.postfix; + } + + private: + const T &container_; +}; + +// Specialization for pairs + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + + static void print_body(const std::pair &c, ostream_type &stream) { + stream << c.first; + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + stream << c.second; + } +}; + +// Specialization for tuples + +template +template +struct print_container_helper::printer> { + using ostream_type = + typename print_container_helper::ostream_type; + using element_type = std::tuple; + + template + struct Int {}; + + static void print_body(const element_type &c, ostream_type &stream) { + tuple_print(c, stream, Int<0>()); + } + + static void tuple_print(const element_type &, ostream_type &, + Int) {} + + static void tuple_print( + const element_type &c, ostream_type &stream, + typename std::conditional, + std::nullptr_t>::type) { + stream << std::get<0>(c); + tuple_print(c, stream, Int<1>()); + } + + template + static void tuple_print(const element_type &c, ostream_type &stream, Int) { + if (print_container_helper::delimiters_type::values + .delimiter != NULL) + stream << print_container_helper::delimiters_type::values + .delimiter; + + stream << std::get(c); + + tuple_print(c, stream, Int()); + } +}; + +// Prints a print_container_helper to the specified stream. 
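// Combined with the std::operator<< overload at the end of this header, any
// type for which is_container<T>::value holds can be streamed directly with
// the default "[", ", ", "]" delimiters, e.g. (hypothetical):
//   std::vector<int> v{1, 2, 3};
//   std::cout << v;   // prints [1, 2, 3]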
+ +template +inline std::basic_ostream &operator<<( + std::basic_ostream &stream, + const print_container_helper &helper) { + helper(stream); + return stream; +} + +// Basic is_container template; specialize to derive from std::true_type for all +// desired container types + +template +struct is_container + : public std::integral_constant::value && + detail::has_begin_end::beg_value && + detail::has_begin_end::end_value> {}; + +template +struct is_container : std::true_type {}; + +template +struct is_container : std::false_type {}; + +template +struct is_container> : std::true_type {}; + +template +struct is_container> : std::true_type {}; + +template +struct is_container> : std::true_type {}; + +// Default delimiters + +template +struct delimiters { + static const delimiters_values values; +}; +template +const delimiters_values delimiters::values = {"[", ", ", "]"}; +template +struct delimiters { + static const delimiters_values values; +}; +template +const delimiters_values delimiters::values = {L"[", L", ", + L"]"}; + +// Delimiters for (multi)set and unordered_(multi)set + +template +struct delimiters<::std::set, char> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::set, char>::values = {"{", ", ", + "}"}; + +template +struct delimiters<::std::set, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::set, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::multiset, char> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::multiset, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::multiset, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::multiset, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::unordered_set, char> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_set, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::unordered_set, wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_set, wchar_t>::values = { + L"{", L", ", L"}"}; + +template +struct delimiters<::std::unordered_multiset, + char> { + static const delimiters_values values; +}; + +template +const delimiters_values delimiters< + ::std::unordered_multiset, char>::values = { + "{", ", ", "}"}; + +template +struct delimiters<::std::unordered_multiset, + wchar_t> { + static const delimiters_values values; +}; + +template +const delimiters_values + delimiters<::std::unordered_multiset, + wchar_t>::values = {L"{", L", ", L"}"}; + +// Delimiters for pair and tuple + +template +struct delimiters, char> { + static const delimiters_values values; +}; +template +const delimiters_values delimiters, char>::values = { + "(", ", ", ")"}; +template +struct delimiters<::std::pair, wchar_t> { + static const delimiters_values values; +}; +template +const delimiters_values + delimiters<::std::pair, wchar_t>::values = {L"(", L", ", L")"}; + +template +struct delimiters, char> { + static const delimiters_values values; +}; +template +const delimiters_values delimiters, char>::values = { + "(", ", ", ")"}; +template +struct delimiters<::std::tuple, wchar_t> { + static const delimiters_values values; +}; +template +const delimiters_values + delimiters<::std::tuple, 
wchar_t>::values = {L"(", L", ", L")"}; + +// Type-erasing helper class for easy use of custom delimiters. +// Requires TCharTraits = std::char_traits and TChar = char or wchar_t, +// and MyDelims needs to be defined for TChar. Usage: "cout << +// pretty_print::custom_delims(x)". + +struct custom_delims_base { + virtual ~custom_delims_base() {} + virtual std::ostream &stream(::std::ostream &) = 0; + virtual std::wostream &stream(::std::wostream &) = 0; +}; + +template +struct custom_delims_wrapper : custom_delims_base { + custom_delims_wrapper(const T &t_) : t(t_) {} + + std::ostream &stream(std::ostream &s) { + return s << print_container_helper, Delims>( + t); + } + + std::wostream &stream(std::wostream &s) { + return s << print_container_helper, + Delims>(t); + } + + private: + const T &t; +}; + +template +struct custom_delims { + template + custom_delims(const Container &c) + : base(new custom_delims_wrapper(c)) {} + + std::unique_ptr base; +}; + +template +inline std::basic_ostream &operator<<( + std::basic_ostream &s, const custom_delims &p) { + return p.base->stream(s); +} + +// A wrapper for a C-style array given as pointer-plus-size. +// Usage: std::cout << pretty_print_array(arr, n) << std::endl; + +template +struct array_wrapper_n { + typedef const T *const_iterator; + typedef T value_type; + + array_wrapper_n(const T *const a, size_t n) : _array(a), _n(n) {} + inline const_iterator begin() const { return _array; } + inline const_iterator end() const { return _array + _n; } + + private: + const T *const _array; + size_t _n; +}; + +// A wrapper for hash-table based containers that offer local iterators to each +// bucket. Usage: std::cout << bucket_print(m, 4) << std::endl; (Prints bucket +// 5 of container m.) + +template +struct bucket_print_wrapper { + typedef typename T::const_local_iterator const_iterator; + typedef typename T::size_type size_type; + + const_iterator begin() const { return m_map.cbegin(n); } + + const_iterator end() const { return m_map.cend(n); } + + bucket_print_wrapper(const T &m, size_type bucket) : m_map(m), n(bucket) {} + + private: + const T &m_map; + const size_type n; +}; + +} // namespace pretty_print + +// Global accessor functions for the convenience wrappers + +template +inline pretty_print::array_wrapper_n pretty_print_array(const T *const a, + size_t n) { + return pretty_print::array_wrapper_n(a, n); +} + +template +pretty_print::bucket_print_wrapper bucket_print(const T &m, + typename T::size_type n) { + return pretty_print::bucket_print_wrapper(m, n); +} + +// Main magic entry point: An overload snuck into namespace std. +// Can we do better? + +namespace std { +// Prints a container to the stream using default delimiters + +template +inline typename enable_if<::pretty_print::is_container::value, + basic_ostream &>::type +operator<<(basic_ostream &stream, const T &container) { + return stream + << ::pretty_print::print_container_helper( + container); +} +} // namespace std + +#endif // H_PRETTY_PRINT diff --git a/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h b/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..026e35b1a6b52ec74fee27fbccd2dfda5ef845ce --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/pybind11_utils.h @@ -0,0 +1,60 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace py = pybind11; + +template +std::vector array2Vector(TPyObject arr) { + py::array arr_np = arr; + size_t size = arr.attr("size").template cast(); + py::array_t arr_cc = arr_np; + std::vector data(arr_cc.data(), arr_cc.data() + size); + return data; +} + +template +std::vector arrayT2Vector(py::array_t arr) { + std::vector data(arr.data(), arr.data() + arr.size()); + return data; +} + +template +tv::TensorView array2TensorView(TPyObject arr) { + py::array arr_np = arr; + py::array_t arr_cc = arr_np; + tv::Shape shape; + for (int i = 0; i < arr_cc.ndim(); ++i) { + shape.push_back(arr_cc.shape(i)); + } + return tv::TensorView(arr_cc.mutable_data(), shape); +} +template +tv::TensorView arrayT2TensorView(py::array_t arr) { + tv::Shape shape; + for (int i = 0; i < arr.ndim(); ++i) { + shape.push_back(arr.shape(i)); + } + return tv::TensorView(arr.mutable_data(), shape); +} diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h b/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h new file mode 100644 index 0000000000000000000000000000000000000000..e5e093fbbed4f0485559d9860b291e258337443f --- /dev/null +++ b/mmcv/ops/csrc/common/utils/spconv/spconv/geometry.h @@ -0,0 +1,297 @@ +// Copyright 2019 Yan Yan +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
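// This header builds the indice "rulebook" for sparse convolution on the host:
// getValidOutPos / getValidOutPosTranspose enumerate, for a single active input
// site, every reachable output site together with its flattened kernel offset,
// and the getIndicePairs* routines accumulate those pairs into the
// [kernelVolume, 2, L] indicePairs / indiceNum tensors consumed by the sparse
// convolution functors declared in indice.h.
// Worked 1-D sketch (hypothetical numbers): kernelSize = 3, stride = 1,
// padding = 1, dilation = 1 and an input position x = 4 give
//   lowers = (4 - 2 - 1 + 1 + 1) / 1 = 3,  uppers = (4 + 1) / 1 = 5,
// so the enumeration visits output positions {5, 4, 3} paired with kernel
// offsets {0, 1, 2} (assuming all of them lie inside outSpatialShape).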
+ +#ifndef SPCONV_GEOMETRY_H_ +#define SPCONV_GEOMETRY_H_ + +#include + +#include +#include + +template +TV_HOST_DEVICE Index getValidOutPos(const Index *input_pos, + const Index *kernelSize, + const Index *stride, const Index *padding, + const Index *dilation, + const Index *outSpatialShape, Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = (input_pos[i] - (kernelSize[i] - 1) * dilation[i] - 1 + + stride[i] + padding[i]) / + stride[i]; + uppers[i] = (input_pos[i] + padding[i]) / stride[i]; + } + +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } + +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + // break; + } + offset += m * (input_pos[j] - val * stride[j] + padding[j]) / dilation[j]; + m *= kernelSize[j]; + } + + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +TV_HOST_DEVICE Index getValidOutPosTranspose( + const Index *input_pos, const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, const Index *outSpatialShape, + Index *out) { + Index lowers[NDim]; + Index uppers[NDim]; + Index counter[NDim]; + Index counterSize[NDim]; + Index pointCounter = 0; + Index val; + Index numPoints = 1; + Index m, offset; + bool valid = false; +#pragma unroll + for (int i = 0; i < NDim; ++i) { + lowers[i] = input_pos[i] * stride[i] - padding[i]; + uppers[i] = lowers[i] + (kernelSize[i] - 1) * dilation[i]; + } +#pragma unroll + for (unsigned i = 0; i < NDim; ++i) { + counterSize[i] = ((uppers[i] - lowers[i]) / dilation[i] + 1); + numPoints *= counterSize[i]; + } +#pragma unroll + for (int i = 0; i < NDim; ++i) { + counter[i] = 0; + } + for (int i = 0; i < numPoints; ++i) { + valid = true; + m = 1; + offset = 0; +#pragma unroll + for (int j = NDim - 1; j >= 0; --j) { + val = uppers[j] - counter[j] * dilation[j]; + out[pointCounter * (NDim + 1) + j] = val; + if (val < 0 || (val > outSpatialShape[j] - 1)) { + valid = false; + } + offset += m * (val - lowers[j]) / dilation[j]; + m *= kernelSize[j]; + } + out[pointCounter * (NDim + 1) + NDim] = offset; + if (valid) ++pointCounter; + counter[NDim - 1] += 1; +#pragma unroll + for (int c = NDim - 1; c >= 0; --c) { + if (counter[c] == counterSize[c] && c > 0) { + counter[c - 1] += 1; + counter[c] = 0; + } + } + } + return pointCounter; +} + +template +Index getIndicePairsConv(tv::TensorView indicesIn, + tv::TensorView indicesOut, + tv::TensorView gridsOut, + tv::TensorView indicePairs, + tv::TensorView indiceNum, + const Index *kernelSize, const Index *stride, + const Index *padding, const Index *dilation, + const Index *outSpatialShape) { + // indicesOut: num_active * kernelVolume * (NDim + 1) + Index numAct = 0; + auto numActIn = indicesIn.dim(0); 
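  // Sketch of the loop below: for every active input j, enumerate its valid
  // output positions; an output seen for the first time gets the next row of
  // indicesOut and its running index is memoized in gridsOut (a dense volume
  // assumed to be pre-filled with -1 by the caller). Each (input, output) pair
  // is then appended to indicePairs[offset] at slot indiceNum[offset]++, where
  // offset is the flattened kernel-element index.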
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
+  Index *pointPtr = nullptr;
+  for (int j = 0; j < numActIn; ++j) {
+    batchIdx = indicesIn(j, 0);
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+                   spatialVolume * batchIdx;
+      if (gridsOut[index] == -1) {
+        for (unsigned k = 1; k < NDim + 1; ++k) {
+          indicesOut(numAct, k) = pointPtr[k - 1];
+        }
+        indicesOut(numAct, 0) = batchIdx;
+        gridsOut[index] = numAct++;
+      }
+      // indicePairs: [K, 2, L]
+      indicePairs(offset, 0, indiceNum[offset]) = j;
+      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+    }
+  }
+  return numAct;
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+Index getIndicePairsDeConv(tv::TensorView<const Index> indicesIn,
+                           tv::TensorView<Index> indicesOut,
+                           tv::TensorView<IndexGrid> gridsOut,
+                           tv::TensorView<Index> indicePairs,
+                           tv::TensorView<Index> indiceNum,
+                           const Index *kernelSize, const Index *stride,
+                           const Index *padding, const Index *dilation,
+                           const Index *outSpatialShape) {
+  Index numAct = 0;
+  auto numActIn = indicesIn.dim(0);
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
+  Index *pointPtr = nullptr;
+  for (int j = 0; j < numActIn; ++j) {
+    batchIdx = indicesIn(j, 0);
+    numValidPoints = getValidOutPosTranspose<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      auto index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+                   spatialVolume * batchIdx;
+      if (gridsOut[index] == -1) {
+        for (unsigned k = 1; k < NDim + 1; ++k) {
+          indicesOut(numAct, k) = pointPtr[k - 1];
+        }
+        indicesOut(numAct, 0) = batchIdx;
+        gridsOut[index] = numAct++;
+      }
+      // indicePairs: [K, 2, L]
+      indicePairs(offset, 0, indiceNum[offset]) = j;
+      indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+    }
+  }
+  return numAct;
+}
+
+template <typename Index, typename IndexGrid, unsigned NDim>
+Index getIndicePairsSubM(tv::TensorView<const Index> indicesIn,
+                         tv::TensorView<IndexGrid> gridsOut,
+                         tv::TensorView<Index> indicePairs,
+                         tv::TensorView<Index> indiceNum,
+                         const Index *const kernelSize,
+                         const Index *const stride, const Index *const padding,
+                         const Index *dilation,
+                         const Index *const outSpatialShape) {
+  Index numAct = 0;
+  auto numActIn = indicesIn.dim(0);
+  Index batchIdx = 0;
+  Index spatialVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    spatialVolume *= outSpatialShape[i];
+  }
+  Index kernelVolume = 1;
+#pragma unroll
+  for (int i = 0; i < NDim; ++i) {
+    kernelVolume *= kernelSize[i];
+  }
+  Index numValidPoints = 0;
+  // Index validPoints[kernelVolume * (NDim + 1)];
+  std::vector<Index> validPoints_(kernelVolume * (NDim + 1));
+  Index *validPoints = validPoints_.data();
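+  // Submanifold convolution keeps the output sparsity pattern identical to
+  // the input: gridsOut is first filled with the row index of every active
+  // input site, and a pair is emitted only when a kernel offset lands on one
+  // of those pre-existing sites.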
+  Index *pointPtr = nullptr;
+  Index index = 0;
+  for (int j = 0; j < numActIn; ++j) {
+    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + j * (NDim + 1) + 1,
+                                         outSpatialShape) +
+            spatialVolume * indicesIn(j, 0);
+    gridsOut[index] = j;
+  }
+  for (int j = 0; j < numActIn; ++j) {
+    numValidPoints = getValidOutPos<Index, NDim>(
+        indicesIn.data() + j * (NDim + 1) + 1, kernelSize, stride, padding,
+        dilation, outSpatialShape, validPoints);
+    for (Index i = 0; i < numValidPoints; ++i) {
+      pointPtr = validPoints + i * (NDim + 1);
+      auto offset = pointPtr[NDim];
+      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape) +
+              spatialVolume * indicesIn(j, 0);
+      if (gridsOut[index] > -1) {
+        indicePairs(offset, 0, indiceNum[offset]) = j;
+        indicePairs(offset, 1, indiceNum[offset]++) = gridsOut[index];
+      }
+    }
+  }
+  return numActIn;
+}
+
+#endif
diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h b/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h
new file mode 100644
index 0000000000000000000000000000000000000000..96ce34e3b456f0c999002bd53b8b1a6ab082edae
--- /dev/null
+++ b/mmcv/ops/csrc/common/utils/spconv/spconv/indice.h
@@ -0,0 +1,78 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPARSE_CONV_INDICE_FUNCTOR_H_
+#define SPARSE_CONV_INDICE_FUNCTOR_H_
+#include <tensorview/tensorview.h>
+
+namespace functor {
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP1 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctorP2 {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   tv::TensorView<Index> indicePairUnique,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateConvIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<Index> indicesOut,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+
+template <typename Device, typename Index, typename IndexGrid, unsigned NDim>
+struct CreateSubMIndicePairFunctor {
+  Index operator()(const Device& d, tv::TensorView<const Index> indicesIn,
+                   tv::TensorView<IndexGrid> gridsOut,
+                   tv::TensorView<Index> indicePairs,
+                   tv::TensorView<Index> indiceNum,
+                   const tv::SimpleVector<Index, NDim> kernelSize,
+                   const tv::SimpleVector<Index, NDim> stride,
+                   const tv::SimpleVector<Index, NDim> padding,
+                   const tv::SimpleVector<Index, NDim> dilation,
+                   const tv::SimpleVector<Index, NDim> outSpatialShape,
+                   bool transpose, bool resetGrid = false);
+};
+}  // namespace functor
+
+#endif
diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h b/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h
new file mode 100644
index 0000000000000000000000000000000000000000..78f32edd4db70724d38826809672aa461a6d065e
--- /dev/null
+++ b/mmcv/ops/csrc/common/utils/spconv/spconv/maxpool.h
@@ -0,0 +1,37 @@
+// Copyright 2019 Yan Yan
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef SPARSE_MAXPOOL_FUNCTOR_H_
+#define SPARSE_MAXPOOL_FUNCTOR_H_
+#include <tensorview/tensorview.h>
+
+namespace functor {
+template <typename Device, typename T, typename Index>
+struct SparseMaxPoolForwardFunctor {
+  void operator()(const Device& d, tv::TensorView<T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const Index> indices, int size);
+};
+
+template <typename Device, typename T, typename Index>
+struct SparseMaxPoolBackwardFunctor {
+  void operator()(const Device& d, tv::TensorView<const T> outFeatures,
+                  tv::TensorView<const T> inFeatures,
+                  tv::TensorView<const T> fout,
+                  tv::TensorView<T> fin,
+                  tv::TensorView<const Index> indices, int size);
+};
+}  // namespace functor
+
+#endif
diff --git a/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h b/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..8262b30efb5e127d7e079ebdde0693c671fb96d6
--- /dev/null
+++ b/mmcv/ops/csrc/common/utils/spconv/spconv/mp_helper.h
@@ -0,0 +1,50 @@
+#ifndef MP_HELPER_H_
+#define MP_HELPER_H_
+#include <functional>
+#include <type_traits>
+
+template <class... T>
+struct mp_list {};
+
+template <class T, T... I>
+using mp_list_c = mp_list<std::integral_constant<T, I>...>;
+
+namespace detail {
+
+template <class... T, class F>
+constexpr F mp_for_each_impl(mp_list<T...>, F &&f) {
+  return std::initializer_list<int>{(f(T()), 0)...}, std::forward<F>(f);
+}
+
+template <class F>
+constexpr F mp_for_each_impl(mp_list<>, F &&f) {
+  return std::forward<F>(f);
+}
+
+}  // namespace detail
+
+namespace detail {
+
+template <class A, template <class...> class B>
+struct mp_rename_impl {
+  // An error "no type named 'type'" here means that the first argument to
+  // mp_rename is not a list
+};
+
+template