Unverified commit 0b4285d9 authored by Zaida Zhou, committed by GitHub

Pick commits from master (#2164)



* [Docs] Add switch_language.md in docs (#2160)

* [Fix] Fix onnx unit tests (#2155)

* [Docs] Limit extension versions (#2144)

* Support PrRoIPool operation

* Add MPS bbox overlap

* Add .pre-commit-config-zh-cn.yaml (#2135)
Co-authored-by: xcnick <xcnick0412@gmail.com>
Co-authored-by: Jingwei Zhang <zjw18@mails.tsinghua.edu.cn>
Co-authored-by: q.yao <yaoqian@sensetime.com>
parent 47a61c3b
exclude: ^tests/data/
repos:
- repo: https://gitee.com/openmmlab/mirrors-flake8
rev: 3.8.3
hooks:
- id: flake8
- repo: https://gitee.com/openmmlab/mirrors-isort
rev: 5.10.1
hooks:
- id: isort
- repo: https://gitee.com/openmmlab/mirrors-yapf
rev: v0.30.0
hooks:
- id: yapf
- repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
rev: v3.1.0
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
- id: mixed-line-ending
args: ["--fix=lf"]
- repo: https://gitee.com/openmmlab/mirrors-codespell
rev: v2.1.0
hooks:
- id: codespell
- repo: https://gitee.com/openmmlab/mirrors-mdformat
rev: 0.7.9
hooks:
- id: mdformat
args: ["--number"]
additional_dependencies:
- mdformat-openmmlab
- mdformat_frontmatter
- linkify-it-py
- repo: https://gitee.com/openmmlab/mirrors-docformatter
rev: v1.3.1
hooks:
- id: docformatter
args: ["--in-place", "--wrap-descriptions", "79"]
- repo: https://github.com/asottile/pyupgrade
rev: v2.32.1
hooks:
- id: pyupgrade
args: ["--py36-plus"]
- repo: https://gitee.com/openmmlab/pre-commit-hooks
rev: v0.2.0 # Use the ref you want to point at
hooks:
- id: check-copyright
args: ["mmcv", "tests", "--excludes", "mmcv/ops"]
- repo: https://gitee.com/openmmlab/mirrors-mypy
rev: v0.812
hooks:
- id: mypy
exclude: |-
(?x)(
^test
| ^docs
)
# - repo: local
# hooks:
# - id: clang-format
# name: clang-format
# description: Format files with ClangFormat
# entry: clang-format -style=google -i
# language: system
# files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
@@ -3,4 +3,5 @@ include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model
include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp
include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp
include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp
include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm
recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm
@@ -36,6 +36,11 @@ You can switch between Chinese and English documents in the lower-left corner of
   deployment/tensorrt_custom_ops.md
   deployment/tensorrt_plugin.md

.. toctree::
   :caption: Switch Language

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: Compatibility
......
## <a href='https://mmcv.readthedocs.io/en/latest/'>English</a>
## <a href='https://mmcv.readthedocs.io/zh_CN/latest/'>简体中文</a>
@@ -2,58 +2,59 @@
We implement common ops used in detection, segmentation, etc.

| Device                       | CPU | CUDA | MLU | MPS |
| ---------------------------- | --- | ---- | --- | --- |
| ActiveRotatedFilter          | √   | √    |     |     |
| AssignScoreWithK             |     | √    |     |     |
| BallQuery                    |     | √    |     |     |
| BBoxOverlaps                 |     | √    | √   | √   |
| BorderAlign                  |     | √    |     |     |
| BoxIouRotated                | √   | √    |     |     |
| CARAFE                       |     | √    |     |     |
| ChamferDistance              |     | √    |     |     |
| CrissCrossAttention          |     | √    |     |     |
| ContourExpand                | √   |      |     |     |
| ConvexIoU                    |     | √    |     |     |
| CornerPool                   |     | √    |     |     |
| Correlation                  |     | √    |     |     |
| Deformable Convolution v1/v2 | √   | √    |     |     |
| Deformable RoIPool           |     | √    |     |     |
| DiffIoURotated               |     | √    |     |     |
| DynamicScatter               |     | √    |     |     |
| FurthestPointSample          |     | √    |     |     |
| FurthestPointSampleWithDist  |     | √    |     |     |
| FusedBiasLeakyrelu           |     | √    |     |     |
| GatherPoints                 |     | √    |     |     |
| GroupPoints                  |     | √    |     |     |
| Iou3d                        |     | √    |     |     |
| KNN                          |     | √    |     |     |
| MaskedConv                   |     | √    |     |     |
| MergeCells                   |     | √    |     |     |
| MinAreaPolygon               |     | √    |     |     |
| ModulatedDeformConv2d        | √   | √    |     |     |
| MultiScaleDeformableAttn     |     | √    |     |     |
| NMS                          | √   | √    | √   |     |
| NMSRotated                   | √   | √    |     |     |
| PixelGroup                   | √   |      |     |     |
| PointsInBoxes                | √   | √    |     |     |
| PointsInPolygons             |     | √    |     |     |
| PSAMask                      | √   | √    | √   |     |
| RotatedFeatureAlign          | √   | √    |     |     |
| RoIPointPool3d               |     | √    |     |     |
| RoIPool                      |     | √    | √   |     |
| RoIAlignRotated              | √   | √    | √   |     |
| RiRoIAlignRotated            |     | √    |     |     |
| RoIAlign                     | √   | √    | √   |     |
| RoIAwarePool3d               |     | √    |     |     |
| SAConv2d                     |     | √    |     |     |
| SigmoidFocalLoss             |     | √    | √   |     |
| SoftmaxFocalLoss             |     | √    |     |     |
| SoftNMS                      |     | √    |     |     |
| Sparse Convolution           |     | √    |     |     |
| Synchronized BatchNorm       |     | √    |     |     |
| ThreeInterpolate             |     | √    |     |     |
| ThreeNN                      |     | √    |     |     |
| TINShift                     |     | √    | √   |     |
| UpFirDn2d                    |     | √    |     |     |
| Voxelization                 | √   | √    |     |     |
| PrRoIPool                    |     | √    |     |     |
@@ -36,6 +36,11 @@
   deployment/tensorrt_plugin.md
   deployment/tensorrt_custom_ops.md

.. toctree::
   :caption: Switch Language

   switch_language.md

.. toctree::
   :maxdepth: 2
   :caption: Compatibility
......
## <a href='https://mmcv.readthedocs.io/en/latest/'>English</a>
## <a href='https://mmcv.readthedocs.io/zh_CN/latest/'>简体中文</a>
@@ -2,58 +2,59 @@
MMCV provides common ops used in detection, segmentation, and other tasks.

| Device                       | CPU | CUDA | MLU | MPS |
| ---------------------------- | --- | ---- | --- | --- |
| ActiveRotatedFilter          | √   | √    |     |     |
| AssignScoreWithK             |     | √    |     |     |
| BallQuery                    |     | √    |     |     |
| BBoxOverlaps                 |     | √    | √   | √   |
| BorderAlign                  |     | √    |     |     |
| BoxIouRotated                | √   | √    |     |     |
| CARAFE                       |     | √    |     |     |
| ChamferDistance              |     | √    |     |     |
| CrissCrossAttention          |     | √    |     |     |
| ContourExpand                | √   |      |     |     |
| ConvexIoU                    |     | √    |     |     |
| CornerPool                   |     | √    |     |     |
| Correlation                  |     | √    |     |     |
| Deformable Convolution v1/v2 | √   | √    |     |     |
| Deformable RoIPool           |     | √    |     |     |
| DiffIoURotated               |     | √    |     |     |
| DynamicScatter               |     | √    |     |     |
| FurthestPointSample          |     | √    |     |     |
| FurthestPointSampleWithDist  |     | √    |     |     |
| FusedBiasLeakyrelu           |     | √    |     |     |
| GatherPoints                 |     | √    |     |     |
| GroupPoints                  |     | √    |     |     |
| Iou3d                        |     | √    |     |     |
| KNN                          |     | √    |     |     |
| MaskedConv                   |     | √    |     |     |
| MergeCells                   |     | √    |     |     |
| MinAreaPolygon               |     | √    |     |     |
| ModulatedDeformConv2d        | √   | √    |     |     |
| MultiScaleDeformableAttn     |     | √    |     |     |
| NMS                          | √   | √    | √   |     |
| NMSRotated                   | √   | √    |     |     |
| PixelGroup                   | √   |      |     |     |
| PointsInBoxes                | √   | √    |     |     |
| PointsInPolygons             |     | √    |     |     |
| PSAMask                      | √   | √    | √   |     |
| RotatedFeatureAlign          | √   | √    |     |     |
| RoIPointPool3d               |     | √    |     |     |
| RoIPool                      |     | √    | √   |     |
| RoIAlignRotated              | √   | √    | √   |     |
| RiRoIAlignRotated            |     | √    |     |     |
| RoIAlign                     | √   | √    | √   |     |
| RoIAwarePool3d               |     | √    |     |     |
| SAConv2d                     |     | √    |     |     |
| SigmoidFocalLoss             |     | √    | √   |     |
| SoftmaxFocalLoss             |     | √    |     |     |
| SoftNMS                      |     | √    |     |     |
| Sparse Convolution           |     | √    |     |     |
| Synchronized BatchNorm       |     | √    |     |     |
| ThreeInterpolate             |     | √    |     |     |
| ThreeNN                      |     | √    |     |     |
| TINShift                     |     | √    | √   |     |
| UpFirDn2d                    |     | √    |     |     |
| Voxelization                 | √   | √    |     |     |
| PrRoIPool                    |     | √    |     |     |
@@ -46,6 +46,7 @@ from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
                              points_in_boxes_part)
from .points_in_polygons import points_in_polygons
from .points_sampler import PointsSampler
from .prroi_pool import PrRoIPool, prroi_pool
from .psa_mask import PSAMask
from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
from .roi_align import RoIAlign, roi_align
@@ -100,5 +101,6 @@ __all__ = [
    'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',
    'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
    'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
    'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
    'PrRoIPool', 'prroi_pool'
]
@@ -13,11 +13,19 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── pytorch_cpp_helper.hpp
│   ├── pytorch_cuda_helper.hpp
│   ├── pytorch_device_registry.hpp
│   ├── cuda
│   │   ├── common_cuda_helper.hpp
│   │   ├── parrots_cudawarpfunction.cuh
│   │   ├── ...
│   │   └── ops_cuda_kernel.cuh
│   ├── mps
│   │   ├── MPSLibrary.h
│   │   ├── ...
│   │   └── MPSUtils.h
│   ├── mlu
│   │   └── ...
│   └── utils
│       └── ...
├── onnxruntime
│   ├── onnxruntime_register.h
│   ├── onnxruntime_session_options_config_keys.h
@@ -41,9 +49,15 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
│   ├── cuda
│   │   ├── ...
│   │   └── ops_cuda.cu
│   ├── cpu
│   │   ├── ...
│   │   └── ops.cpp
│   ├── mps
│   │   ├── ...
│   │   └── op_mps.mm
│   └── mlu
│       ├── ...
│       └── op_mlu.cpp
└── tensorrt
    ├── trt_cuda_helper.cuh
    ├── trt_plugin_helper.hpp
@@ -63,13 +77,18 @@ This folder contains all non-python code for MMCV custom ops. Please follow the
- `common`: This directory contains all tools and shared code.
  - `cuda`: The CUDA kernels that can be shared by all backends. **HIP** kernels are also placed here since they have similar syntax.
  - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
  - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) devices.
  - `utils`: The kernels and utils of spconv.
- `onnxruntime`: **ONNX Runtime** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `cpu`: CPU implementation of supported ops.
- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The op implementations and binding code are placed in this directory.
  - `cuda`: This directory contains CUDA kernel launchers, which feed the memory pointers of tensors to the CUDA kernels in `common/cuda`. The launchers provide a C++ interface to the CUDA implementation of the corresponding custom ops.
  - `cpu`: This directory contains CPU implementations of the corresponding custom ops.
  - `mlu`: This directory contains the launchers of the MLU kernels.
  - `mps`: MPS op implementations and launchers.
- `tensorrt`: **TensorRT** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
  - `plugins`: This directory contains the implementations of the supported custom ops. Some ops might also use the shared CUDA kernels in `common/cuda`.
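The dispatch between these backends goes through `pytorch_device_registry.hpp`: each op defines a device-agnostic `*_impl` function that forwards at runtime to whichever kernel launcher was registered for the tensors' device. A minimal sketch of the pattern, abridged from the `prroi_pool` files added in this commit:

```c++
// Device-agnostic entry point (mmcv/ops/csrc/pytorch/prroi_pool.cpp):
// DISPATCH_DEVICE_IMPL looks up the implementation registered for the
// device of the input tensors at runtime.
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
                             int pooled_height, int pooled_width,
                             float spatial_scale) {
  DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
                       pooled_height, pooled_width, spatial_scale);
}

// CUDA backend registration (mmcv/ops/csrc/pytorch/cuda/cudabind.cpp):
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
```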
## How to add new PyTorch ops?
......
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/vacancy/PreciseRoIPooling/blob/master/src/prroi_pooling_gpu_impl.cu
// Distributed under terms of the MIT license.
#ifndef PRROI_POOL_CUDA_KERNEL_CUH
#define PRROI_POOL_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
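// PrRoI (Precise RoI) Pooling computes each output bin as the exact integral
// of the bilinearly interpolated feature map over the bin, divided by the bin
// area, which makes the result continuously differentiable w.r.t. the RoI
// coordinates (see https://github.com/vacancy/PreciseRoIPooling).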
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetData(const T *data,
const int h,
const int w,
const int height,
const int width) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
T retVal = overflow ? 0.0f : data[h * width + w];
return retVal;
}
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingGetCoeff(T dh, T dw) {
return (1.0f - abs(dh)) * (1.0f - abs(dw));
}
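// Closed-form integral of the linear function f(x) = c1 + (c2 - c1) * x over
// [s, t]: 0.5 * (t^2 - s^2) * (c2 - c1) + (t - s) * c1.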
template <typename T>
__device__ static __forceinline__ T PrRoIPoolingSingleCoorIntegral(T s, T t,
T c1, T c2) {
return 0.5 * (t * t - s * s) * (c2 - c1) + (t - s) * c1;
}
template <typename T>
__device__ static T PrRoIPoolingInterpolation(const T *data, const T h,
const T w, const int height,
const int width) {
T retVal = 0.0f;
int h1 = floorf(h);
int w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w);
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h);
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
h1 = floorf(h) + 1;
w1 = floorf(w) + 1;
retVal += PrRoIPoolingGetData(data, h1, w1, height, width) *
PrRoIPoolingGetCoeff(h - T(h1), w - T(w1));
return retVal;
}
template <typename T>
__device__ static T PrRoIPoolingMatCalculation(const T *this_data,
const int s_h, const int s_w,
const int e_h, const int e_w,
const T y0, const T x0,
const T y1, const T x1,
const int h0, const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
T sum_out = 0;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp;
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp;
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp;
return sum_out;
}
template <typename T>
__device__ static void PrRoIPoolingDistributeDiff(T *diff, const T top_diff,
const int h, const int w,
const int height,
const int width,
const T coeff) {
bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width);
if (!overflow) atomicAdd(diff + h * width + w, top_diff * coeff);
}
template <typename T>
__device__ static void PrRoIPoolingMatDistributeDiff(
T *diff, const T top_diff, const int s_h, const int s_w, const int e_h,
const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0,
const int w0) {
T alpha, beta, lim_alpha, lim_beta, tmp;
alpha = x0 - T(s_w);
beta = y0 - T(s_h);
lim_alpha = x1 - T(s_w);
lim_beta = y1 - T(s_h);
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp);
alpha = x0 - T(s_w);
beta = T(e_h) - y1;
lim_alpha = x1 - T(s_w);
lim_beta = T(e_h) - y0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp);
alpha = T(e_w) - x1;
lim_alpha = T(e_w) - x0;
tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha +
0.5f * alpha * alpha) *
(lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta);
PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp);
}
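// Forward: each thread computes one output element (n, c, ph, pw) by summing
// the closed-form integral of the bilinear interpolation over every unit cell
// covered by the bin, then dividing by the bin area.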
template <typename T>
__global__ void prroi_pool_forward_cuda_kernel(
const int nthreads, const T *input, const T *rois, T *output,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T *offset_rois = rois + n * 5;
int roi_batch_ind = offset_rois[0];
T roi_x1 = offset_rois[1] * spatial_scale;
T roi_y1 = offset_rois[2] * spatial_scale;
T roi_x2 = offset_rois[3] * spatial_scale;
T roi_y2 = offset_rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, ((T)0.0));
T roi_height = max(roi_y2 - roi_y1, ((T)0.0));
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_data =
input + (roi_batch_ind * channels + c) * height * width;
T *this_out = output + index;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
if (bin_size == 0) {
*this_out = 0;
continue;
}
T sum_out = 0;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
sum_out += PrRoIPoolingMatCalculation(
this_data, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
*this_out = sum_out / bin_size;
}
}
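// Backward w.r.t. the input feature map: the output gradient, normalized by
// the bin area, is spread back over the same integration region with matching
// coefficients via atomicAdd.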
template <typename T>
__global__ void prroi_pool_backward_cuda_kernel(
const int nthreads, const T *grad_output, const T *rois, T *grad_input,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int channels, const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T *this_out_grad = grad_output + index;
T *this_data_grad =
grad_input + (roi_batch_ind * channels + c) * height * width;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : *this_out_grad / bin_size;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
for (int bin_x = start_x; bin_x < end_x; ++bin_x)
for (int bin_y = start_y; bin_y < end_y; ++bin_y)
PrRoIPoolingMatDistributeDiff(
this_data_grad, sum_out, bin_y, bin_x, bin_y + 1, bin_x + 1,
max(bin_y1, T(bin_y)), max(bin_x1, T(bin_x)),
min(bin_y2, T(bin_y) + 1.0f), min(bin_x2, T(bin_x + 1.0f)), height,
width);
}
}
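// Backward w.r.t. the RoI coordinates: the bin integral is differentiated
// with respect to its bounds (Leibniz rule), using line integrals of the
// interpolated feature along the four bin edges.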
template <typename T>
__global__ void prroi_pool_coor_backward_cuda_kernel(
const int nthreads, const T *output, const T *grad_output, const T *input,
const T *rois, T *grad_rois, const int pooled_height,
const int pooled_width, const T spatial_scale, const int channels,
const int height, const int width) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
rois += n * 5;
int roi_batch_ind = rois[0];
T roi_x1 = rois[1] * spatial_scale;
T roi_y1 = rois[2] * spatial_scale;
T roi_x2 = rois[3] * spatial_scale;
T roi_y2 = rois[4] * spatial_scale;
T roi_width = max(roi_x2 - roi_x1, (T)0);
T roi_height = max(roi_y2 - roi_y1, (T)0);
T bin_size_h = roi_height / static_cast<T>(pooled_height);
T bin_size_w = roi_width / static_cast<T>(pooled_width);
const T output_grad_val = grad_output[index];
const T *this_input_data =
input + (roi_batch_ind * channels + c) * height * width;
const T output_val = output[index];
T *this_rois_grad = grad_rois + n * 5;
T bin_x1 = roi_x1 + bin_size_w * pw;
T bin_y1 = roi_y1 + bin_size_h * ph;
T bin_x2 = bin_x1 + bin_size_w;
T bin_y2 = bin_y1 + bin_size_h;
T bin_size = max(T(0.0), bin_size_w * bin_size_h);
T sum_out = bin_size == T(0) ? T(0) : output_grad_val / bin_size;
// WARNING: to be discussed
if (sum_out == 0) return;
int start_x, start_y, end_x, end_y;
start_x = floorf(bin_x1);
end_x = ceilf(bin_x2);
start_y = floorf(bin_y1);
end_y = ceilf(bin_y2);
T grad_x1_y = 0, grad_x2_y = 0, grad_x_y1 = 0, grad_x_y2 = 0;
for (int bin_y = start_y; bin_y < end_y; ++bin_y) {
grad_x1_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x1,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x1,
height, width));
grad_x2_y += PrRoIPoolingSingleCoorIntegral(
max(bin_y1, T(bin_y)) - bin_y, min(bin_y2, T(bin_y + 1)) - bin_y,
PrRoIPoolingInterpolation(this_input_data, float(bin_y), bin_x2,
height, width),
PrRoIPoolingInterpolation(this_input_data, float(bin_y + 1), bin_x2,
height, width));
}
for (int bin_x = start_x; bin_x < end_x; ++bin_x) {
grad_x_y1 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y1, float(bin_x + 1),
height, width));
grad_x_y2 += PrRoIPoolingSingleCoorIntegral(
max(bin_x1, T(bin_x)) - bin_x, min(bin_x2, T(bin_x + 1)) - bin_x,
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x),
height, width),
PrRoIPoolingInterpolation(this_input_data, bin_y2, float(bin_x + 1),
height, width));
}
T partial_x1 = -grad_x1_y + (bin_y2 - bin_y1) * output_val;
T partial_y1 = -grad_x_y1 + (bin_x2 - bin_x1) * output_val;
T partial_x2 = grad_x2_y - (bin_y2 - bin_y1) * output_val;
T partial_y2 = grad_x_y2 - (bin_x2 - bin_x1) * output_val;
partial_x1 = partial_x1 / bin_size * spatial_scale;
partial_x2 = partial_x2 / bin_size * spatial_scale;
partial_y1 = partial_y1 / bin_size * spatial_scale;
partial_y2 = partial_y2 / bin_size * spatial_scale;
// (index, x1, y1, x2, y2)
this_rois_grad[0] = 0;
atomicAdd(this_rois_grad + 1,
(partial_x1 * (1.0f - T(pw) / pooled_width) +
partial_x2 * (1.0f - T(pw + 1) / pooled_width)) *
output_grad_val);
atomicAdd(this_rois_grad + 2,
(partial_y1 * (1.0f - T(ph) / pooled_height) +
partial_y2 * (1.0f - T(ph + 1) / pooled_height)) *
output_grad_val);
atomicAdd(this_rois_grad + 3, (partial_x2 * T(pw + 1) / pooled_width +
partial_x1 * T(pw) / pooled_width) *
output_grad_val);
atomicAdd(this_rois_grad + 4, (partial_y2 * T(ph + 1) / pooled_height +
partial_y1 * T(ph) / pooled_height) *
output_grad_val);
}
}
#endif  // PRROI_POOL_CUDA_KERNEL_CUH
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h
#pragma once
#include <ATen/ATen.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLDevice;
typedef void* MTLDevice_t;
#endif
using namespace std;
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSDevice
//
// MPSDevice is a singleton class that returns the default device
//-----------------------------------------------------------------
class TORCH_API MPSDevice {
public:
/**
* MPSDevice should not be cloneable.
*/
MPSDevice(MPSDevice& other) = delete;
/**
* MPSDevice should not be assignable.
*/
void operator=(const MPSDevice&) = delete;
/**
* Gets single instance of the Device.
*/
static MPSDevice* getInstance();
/**
* Returns the single device.
*/
MTLDevice_t device() { return _mtl_device; }
~MPSDevice();
private:
static MPSDevice* _device;
MTLDevice_t _mtl_device;
MPSDevice();
};
TORCH_API bool is_available();
TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
} // namespace mps
} // namespace at
#ifndef _MPS_LIBRARY_H_
#define _MPS_LIBRARY_H_
#include <string>
#include <unordered_map>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
typedef id<MTLLibrary> MTLLibrary_t;
#else
typedef void* MTLComputePipelineState;
typedef void* MTLComputePipelineState_t;
typedef void* MTLLibrary;
typedef void* MTLLibrary_t;
#endif
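// MPSLibrary wraps a compiled MTLLibrary and caches one compute pipeline
// state object per kernel function name in _pso_map.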
class MPSLibrary {
public:
// disable constructor for singleton
static MPSLibrary* createFromUrl(const std::string& library_url);
static MPSLibrary* createFromSource(const std::string& source);
~MPSLibrary();
MTLLibrary_t library() { return _library; }
MTLComputePipelineState_t getComputePipelineState(
const std::string& function_name);
private:
MTLLibrary_t _library;
std::unordered_map<std::string, MTLComputePipelineState_t> _pso_map;
};
class MPSLibraryManager {
public:
// disable constructor for singleton
MPSLibraryManager(const MPSLibraryManager&) = delete;
MPSLibraryManager& operator=(const MPSLibraryManager&) = delete;
MPSLibraryManager(MPSLibraryManager&&) = delete;
MPSLibraryManager& operator=(MPSLibraryManager&&) = delete;
static MPSLibraryManager* getInstance();
bool hasLibrary(const std::string& name);
MPSLibrary* getLibrary(const std::string& library_url);
MPSLibrary* createLibraryFromSouce(const std::string& name,
const std::string& sources);
~MPSLibraryManager();
private:
MPSLibraryManager();
std::unordered_map<std::string, std::unique_ptr<MPSLibrary>> _library_map;
};
#endif
#include "MPSLibrary.h"
#include <c10/util/CallOnce.h>
#include "MPSDevice.h"
static std::unique_ptr<MPSLibraryManager> mps_library_manager;
static c10::once_flag mpsdev_init;
MPSLibraryManager* MPSLibraryManager::getInstance() {
c10::call_once(mpsdev_init, [] {
mps_library_manager = std::unique_ptr<MPSLibraryManager>(new MPSLibraryManager());
});
return mps_library_manager.get();
}
MPSLibraryManager::~MPSLibraryManager() {}
MPSLibraryManager::MPSLibraryManager() {}
bool MPSLibraryManager::hasLibrary(const std::string& name) {
return _library_map.find(name) != _library_map.end();
}
MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {
if (_library_map.find(library_url) != _library_map.end()) {
return _library_map[library_url].get();
}
_library_map.emplace(std::make_pair(
library_url, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromUrl(library_url))));
return _library_map[library_url].get();
}
MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name,
const std::string& source) {
NSString* ns_name = [NSString stringWithCString:name.c_str()];
if (_library_map.find(name) != _library_map.end()) {
NSLog(@"Library %@ already exist.", ns_name);
return nullptr;
}
_library_map.emplace(
std::make_pair(name, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source))));
return _library_map[name].get();
}
MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
NSString* utl_str = [NSString stringWithCString:library_url.c_str()];
NSURL* metal_url = [NSURL fileURLWithPath:utl_str];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {
MPSLibrary* library = new MPSLibrary();
@autoreleasepool {
NSError* error = nil;
// load library and func
NSString* code_str = [NSString stringWithCString:sources.c_str()];
library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str
options:nil
error:&error];
if (library->_library == nil) {
NSLog(@"Failed to find library, error %@.", error);
exit(1);
}
}
return library;
}
MPSLibrary::~MPSLibrary() {
[_library release];
_library = nil;
}
MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) {
if (_pso_map.find(function_name) != _pso_map.end()) {
return _pso_map[function_name];
}
MTLComputePipelineState_t pso;
@autoreleasepool {
NSError* error = nil;
// create function
NSString* function_name_str = [NSString stringWithCString:function_name.c_str()];
id<MTLFunction> func = [_library newFunctionWithName:function_name_str];
if (func == nil) {
NSLog(@"Failed to created pipeline state object, error %@.", error);
exit(1);
}
// create pipeline
pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func
error:&error];
_pso_map.emplace(std::make_pair(function_name, pso));
}
return _pso_map[function_name];
}
// Copyright © 2022 Apple Inc.
// This file is modified from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h
#pragma once
#include <cstdint>
#include <utility>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
#include "MPSDevice.h"
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
typedef id<MTLCommandQueue> MTLCommandQueue_t;
typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
typedef id<MTLSharedEvent> MTLSharedEvent_t;
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLCommandQueue_t;
typedef void* MTLCommandQueue;
typedef void* MTLCommandBuffer_t;
typedef void* MTLCommandBuffer;
typedef void* MTLSharedEvent_t;
typedef void* dispatch_queue_t;
typedef void* MTLDevice_t;
#define nil NULL
#endif
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSStream
//-----------------------------------------------------------------
class TORCH_API MPSStream {
public:
enum Unchecked { UNCHECKED };
/// Construct a MPSStream from a Stream. This construction is checked,
/// and will raise an error if the Stream is not, in fact, a MPS stream.
explicit MPSStream(Stream stream);
~MPSStream();
MTLCommandQueue_t commandQueue() const { return _commandQueue; };
dispatch_queue_t queue() const { return _serialQueue; }
MTLCommandBuffer_t commandBuffer();
void commit(bool flush);
void commitAndWait();
void synchronize();
void flush();
/// Get the MPS device index that this stream is associated with.
c10::DeviceIndex device_index() const { return _stream.device_index(); }
MTLCommandQueue_t stream() const { return _commandQueue; };
MTLDevice_t device() const { return [_commandQueue device]; }
/// Explicit conversion to Stream.
Stream unwrap() const { return _stream; }
private:
Stream _stream;
MTLCommandQueue_t _commandQueue = nil;
MTLCommandBuffer_t _commandBuffer = nil;
void _flush(bool commitAndWait) const;
dispatch_queue_t _serialQueue = nullptr;
};
/**
* Get the current MPS stream
*/
TORCH_API MPSStream* getCurrentMPSStream();
/**
* Get the default MPS stream
*/
TORCH_API MPSStream* getDefaultMPSStream();
//-----------------------------------------------------------------
// MPSStreamImpl
//-----------------------------------------------------------------
class TORCH_API MPSStreamImpl {
public:
/**
* Gets single instance of the MPSStream.
*/
static MPSStream* getInstance();
private:
static MPSStream* _stream;
MPSStreamImpl();
};
//-----------------------------------------------------------------
// MPSEvent
//-----------------------------------------------------------------
struct TORCH_API MPSEvent {
MPSEvent();
// MPSEvent(id<MTLDevice> device);
~MPSEvent();
MTLSharedEvent_t event() const { return _event; }
void recordEvent(MPSStream* stream);
void waitForEvent(MPSStream* queue); // waits on the cpu
bool queryEvent();
uint64_t getCurrentValue() { return _currentValue; }
void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }
private:
bool _isRecorded = false;
uint64_t _currentValue = 0;
MTLSharedEvent_t _event;
};
typedef MPSEvent* mpsEvent_t;
} // namespace mps
} // namespace at
#ifndef _MPS_UTILS_H_
#define _MPS_UTILS_H_
#include <torch/extension.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLBuffer> MTLBuffer_t;
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
#else
typedef void* MTLBuffer;
typedef void* MTLBuffer_t;
typedef void* MTLComputeCommandEncoder;
typedef void* MTLComputeCommandEncoder_t;
#endif
// utils
static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {
return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data());
}
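// setMTLArg binds one kernel argument: at::Tensor arguments are bound as the
// tensor's underlying MTLBuffer via setBuffer, anything else is copied by
// value via setBytes. The two overloads are selected with enable_if on
// whether T decays to at::Tensor.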
template <typename T,
std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);
template <typename T,
std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index];
}
template <typename T, std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
[encoder setBytes:&t length:sizeof(t) atIndex:index];
}
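// Recursively bind a parameter pack of kernel arguments at consecutive
// indices; the zero-argument overload below terminates the recursion.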
inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}
template <typename T, typename... Args>
void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) {
setMTLArg(encoder, index, std::forward<T>(t));
setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);
}
template <typename... Args>
void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) {
[encoder setComputePipelineState:pso];
setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);
}
#endif
@@ -1737,3 +1737,54 @@ REGISTER_DEVICE_IMPL(chamfer_distance_forward_impl, CUDA,
                     chamfer_distance_forward_cuda);
REGISTER_DEVICE_IMPL(chamfer_distance_backward_impl, CUDA,
                     chamfer_distance_backward_cuda);
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void PrROIPoolCoorBackwardCUDAKernelLauncher(
Tensor output, Tensor grad_output, Tensor input, Tensor rois,
Tensor grad_rois, int pooled_height, int pooled_width, float spatial_scale);
void prroi_pool_forward_cuda(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolForwardCUDAKernelLauncher(input, rois, output, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_backward_cuda(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
PrROIPoolBackwardCUDAKernelLauncher(grad_output, rois, grad_input,
pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_coor_backward_cuda(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
PrROIPoolCoorBackwardCUDAKernelLauncher(output, grad_output, input, rois,
grad_rois, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale);
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale);
REGISTER_DEVICE_IMPL(prroi_pool_forward_impl, CUDA, prroi_pool_forward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_backward_impl, CUDA, prroi_pool_backward_cuda);
REGISTER_DEVICE_IMPL(prroi_pool_coor_backward_impl, CUDA,
prroi_pool_coor_backward_cuda);
// Copyright (c) OpenMMLab. All rights reserved
#include "prroi_pool_cuda_kernel.cuh"
#include "pytorch_cuda_helper.hpp"
void PrROIPoolForwardCUDAKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width, float spatial_scale) {
int output_size = output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
at::cuda::CUDAGuard device_guard(input.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
prroi_pool_forward_cuda_kernel<float>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, input.data_ptr<float>(), rois.data_ptr<float>(),
output.data_ptr<float>(), pooled_height, pooled_width,
static_cast<float>(spatial_scale), channels, height, width);
AT_CUDA_CHECK(cudaGetLastError());
}
void PrROIPoolBackwardCUDAKernelLauncher(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width,
float spatial_scale) {
int output_size = grad_output.numel();
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
prroi_pool_backward_cuda_kernel<float>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, grad_output.data_ptr<float>(), rois.data_ptr<float>(),
grad_input.data_ptr<float>(), pooled_height, pooled_width,
static_cast<float>(spatial_scale), channels, height, width);
AT_CUDA_CHECK(cudaGetLastError());
}
void PrROIPoolCoorBackwardCUDAKernelLauncher(Tensor output, Tensor grad_output,
Tensor input, Tensor rois,
Tensor grad_rois,
int pooled_height,
int pooled_width,
float spatial_scale) {
int output_size = grad_output.numel();
int channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
at::cuda::CUDAGuard device_guard(grad_output.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
prroi_pool_coor_backward_cuda_kernel<float>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
output_size, output.data_ptr<float>(), grad_output.data_ptr<float>(),
input.data_ptr<float>(), rois.data_ptr<float>(),
grad_rois.data_ptr<float>(), pooled_height, pooled_width,
static_cast<float>(spatial_scale), channels, height, width);
AT_CUDA_CHECK(cudaGetLastError());
}
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_device_registry.hpp"
#include "MPSLibrary.h"
#include "MPSStream.h"
#include "MPSUtils.h"
using at::Tensor;
const static std::string kSourceCode = R"(
#include <metal_math>
#include <metal_stdlib>
using namespace metal;
kernel void bbox_overlap_mps_kernel(constant const float4* bboxes1,
constant const float4* bboxes2,
device float* ious,
constant int& num_bbox1,
constant int& num_bbox2,
constant int& mode,
constant bool& aligned,
constant int& offset,
uint index [[thread_position_in_grid]])
{
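    // aligned == true: IoU of matched pairs (bboxes1[i] vs bboxes2[i]);
    // aligned == false: index walks the full num_bbox1 x num_bbox2 grid.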
int base1 = index;
int base2 = index;
if(!aligned){
base1 = index / num_bbox2;
base2 = index % num_bbox2;
}
const float f_offset = float(offset);
const float4 b1 = bboxes1[base1];
const float b1_area = (b1[2]-b1[0]+f_offset)*(b1[3]-b1[1]+f_offset);
const float4 b2 = bboxes2[base2];
const float b2_area = (b2[2]-b2[0]+f_offset)*(b2[3]-b2[1]+f_offset);
const float2 left_top = fmax(b1.xy, b2.xy);
const float2 right_bottom = fmin(b1.zw, b2.zw);
const float2 wh = fmax(right_bottom - left_top + f_offset, 0.0f);
const float interS = wh.x * wh.y;
const float baseS =
fmax(mode == 0 ? b1_area + b2_area - interS : b1_area, f_offset);
ious[index] = interS / baseS;
}
)";
void BBoxOverlapsMPSKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
// get stream
auto stream = at::mps::getCurrentMPSStream();
auto library_manager = MPSLibraryManager::getInstance();
MPSLibrary* library;
const static std::string kLibraryName = "bbox_overlap";
if (library_manager->hasLibrary(kLibraryName))
library = library_manager->getLibrary(kLibraryName);
else
library = library_manager->createLibraryFromSouce(kLibraryName, kSourceCode);
auto func_pso = library->getComputePipelineState("bbox_overlap_mps_kernel");
// create command buffer and encoder
MTLCommandBuffer_t command_buffer = stream->commandBuffer();
MTLComputeCommandEncoder_t compute_encoder = [command_buffer computeCommandEncoder];
// set pso and buffer
int output_size = ious.numel();
int num_bbox1 = bboxes1.size(0);
int num_bbox2 = bboxes2.size(0);
int num_elements = output_size;
setMTLArgs(compute_encoder, func_pso, bboxes1, bboxes2, ious, num_bbox1, num_bbox2, mode, aligned,
offset);
// set grid size
MTLSize grid_size = MTLSizeMake(num_elements, 1, 1);
NSUInteger thread_group_size_x = func_pso.maxTotalThreadsPerThreadgroup;
if (thread_group_size_x > num_elements) {
thread_group_size_x = num_elements;
}
MTLSize thread_group_size = MTLSizeMake(thread_group_size_x, 1, 1);
// encoding
[compute_encoder dispatchThreads:grid_size threadsPerThreadgroup:thread_group_size];
[compute_encoder endEncoding];
// commit, not sure if flush is required
stream->commit(false);
}
void bbox_overlaps_mps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset) {
BBoxOverlapsMPSKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MPS, bbox_overlaps_mps);
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,
input, rois, grad_rois, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
Tensor rois, Tensor grad_rois, int pooled_height,
int pooled_width, float spatial_scale) {
prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,
pooled_height, pooled_width, spatial_scale);
}
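// A minimal sketch, assuming the usual mmcv binding style (the actual
// bindings live in mmcv/ops/csrc/pytorch/pybind.cpp, which this commit does
// not show): the entry points above are exposed to Python with pybind11
// roughly like this.
//
//   m.def("prroi_pool_forward", &prroi_pool_forward, "prroi_pool_forward",
//         py::arg("input"), py::arg("rois"), py::arg("output"),
//         py::arg("pooled_height"), py::arg("pooled_width"),
//         py::arg("spatial_scale"));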