Commit fdeee889 authored by limm

release v1.6.1 of mmcv

parent df465820
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <ATen/ATen.h>
#include <utils/spconv/spconv/mp_helper.h>
#include <utils/spconv/spconv/reordering.h>
#include <utils/spconv/tensorview/helper_launch.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <chrono>
#include <limits>
#include <spconv/reordering.cuh>
#include <type_traits>
#include <utils/spconv/tensorview/helper_kernel.cuh>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
namespace functor {
template <typename scalar_t, typename Index>
struct SparseGatherFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
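// Vectorized load type: int2 (8 bytes) for half, int4 (16 bytes) otherwise,
// so vecloadFactor below is 4 scalars per load for half/float and 2 for double.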
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> buffer,
tv::TensorView<const scalar_t> features,
tv::TensorView<const Index> indices, int size) {
if (size <= 0) return;
int numPlanes = features.dim(1);
bool notFound = true;
constexpr int vecloadFactor = sizeof(vecload_type_t) / sizeof(scalar_t);
mp_for_each<kernel_block_t>([=, &buffer, &features, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
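// nHotBlock is the largest multiple of NumTLP not exceeding size; the tiled
// block kernel covers these rows and gatherVecKernel below handles the
// remaining (size - nHotBlock) rows.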
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
gatherVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(buffer.data(), features.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
gatherVecKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(1, numPlanes / NumTLP),
dim3(NumTLP / NumILP, NumTLP / vecloadFactor), 0,
d.getStream()>>>(buffer.data() + nHotBlock * numPlanes,
features.data(), indices.data() + nHotBlock,
size - nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
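// Fallback when numPlanes is not a multiple of any tile width tried above:
// launch the generic, non-vectorized gather kernel.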
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
gatherGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
buffer.data(), features.data(), indices.data(), size, numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
template <typename scalar_t, typename Index>
struct SparseScatterAddFunctor<tv::TorchGPU, scalar_t, Index> {
using vecload_type_t =
std::conditional_t<std::is_same<scalar_t, at::Half>::value, int2, int4>;
using kernel_block_t = mp_list_c<int, 64, 32, 16>;
void operator()(const tv::TorchGPU &d, tv::TensorView<scalar_t> outFeatures,
tv::TensorView<const scalar_t> buffer,
tv::TensorView<const Index> indices, int size, bool stable) {
if (size <= 0) return;
int numPlanes = outFeatures.dim(1);
bool notFound = true;
constexpr int vecloadFactor =
sizeof(vecload_type_t) / sizeof(scalar_t); // important for half.
mp_for_each<kernel_block_t>([=, &d, &outFeatures, &buffer, &indices,
&notFound](auto NumTLP) {
constexpr int NumILP = NumTLP / 4;
int nHotBlock = (size / NumTLP) * NumTLP;
if (notFound) {
if (numPlanes % NumTLP == 0) {
if (nHotBlock >= NumTLP) {
scatterAddVecBlockKernel<scalar_t, Index, int(NumTLP), NumILP,
vecload_type_t>
<<<dim3(numPlanes / NumTLP, size / NumTLP),
dim3(NumTLP / vecloadFactor, NumTLP / NumILP), 0,
d.getStream()>>>(outFeatures.data(), buffer.data(),
indices.data(), nHotBlock,
numPlanes / vecloadFactor);
TV_CHECK_CUDA_ERR();
}
if (size - nHotBlock > 0) {
scatterAddGenericKernel<scalar_t, Index, int(NumTLP), NumILP>
<<<dim3(1, numPlanes / NumTLP), dim3(NumTLP / NumILP, NumTLP),
0, d.getStream()>>>(
outFeatures.data(), buffer.data() + nHotBlock * numPlanes,
indices.data() + nHotBlock, size - nHotBlock, numPlanes);
TV_CHECK_CUDA_ERR();
}
notFound = false;
}
}
});
if (notFound) {
constexpr int NumTLP = 64;
constexpr int NumILP = NumTLP / 4;
scatterAddGenericKernel<scalar_t, Index, NumTLP, NumILP>
<<<dim3(tv::launch::DivUp(size, NumTLP),
tv::launch::DivUp(numPlanes, NumTLP)),
dim3(NumTLP / NumILP, NumTLP), 0, d.getStream()>>>(
outFeatures.data(), buffer.data(), indices.data(), size,
numPlanes);
TV_CHECK_CUDA_ERR();
}
}
};
} // namespace functor
#define DECLARE_GPU_SPECS_T_INDEX(scalar_t, Index) \
template struct functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, Index>; \
template struct functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, \
Index>;
#define DECLARE_GPU_SPECS(scalar_t) DECLARE_GPU_SPECS_T_INDEX(scalar_t, int);
DECLARE_GPU_SPECS(float);
DECLARE_GPU_SPECS(double);
DECLARE_GPU_SPECS(at::Half);
#undef DECLARE_GPU_SPECS
#undef DECLARE_GPU_SPECS_T_INDEX
#include <cuda_runtime_api.h>
#include <torch/script.h>
#include <utils/spconv/spconv/indice.h>
#include <utils/spconv/spconv/reordering.h>
#include "../spconv_utils.h"
#include "pytorch_cuda_helper.hpp"
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor gridOut =
torch::full({batchSize * outputVolume}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
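// Layout: indicePairs[k][0][j] / indicePairs[k][1][j] hold the input/output
// indices of the j-th active pair for kernel offset k, indiceNum[k] counts
// the valid pairs per offset, and gridOut is a dense lookup table over
// batchSize * outputVolume output locations.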
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
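// Convert the shape parameters to 32-bit SimpleVectors. For submanifold
// (subM) convolution, stride is forced to 1 and padding to kernelSize / 2
// ("same" padding) so that output sites coincide with the input sites.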
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
template <unsigned NDim>
std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose) {
at::cuda::CUDAGuard device_guard(indices.device());
bool subM = _subM != 0;
bool transpose = _transpose != 0;
auto numAct = indices.size(0);
auto coorDim = indices.size(1) - 1;
TV_ASSERT_RT_ERR(NDim == coorDim, "error");
TV_ASSERT_RT_ERR(kernelSize.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outSpatialShape.size() == coorDim, "error");
TV_ASSERT_RT_ERR(stride.size() == coorDim, "error");
TV_ASSERT_RT_ERR(padding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(outPadding.size() == coorDim, "error");
TV_ASSERT_RT_ERR(dilation.size() == coorDim, "error");
auto kernelVolume = kernelSize[0];
for (int i = 1; i < kernelSize.size(); ++i) {
kernelVolume *= kernelSize[i];
}
TV_ASSERT_RT_ERR(kernelVolume <= 4096, "error");
auto outputVolume = outSpatialShape[0];
for (int i = 1; i < outSpatialShape.size(); ++i) {
outputVolume *= outSpatialShape[i];
}
TV_ASSERT_INVALID_ARG(gridOut.numel() >= outputVolume * batchSize, "error");
torch::Tensor indicePairs =
torch::full({kernelVolume, 2, numAct}, -1,
torch::dtype(torch::kInt32).device(indices.device()));
torch::Tensor indiceNum = torch::zeros(
{kernelVolume}, torch::dtype(torch::kInt32).device(indices.device()));
int64_t numActOut = -1;
tv::SimpleVector<int, NDim> outSpatialShape32;
tv::SimpleVector<int, NDim> kernelSize32;
tv::SimpleVector<int, NDim> stride32;
tv::SimpleVector<int, NDim> padding32;
tv::SimpleVector<int, NDim> dilation32;
auto indicePairUnique = torch::full(
{indicePairs.numel() / 2 + 1}, std::numeric_limits<int>::max(),
torch::dtype(torch::kInt32).device(indices.device()));
for (int i = 0; i < NDim; ++i) {
outSpatialShape32.push_back(outSpatialShape[i]);
kernelSize32.push_back(kernelSize[i]);
if (subM) {
stride32.push_back(1);
padding32.push_back(kernelSize[i] / 2);
dilation32.push_back(dilation[i]);
} else {
stride32.push_back(stride[i]);
padding32.push_back(padding[i]);
dilation32.push_back(dilation[i]);
}
}
if (subM) {
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose);
gridOut.fill_(-1);
} else {
auto getIndicePairFtor =
functor::CreateSubMIndicePairFunctor<tv::TorchGPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(gridOut), tv::torch2tv<int>(indicePairs),
tv::torch2tv<int>(indiceNum), kernelSize32, stride32, padding32,
dilation32, outSpatialShape32, transpose, true);
}
return {indices, indicePairs, indiceNum};
} else {
torch::Tensor outInds =
torch::zeros({numAct * kernelVolume, coorDim + 1},
torch::dtype(torch::kInt32).device(indices.device()));
if (indices.device().type() == torch::kCPU) {
auto getIndicePairFtor =
functor::CreateConvIndicePairFunctor<tv::CPU, int, int, NDim>();
numActOut = getIndicePairFtor(
tv::CPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
kernelSize32, stride32, padding32, dilation32, outSpatialShape32,
transpose, true);
gridOut.fill_(-1);
} else {
auto getIndicePairFtorP1 =
functor::CreateConvIndicePairFunctorP1<tv::TorchGPU, int, int,
NDim>();
auto getIndicePairFtorP2 =
functor::CreateConvIndicePairFunctorP2<tv::TorchGPU, int, int,
NDim>();
numActOut = getIndicePairFtorP1(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), kernelSize32, stride32,
padding32, dilation32, outSpatialShape32, transpose);
if (numActOut > 0) {
auto res = torch::_unique(indicePairUnique);
indicePairUnique = std::get<0>(res);
numActOut = getIndicePairFtorP2(
tv::TorchGPU(), tv::torch2tv<const int>(indices),
tv::torch2tv<int>(outInds), tv::torch2tv<int>(gridOut),
tv::torch2tv<int>(indicePairs), tv::torch2tv<int>(indiceNum),
tv::torch2tv<int>(indicePairUnique), outSpatialShape32, transpose,
true);
}
}
return {outInds.slice(0, 0, numActOut), indicePairs, indiceNum};
}
}
torch::Tensor IndiceConvForwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor indicePairs,
torch::Tensor indiceNum, int64_t numActOut, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
torch::Tensor output = torch::zeros({numActOut, numOutPlanes}, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
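// inputBuffer / outputBuffer are reused for every kernel offset, so they are
// sized for the offset with the largest number of active index pairs.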
filters = filters.view({-1, numInPlanes, numOutPlanes});
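// For submanifold convolution the offset with the most index pairs is the
// center offset (every active site pairs with itself), so its contribution
// is computed as one dense GEMM here and skipped in the loop below.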
if (subM) {
torch::mm_out(output, features, filters[indicePairMaxOffset]);
}
double totalGatherTime = 0;
double totalGEMMTime = 0;
double totalSAddTime = 0;
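// Gather -> GEMM -> scatter-add: for each kernel offset, gather the active
// input rows, multiply them by that offset's filter, and scatter-add the
// result into the corresponding output rows.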
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvForwardKernel", [&] {
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
/* slower than SparseGatherFunctor, possibly due to the int->long conversion:
auto indicePairLong = indicePairs[i][inverse].to(torch::kInt64);
auto indicePairBlob = torch::from_blob(indicePairLong.data_ptr<long>(),
{nHot}, indicePairOptions);
torch::index_select_out(inputBufferBlob, features, 0, indicePairBlob); */
}
torch::mm_out(outputBufferBlob, inputBufferBlob, filters[i]);
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(output),
tv::torch2tv<const scalar_t>(outputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse), nHot,
true);
TV_CHECK_CUDA_ERR();
}
});
}
return output;
}
std::vector<torch::Tensor> IndiceConvBackwardCUDAKernelLauncher(
torch::Tensor features, torch::Tensor filters, torch::Tensor outGrad,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t _inverse,
int64_t _subM) {
at::cuda::CUDAGuard device_guard(features.device());
bool subM = _subM != 0;
bool inverse = _inverse != 0;
auto device = features.device().type();
auto ndim = filters.dim() - 2;
auto kernelVolume = indicePairs.size(0);
auto numInPlanes = features.size(1);
auto numOutPlanes = filters.size(ndim + 1);
auto indicePairNumCpu = indiceNum.to({torch::kCPU});
auto indicePairMaxSizeIter =
std::max_element(indicePairNumCpu.data_ptr<int>(),
indicePairNumCpu.data_ptr<int>() + kernelVolume);
int indicePairMaxOffset =
indicePairMaxSizeIter - indicePairNumCpu.data_ptr<int>();
int indicePairMaxSize = *indicePairMaxSizeIter;
auto options =
torch::TensorOptions().dtype(features.dtype()).device(features.device());
auto filterShape = filters.sizes();
torch::Tensor inputGrad = torch::zeros(features.sizes(), options);
torch::Tensor filtersGrad = torch::zeros(filterShape, options);
torch::Tensor inputBuffer =
torch::zeros({indicePairMaxSize, numInPlanes}, options);
torch::Tensor outputBuffer =
torch::zeros({indicePairMaxSize, numOutPlanes}, options);
filters = filters.view({-1, numInPlanes, numOutPlanes});
filtersGrad = filtersGrad.view({-1, numInPlanes, numOutPlanes});
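// Dense gradients for the center offset of a submanifold convolution:
// dW = X^T * dY and dX = dY * W^T.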
if (subM) {
auto filterGradSub = filtersGrad[indicePairMaxOffset];
torch::mm_out(filterGradSub, features.t(), outGrad);
torch::mm_out(inputGrad, outGrad, filters[indicePairMaxOffset].t());
}
for (int i = 0; i < kernelVolume; ++i) {
auto nHot = indicePairNumCpu.data_ptr<int>()[i];
if (nHot <= 0 || (subM && i == indicePairMaxOffset)) {
continue;
}
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
features.scalar_type(), "IndiceConvBackwardKernel", [&] {
if (device == torch::kCPU) {
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtor;
functor::SparseGatherFunctor<tv::CPU, scalar_t, int> gatherFtorOut;
gatherFtor(tv::CPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
gatherFtorOut(
tv::CPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
} else {
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtor;
functor::SparseGatherFunctor<tv::TorchGPU, scalar_t, int>
gatherFtorOut;
gatherFtor(tv::TorchGPU(), tv::torch2tv<scalar_t>(inputBuffer),
tv::torch2tv<const scalar_t>(features),
tv::torch2tv<const int>(indicePairs).subview(i, inverse),
nHot);
TV_CHECK_CUDA_ERR();
gatherFtorOut(
tv::TorchGPU(), tv::torch2tv<scalar_t>(outputBuffer),
tv::torch2tv<const scalar_t>(outGrad),
tv::torch2tv<const int>(indicePairs).subview(i, !inverse),
nHot);
TV_CHECK_CUDA_ERR();
}
auto filterGradSub = filtersGrad[i];
auto outputBufferBlob = torch::from_blob(
outputBuffer.data_ptr<scalar_t>(), {nHot, numOutPlanes}, options);
auto inputBufferBlob = torch::from_blob(
inputBuffer.data_ptr<scalar_t>(), {nHot, numInPlanes}, options);
torch::mm_out(filterGradSub, inputBufferBlob.t(), outputBufferBlob);
torch::mm_out(inputBufferBlob, outputBufferBlob, filters[i].t());
if (device == torch::kCPU) {
functor::SparseScatterAddFunctor<tv::CPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::CPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
} else {
functor::SparseScatterAddFunctor<tv::TorchGPU, scalar_t, int>
scatterFtor;
scatterFtor(
tv::TorchGPU(), tv::torch2tv<scalar_t>(inputGrad),
tv::torch2tv<const scalar_t>(inputBuffer),
tv::torch2tv<const int>(indicePairs).subview(i, inverse), nHot);
TV_CHECK_CUDA_ERR();
}
});
}
return {inputGrad, filtersGrad.view(filterShape)};
}
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<2>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<3>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsForwardCUDAKernelLauncher<4>(
torch::Tensor indices, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<2>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
template std::vector<torch::Tensor> GetIndicePairsBackwardCUDAKernelLauncher<3>(
torch::Tensor indices, torch::Tensor gridOut, int64_t batchSize,
std::vector<int64_t> outSpatialShape, std::vector<int64_t> spatialShape,
std::vector<int64_t> kernelSize, std::vector<int64_t> stride,
std::vector<int64_t> padding, std::vector<int64_t> dilation,
std::vector<int64_t> outPadding, int64_t _subM, int64_t _transpose);
@@ -23,7 +23,7 @@ void ThreeInterpolateForwardCUDAKernelLauncher(int b, int c, int m, int n,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
@@ -51,7 +51,7 @@ void ThreeInterpolateBackwardCUDAKernelLauncher(int b, int c, int n, int m,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c, b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), c, b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -21,7 +21,7 @@ void ThreeNNForwardCUDAKernelLauncher(int b, int n, int m, const Tensor unknown,
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
// blockIdx.x(col), blockIdx.y(row)
- dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
+ dim3 blocks(GET_BLOCKS(n, THREADS_PER_BLOCK), b);
dim3 threads(THREADS_PER_BLOCK);
AT_DISPATCH_FLOATING_TYPES_AND_HALF(
......
@@ -145,6 +145,104 @@ int HardVoxelizeForwardCUDAKernelLauncher(
return voxel_num_int;
}
int NondeterministicHardVoxelizeForwardCUDAKernelLauncher(
const at::Tensor &points, at::Tensor &voxels, at::Tensor &coors,
at::Tensor &num_points_per_voxel, const std::vector<float> voxel_size,
const std::vector<float> coors_range, const int max_points,
const int max_voxels, const int NDim = 3) {
at::cuda::CUDAGuard device_guard(points.device());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
const int num_points = points.size(0);
const int num_features = points.size(1);
if (num_points == 0) return 0;
dim3 blocks(
std::min(at::cuda::ATenCeilDiv(num_points, THREADS_PER_BLOCK), 4096));
dim3 threads(THREADS_PER_BLOCK);
const float voxel_x = voxel_size[0];
const float voxel_y = voxel_size[1];
const float voxel_z = voxel_size[2];
const float coors_x_min = coors_range[0];
const float coors_y_min = coors_range[1];
const float coors_z_min = coors_range[2];
const float coors_x_max = coors_range[3];
const float coors_y_max = coors_range[4];
const float coors_z_max = coors_range[5];
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
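// Number of voxels along each axis, derived from the point cloud range and
// the voxel size.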
// map points to voxel coors
at::Tensor temp_coors =
at::zeros({num_points, NDim}, points.options().dtype(at::kInt));
// 1. link point to corresponding voxel coors
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "hard_voxelize_kernel", ([&] {
dynamic_voxelize_kernel<scalar_t, int><<<blocks, threads, 0, stream>>>(
points.contiguous().data_ptr<scalar_t>(),
temp_coors.contiguous().data_ptr<int>(), voxel_x, voxel_y, voxel_z,
coors_x_min, coors_y_min, coors_z_min, coors_x_max, coors_y_max,
coors_z_max, grid_x, grid_y, grid_z, num_points, num_features,
NDim);
}));
at::Tensor coors_map;
at::Tensor reduce_count;
auto coors_clean = temp_coors.masked_fill(temp_coors.lt(0).any(-1, true), -1);
std::tie(temp_coors, coors_map, reduce_count) =
at::unique_dim(coors_clean, 0, true, true, false);
if (temp_coors[0][0].lt(0).item<bool>()) {
// the first element of temp_coors is (-1,-1,-1) and should be removed
temp_coors = temp_coors.slice(0, 1);
coors_map = coors_map - 1;
}
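// temp_coors now holds the unique voxel coordinates; coors_map maps each
// point to its voxel index, with -1 for points that fell outside the range.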
int num_coors = temp_coors.size(0);
temp_coors = temp_coors.to(at::kInt);
coors_map = coors_map.to(at::kInt);
at::Tensor coors_count = at::zeros({1}, coors_map.options());
at::Tensor coors_order = at::empty({num_coors}, coors_map.options());
at::Tensor pts_id = at::zeros({num_points}, coors_map.options());
reduce_count = at::zeros({num_coors}, coors_map.options());
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "get_assign_pos", ([&] {
nondeterministic_get_assign_pos<<<blocks, threads, 0, stream>>>(
num_points, coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
coors_count.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>());
}));
AT_DISPATCH_ALL_TYPES(
points.scalar_type(), "assign_point_to_voxel", ([&] {
nondeterministic_assign_point_voxel<scalar_t>
<<<blocks, threads, 0, stream>>>(
num_points, points.contiguous().data_ptr<scalar_t>(),
coors_map.contiguous().data_ptr<int32_t>(),
pts_id.contiguous().data_ptr<int32_t>(),
temp_coors.contiguous().data_ptr<int32_t>(),
reduce_count.contiguous().data_ptr<int32_t>(),
coors_order.contiguous().data_ptr<int32_t>(),
voxels.contiguous().data_ptr<scalar_t>(),
coors.contiguous().data_ptr<int32_t>(),
num_points_per_voxel.contiguous().data_ptr<int32_t>(),
max_voxels, max_points, num_features, NDim);
}));
AT_CUDA_CHECK(cudaGetLastError());
return max_voxels < num_coors ? max_voxels : num_coors;
}
void DynamicVoxelizeForwardCUDAKernelLauncher(
const at::Tensor &points, at::Tensor &coors,
const std::vector<float> voxel_size, const std::vector<float> coors_range,
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
Tensor diff_iou_rotated_sort_vertices_forward_impl(Tensor vertices, Tensor mask,
Tensor num_valid) {
return DISPATCH_DEVICE_IMPL(diff_iou_rotated_sort_vertices_forward_impl,
vertices, mask, num_valid);
}
Tensor diff_iou_rotated_sort_vertices_forward(Tensor vertices, Tensor mask,
Tensor num_valid) {
return diff_iou_rotated_sort_vertices_forward_impl(vertices, mask, num_valid);
}
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
torch::Tensor fused_indice_conv_batchnorm_forward_impl(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return DISPATCH_DEVICE_IMPL(fused_indice_conv_batchnorm_forward_impl,
features, filters, bias, indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
torch::Tensor fused_indice_conv_batchnorm_forward(
torch::Tensor features, torch::Tensor filters, torch::Tensor bias,
torch::Tensor indicePairs, torch::Tensor indiceNum, int64_t numActOut,
int64_t _inverse, int64_t _subM) {
return fused_indice_conv_batchnorm_forward_impl(features, filters, bias,
indicePairs, indiceNum,
numActOut, _inverse, _subM);
}
@@ -19,31 +19,24 @@ void iou3d_boxes_overlap_bev_forward_impl(const int num_a, const Tensor boxes_a,
num_b, boxes_b, ans_overlap);
}
- void iou3d_boxes_iou_bev_forward_impl(const int num_a, const Tensor boxes_a,
- const int num_b, const Tensor boxes_b,
- Tensor ans_iou) {
- DISPATCH_DEVICE_IMPL(iou3d_boxes_iou_bev_forward_impl, num_a, boxes_a, num_b,
- boxes_b, ans_iou);
- }
- void iou3d_nms_forward_impl(const Tensor boxes, unsigned long long *mask,
- int boxes_num, float nms_overlap_thresh) {
- DISPATCH_DEVICE_IMPL(iou3d_nms_forward_impl, boxes, mask, boxes_num,
+ void iou3d_nms3d_forward_impl(const Tensor boxes, unsigned long long *mask,
+ int boxes_num, float nms_overlap_thresh) {
+ DISPATCH_DEVICE_IMPL(iou3d_nms3d_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
- void iou3d_nms_normal_forward_impl(const Tensor boxes, unsigned long long *mask,
- int boxes_num, float nms_overlap_thresh) {
- DISPATCH_DEVICE_IMPL(iou3d_nms_normal_forward_impl, boxes, mask, boxes_num,
+ void iou3d_nms3d_normal_forward_impl(const Tensor boxes,
+ unsigned long long *mask, int boxes_num,
+ float nms_overlap_thresh) {
+ DISPATCH_DEVICE_IMPL(iou3d_nms3d_normal_forward_impl, boxes, mask, boxes_num,
nms_overlap_thresh);
}
void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
Tensor ans_overlap) {
- // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
+ // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
@@ -51,20 +44,9 @@ void iou3d_boxes_overlap_bev_forward(Tensor boxes_a, Tensor boxes_b,
ans_overlap);
}
- void iou3d_boxes_iou_bev_forward(Tensor boxes_a, Tensor boxes_b,
- Tensor ans_iou) {
- // params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
- // params boxes_b: (M, 5)
- // params ans_overlap: (N, M)
- int num_a = boxes_a.size(0);
- int num_b = boxes_b.size(0);
- iou3d_boxes_iou_bev_forward_impl(num_a, boxes_a, num_b, boxes_b, ans_iou);
- }
- void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
- float nms_overlap_thresh) {
- // params boxes: (N, 5) [x1, y1, x2, y2, ry]
+ void iou3d_nms3d_forward(Tensor boxes, Tensor keep, Tensor keep_num,
+ float nms_overlap_thresh) {
+ // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_CONTIGUOUS(boxes);
CHECK_CONTIGUOUS(keep);
@@ -73,13 +55,14 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
- iou3d_nms_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
+ iou3d_nms3d_forward_impl(boxes, mask_data, boxes_num, nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
@@ -105,9 +88,9 @@ void iou3d_nms_forward(Tensor boxes, Tensor keep, Tensor keep_num,
}
}
- void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
- float nms_overlap_thresh) {
- // params boxes: (N, 5) [x1, y1, x2, y2, ry]
+ void iou3d_nms3d_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
+ float nms_overlap_thresh) {
+ // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
// params keep: (N)
CHECK_CONTIGUOUS(boxes);
@@ -117,14 +100,15 @@ void iou3d_nms_normal_forward(Tensor boxes, Tensor keep, Tensor keep_num,
int64_t *keep_data = keep.data_ptr<int64_t>();
int64_t *keep_num_data = keep_num.data_ptr<int64_t>();
- const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
+ const int col_blocks =
+ (boxes_num + THREADS_PER_BLOCK_NMS - 1) / THREADS_PER_BLOCK_NMS;
Tensor mask =
at::empty({boxes_num, col_blocks}, boxes.options().dtype(at::kLong));
unsigned long long *mask_data =
(unsigned long long *)mask.data_ptr<int64_t>();
- iou3d_nms_normal_forward_impl(boxes, mask_data, boxes_num,
- nms_overlap_thresh);
+ iou3d_nms3d_normal_forward_impl(boxes, mask_data, boxes_num,
+ nms_overlap_thresh);
at::Tensor mask_cpu = mask.to(at::kCPU);
unsigned long long *mask_host =
......
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void min_area_polygons_impl(const Tensor pointsets, Tensor polygons) {
DISPATCH_DEVICE_IMPL(min_area_polygons_impl, pointsets, polygons);
}
void min_area_polygons(const Tensor pointsets, Tensor polygons) {
min_area_polygons_impl(pointsets, polygons);
}
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset);
static void policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
const int32_t batch_num_all) {
auto union_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto core_num = union_num * core_dim;
// Union1 policyFunc
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_dim;
auto need_core_num = PAD_UP(batch_num_all, core_dim);
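// Use only as many clusters as needed to cover batch_num_all, capped at the
// number of clusters available on the device.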
k_dim->y =
(need_core_num < core_num) ? (need_core_num / core_dim) : union_num;
k_dim->z = 1;
return;
}
void BBoxOverlapsMLUKernelLauncher(const Tensor bboxes1, const Tensor bboxes2,
Tensor ious, const int32_t mode,
const bool aligned, const int32_t offset) {
// check dtype
TORCH_CHECK(
bboxes1.scalar_type() == at::kFloat || bboxes1.scalar_type() == at::kHalf,
"Data type of input should be Float or Half. But now input type is ",
bboxes1.scalar_type(), ".");
TORCH_CHECK(bboxes1.scalar_type() == bboxes2.scalar_type(),
"bboxes1's dtype should be the same with bboxes2's dtype.");
// params check
TORCH_CHECK(bboxes1.dim() == 2, "bboxes1 should be a 2d tensor, got ",
bboxes1.dim(), "D");
TORCH_CHECK(bboxes2.dim() == 2, "bboxes2 should be a 2d tensor, got ",
bboxes2.dim(), "D");
auto rows = bboxes1.size(0);
auto cols = bboxes2.size(0);
auto batch_num_all = rows;
if (rows * cols == 0) {
// return if zero element
return;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(&k_dim, &k_type, batch_num_all);
// get compute queue
cnrtQueue_t queue = torch_mlu::getCurQueue();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bboxes1.dtype());
// get ptr of tensors
auto bboxes1_impl = torch_mlu::getMluTensorImpl(bboxes1);
auto bboxes1_ptr = bboxes1_impl->cnnlMalloc();
auto bboxes2_impl = torch_mlu::getMluTensorImpl(bboxes2);
auto bboxes2_ptr = bboxes2_impl->cnnlMalloc();
auto ious_impl = torch_mlu::getMluTensorImpl(ious);
auto ious_ptr = ious_impl->cnnlMalloc();
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUUnion1BboxOverlapsKernel";
CNLOG(INFO) << "kDim :[ " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z
<< " ]";
KernelBBoxOverlaps(k_dim, k_type, queue, d_type, bboxes1_ptr, bboxes2_ptr,
ious_ptr, rows, cols, mode, aligned, offset);
}
void bbox_overlaps_mlu(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
BBoxOverlapsMLUKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MLU, bbox_overlaps_mlu);
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <string>
#include <vector>
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const int32_t N,
const int32_t C, const float alpha,
const float gamma, void *output);
void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const float gamma,
const float alpha, const int32_t dim_n,
const int32_t deal_n, const int32_t dim_c,
void *output);
// Policy Function for Forward
static void policyFuncForward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
const Tensor &input, const Tensor &target,
const Tensor &weight) {
auto N = input.size(0);
auto C = input.size(1);
const size_t nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const size_t c_align_size = PAD_UP((C * input.itemsize()), NFU_ALIGN_SIZE);
const int split_target_num = 2;
const int split_pipeline_num = 6;
const int has_weight = weight.data_ptr() != nullptr;
const int target_data_width = target.scalar_type() == at::kLong
? target.itemsize() / 2
: target.itemsize();
const int threshold_c =
PAD_DOWN((nram_size - split_target_num * sizeof(int)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
input.itemsize();
int n_seg = 1;
if (C <= threshold_c) {
int c_size = C * input.itemsize();
int reserved_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int weight_size = 0;
if (has_weight) {
c_size = c_align_size;
reserved_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
// n_seg * c_size * split_pipeline_num + n_seg * target.itemsize() *
// split_target_num
// + weight_size + reserved_align_size <= nram_size
n_seg = (nram_size - weight_size - reserved_align_size) /
(split_pipeline_num * c_size + split_target_num * sizeof(int32_t));
}
auto seg_num = n_seg == 0 ? N : (N + n_seg - 1) / n_seg;
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
auto core_num = core_dim * cluster_num;
k_dim->x = *k_type;
k_dim->y =
seg_num > core_num ? cluster_num : (seg_num + core_dim - 1) / core_dim;
k_dim->z = 1;
}
// Policy Function for Backward
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
// set Union1 Job
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim->z = 1;
}
void SigmoidFocalLossForwardMLUKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
// params check
TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
"But now gamma is ", gamma, ".");
// check dtype
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"Data type of input should be Float or Half. But now input type is ",
input.scalar_type(), ".");
TORCH_CHECK(
(target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),
"target type should be Int or Long. ", "But now target type is ",
target.scalar_type(), ".");
if (weight.data_ptr() != nullptr) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type(),
"Data types of input and weight should be the same. But now "
"input type is ",
input.scalar_type(), ", weight type is ", weight.scalar_type(),
".");
} else {
CNLOG(INFO) << "weight is a empty tensor.";
}
// return if zero-element
if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
return;
}
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
policyFuncForward(&k_dim, &k_type, input, target, weight);
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto input_ptr = input_impl->cnnlMalloc();
auto target_impl = torch_mlu::getMluTensorImpl(target);
auto target_ptr = target_impl->cnnlMalloc();
auto weight_impl = torch_mlu::getMluTensorImpl(weight);
auto weight_ptr = weight_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidForward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
// launch kernel
KernelFocalLossSigmoidForward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, input.size(0),
input.size(1), alpha, gamma, output_ptr);
}
void getDealNAndThresholdC(const int compute_data_bytes,
const int target_data_bytes, const int total_c,
int *deal_n_ptr, int *threshold_c_ptr,
const bool has_weight, const bool is_half) {
/* NRAM partition:
*
* |-----------------ping pong--------------------|
* |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight|
*
* split_pipeline_num is 5: including input, pt, alpha_t, temp, output.
*/
const int nram_split_num = 5;
const int nram_split_pingpong = 2;
const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
int32_t compute_align_size = NFU_ALIGN_SIZE;
if (is_half) {
compute_align_size += NFU_ALIGN_SIZE;
}
const int32_t compute_align_num = compute_align_size / compute_data_bytes;
// reserved_align_size: including input(ping pong), pt(ping pong),
// alpha_t(ping pong), temp(ping pong),
// output(ping pong), target(ping pong),
// flt_min and gamma.
const int reserved_align_size =
((nram_split_num + 1) * nram_split_pingpong + 2) * compute_align_size;
int nram_pingpong_size = max_nram_size - reserved_align_size;
int compute_c = total_c;
int threshold_c = 0;
if (has_weight) {
// reserved space for weight to align
nram_pingpong_size -= NFU_ALIGN_SIZE;
// threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
// nram_split_pingpong * target_data_bytes +
// threshold_c * compute_data_bytes <= nram_pingpong_size
threshold_c =
(nram_pingpong_size - nram_split_pingpong * target_data_bytes) /
(compute_data_bytes * (nram_split_num * nram_split_pingpong + 1));
threshold_c = PAD_DOWN(threshold_c, compute_align_num);
int weight_space = PAD_UP(total_c * compute_data_bytes, NFU_ALIGN_SIZE);
// reserved space for weight
nram_pingpong_size -= weight_space;
compute_c = PAD_UP(total_c, compute_align_num);
} else {
// threshold_c * nram_split_pingpong * compute_data_bytes * nram_split_num +
// nram_split_pingpong * target_data_bytes <= nram_pingpong_size
threshold_c =
(nram_pingpong_size / nram_split_pingpong - target_data_bytes) /
(nram_split_num * compute_data_bytes);
}
// deal_n * compute_c * nram_split_pingpong * compute_data_bytes *
// nram_split_num + deal_n * nram_split_pingpong * target_data_bytes <=
// nram_pingpong_size
*deal_n_ptr =
nram_pingpong_size /
((nram_split_num * compute_c * compute_data_bytes + target_data_bytes) *
nram_split_pingpong);
*threshold_c_ptr = threshold_c;
}
void SigmoidFocalLossBackwardMLUKernelLauncher(Tensor input, Tensor target,
Tensor weight, Tensor output,
const float gamma,
const float alpha) {
// params check
TORCH_CHECK(gamma >= 0, "gamma should be greater than or equal to 0. ",
"But now gamma is ", gamma, ".");
// check dtype
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"Data type of input should be Float or Half. But now input type is ",
input.scalar_type(), ".");
TORCH_CHECK(
(target.scalar_type() == at::kInt || target.scalar_type() == at::kLong),
"target type should be Int or Long. ", "But now target type is ",
target.scalar_type(), ".");
bool has_weight = false;
if (weight.data_ptr() != nullptr) {
TORCH_CHECK(weight.scalar_type() == input.scalar_type(),
"Data types of input and weight should be the same. But now "
"input type is ",
input.scalar_type(), ", weight type is ", weight.scalar_type(),
".");
has_weight = true;
} else {
CNLOG(INFO) << "weight is a empty tensor.";
}
auto dim_c = input.size(1);
const int compute_data_bytes = sizeof(float);
// target supports only INT on MLU device while it keeps LONG on host side,
// so target.itemsize() / 2
const int target_data_bytes = target.scalar_type() == at::kLong
? (target.itemsize() / 2)
: target.itemsize();
int deal_n = 0;
int threshold_c = 0;
bool is_half = false;
if (input.scalar_type() == at::kHalf) {
is_half = true;
}
// calculate deal_n and threshold_c
getDealNAndThresholdC(compute_data_bytes, target_data_bytes, dim_c, &deal_n,
&threshold_c, has_weight, is_half);
// check C
TORCH_CHECK(threshold_c >= dim_c,
"input.size(1) should be in the range of [0, ", threshold_c,
"]. ", "But now input.size(1) is ", dim_c, ".");
if (input.numel() == 0 || target.numel() == 0 || output.numel() == 0) {
// return if zero-element
return;
}
// set task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncBackward(&k_dim, &k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto input_ptr = input_impl->cnnlMalloc();
auto target_impl = torch_mlu::getMluTensorImpl(target);
auto target_ptr = target_impl->cnnlMalloc();
auto weight_impl = torch_mlu::getMluTensorImpl(weight);
auto weight_ptr = weight_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
// get dtype of input
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
auto core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto dim_n = input.size(0);
CNLOG(INFO) << "Launch Kernel KernelFocalLossSigmoidBackward<<<Union"
<< k_type / core_dim << ", " << k_dim.x << ", " << k_dim.y << ", "
<< k_dim.z << ">>>";
// launch kernel
KernelFocalLossSigmoidBackward(k_dim, k_type, queue, d_type, input_ptr,
target_ptr, weight_ptr, gamma, alpha, dim_n,
deal_n, dim_c, output_ptr);
}
void sigmoid_focal_loss_forward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha) {
SigmoidFocalLossForwardMLUKernelLauncher(input, target, weight, output, gamma,
alpha);
}
void sigmoid_focal_loss_backward_mlu(Tensor input, Tensor target, Tensor weight,
Tensor grad_input, float gamma,
float alpha) {
SigmoidFocalLossBackwardMLUKernelLauncher(input, target, weight, grad_input,
gamma, alpha);
}
void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
Tensor output, float gamma, float alpha);
void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
Tensor weight, Tensor grad_input,
float gamma, float alpha);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_forward_impl, MLU,
sigmoid_focal_loss_forward_mlu);
REGISTER_DEVICE_IMPL(sigmoid_focal_loss_backward_impl, MLU,
sigmoid_focal_loss_backward_mlu);
/*************************************************************************
* Copyright (C) 2021 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_ptr,
const void *scores_ptr, const int input_num_boxes,
const int input_stride, const int max_output_boxes,
const float iou_threshold, const float offset,
void *workspace_ptr, void *output_size_ptr, void *output_ptr);
int selectUnionType(uint32_t use_job, int box_num_per_core) {
// the box_num_per_core should be at least 256, otherwise the real IO
// bandwidth would be very low
while (box_num_per_core < 256 && use_job >= 4) {
box_num_per_core *= 2;
use_job /= 2;
}
return use_job;
}
Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold,
int offset) {
// dimension parameters check
TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
boxes.dim(), "D");
TORCH_CHECK(boxes.size(1) == 4,
"boxes should have 4 elements in dimension 1, got ",
boxes.size(1));
TORCH_CHECK(scores.dim() == 1, "scores should be a 1d tensor, got ",
scores.dim(), "D");
// data type check
TORCH_CHECK(boxes.scalar_type() == scores.scalar_type(),
"boxes should have the same type as scores");
TORCH_CHECK(
boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
"data type of boxes should be Float or Half, got ", boxes.scalar_type());
if (boxes.numel() == 0) {
return at::empty({0}, boxes.options().dtype(at::kLong));
}
int input_num_boxes = boxes.size(0);
int input_stride = boxes.size(0);
int max_output_boxes = boxes.size(0);
cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
cnrtDim3_t k_dim;
cnrtJobType_t k_type;
uint32_t union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t job_limit = union_number * core_dim;
uint32_t core_number = union_number * core_dim;
int box_num_per_core = (input_num_boxes + core_number - 1) / core_number;
// initiate k_type as Union1
k_dim.x = core_dim;
k_dim.y = 1;
k_dim.z = 1;
k_type = CNRT_FUNC_TYPE_UNION1;
int use_job = selectUnionType(job_limit, box_num_per_core);
if (use_job < 4) {
k_dim.x = 1;
k_type = CNRT_FUNC_TYPE_BLOCK;
} else if (use_job == 4) {
k_dim.x = core_dim;
k_type = CNRT_FUNC_TYPE_UNION1;
} else {
k_dim.x = use_job;
k_type = (cnrtFunctionType_t)use_job;
}
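// Fewer than 4 usable jobs launches a single Block task, exactly 4 launches
// one Union1 task, and more launches a larger UnionX task of that size.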
// transpose boxes (n, 4) to (4, n) for better performance
auto boxes_t = boxes.transpose(0, 1);
auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
auto scores_ = torch_mlu::cnnl::ops::cnnl_contiguous(scores);
auto output = at::empty({max_output_boxes}, boxes.options().dtype(at::kLong));
auto output_size = at::empty({1}, scores.options().dtype(at::kInt));
// workspace
const int info_num = 5; // x1, x2, y1, y2 and score
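// The workspace stores these five values per box (16-bit for half boxes,
// 32-bit floats otherwise) plus one extra float.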
size_t space_size = 0;
if (boxes.scalar_type() == at::kHalf) {
space_size = input_num_boxes * sizeof(int16_t) * info_num + sizeof(float);
} else {
space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float);
}
auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
// get compute queue
auto queue = torch_mlu::getCurQueue();
auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
auto boxes_ptr = boxes_impl->cnnlMalloc();
auto scores_impl = torch_mlu::getMluTensorImpl(scores_);
auto scores_ptr = scores_impl->cnnlMalloc();
auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
auto workspace_ptr = workspace_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output);
auto output_ptr = output_impl->cnnlMalloc();
auto output_size_impl = torch_mlu::getMluTensorImpl(output_size);
auto output_size_ptr = output_size_impl->cnnlMalloc();
CNLOG(INFO) << "Launch Kernel MLUUnionX NMS<<<Union" << k_type / core_dim
<< ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
KernelNms(k_dim, k_type, queue, data_type_input, boxes_ptr, scores_ptr,
input_num_boxes, input_stride, max_output_boxes, iou_threshold,
offset, workspace_ptr, output_size_ptr, output_ptr);
int output_num = *static_cast<int *>(output_size.cpu().data_ptr());
return output.slice(0, 0, output_num);
}
Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) {
return NMSMLUKernelLauncher(boxes, scores, iou_threshold, offset);
}
Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset);
REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu);
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <algorithm>
#include "psamask_utils.hpp"
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#define COMPUTE_COUNT_ALIGN 64
void KernelPsamaskForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *x, void *y, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int x_c, const int y_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg);
void KernelPsamaskBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *dy, void *dx, const PsamaskType psa_type,
const DimPartitionType core_partition,
const DimPartitionType cluster_partition, const int batch,
const int h_feature, const int w_feature, const int h_mask,
const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
const int half_w_mask, const int n_per_core, const int h_per_core,
const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
const int limit_h_seg, const int limit_w_seg);
namespace {
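// Split the (batch, h_feature) workload first across clusters and then across
// the cores of each cluster: partition along the batch dimension when it is
// large enough, otherwise along the feature height.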
void policyFunc(cnrtDim3_t *k_dim_ptr, cnrtFunctionType_t *f_type_ptr,
PartitionSeg *partition_ptr, const int n, const int h_feature) {
unsigned int core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
unsigned int use_cluster_num = cluster_num;
unsigned int use_core_num = core_dim;
if (n >= cluster_num || n >= h_feature) {
partition_ptr->cluster_partition = PARTITION_N;
partition_ptr->n_per_cluster = (n + cluster_num - 1) / cluster_num;
partition_ptr->h_per_cluster = h_feature;
use_cluster_num =
(n + partition_ptr->n_per_cluster - 1) / partition_ptr->n_per_cluster;
} else {
partition_ptr->cluster_partition = PARTITION_H;
partition_ptr->h_per_cluster = (h_feature + cluster_num - 1) / cluster_num;
partition_ptr->n_per_cluster = n;
use_cluster_num = (h_feature + partition_ptr->h_per_cluster - 1) /
partition_ptr->h_per_cluster;
}
if (partition_ptr->n_per_cluster >= core_dim ||
partition_ptr->n_per_cluster >= partition_ptr->h_per_cluster) {
partition_ptr->core_partition = PARTITION_N;
partition_ptr->n_per_core =
(partition_ptr->n_per_cluster + core_dim - 1) / core_dim;
partition_ptr->h_per_core = partition_ptr->h_per_cluster;
use_core_num =
(partition_ptr->n_per_cluster + partition_ptr->n_per_core - 1) /
partition_ptr->n_per_core;
} else {
partition_ptr->core_partition = PARTITION_H;
partition_ptr->h_per_core =
(partition_ptr->h_per_cluster + core_dim - 1) / core_dim;
partition_ptr->n_per_core = partition_ptr->n_per_cluster;
use_core_num =
(partition_ptr->h_per_cluster + partition_ptr->h_per_core - 1) /
partition_ptr->h_per_core;
}
*k_dim_ptr = {core_dim, use_cluster_num, 1};
}
} // namespace
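// Work out how large an n/h/w segment fits into one core's NRAM: prefer whole
// batch items, then whole feature rows, then aligned chunks along w; returns
// false when not even a minimally aligned slice of channels fits.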
bool findLimit(const int shape_core_n, const int shape_core_h,
const int shape_core_w, const int shape_core_ci,
const int shape_core_co, int *limit_n_seg_ptr,
int *limit_h_seg_ptr, int *limit_w_seg_ptr, const int psa_type) {
const bool need_temp = psa_type == 1;
const int input_bytes = sizeof(float);
int limit_n_seg = shape_core_n;
int limit_h_seg = shape_core_h;
int limit_w_seg = shape_core_w;
const int max_nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const int align_base_128 = NFU_ALIGN_SIZE / input_bytes;
const int align_base_64 = COMPUTE_COUNT_ALIGN / input_bytes;
const int align_co = CEIL_ALIGN(shape_core_co, align_base_64);
const int align_w = CEIL_ALIGN(shape_core_w, align_base_64);
const int align_hw = CEIL_ALIGN(shape_core_h * shape_core_w, align_base_64);
const int max_num = max_nram_size / input_bytes;
int n_limit =
max_num /
(CEIL_ALIGN(shape_core_h * shape_core_w * shape_core_ci, align_base_128) +
align_hw * align_co * (1 + need_temp));
if (n_limit > 0) {
n_limit = std::min(n_limit, shape_core_n);
limit_n_seg = n_limit;
} else {
int h_limit =
max_num / (CEIL_ALIGN(shape_core_w * shape_core_ci, align_base_128) +
align_w * align_co * (1 + need_temp));
if (h_limit > 0) {
h_limit = std::min(h_limit, shape_core_h);
limit_h_seg = h_limit;
limit_n_seg = 1;
} else {
int w_limit =
max_num / (CEIL_ALIGN(shape_core_ci, align_base_128) +
CEIL_ALIGN(align_co, align_base_128) * (1 + need_temp));
if (w_limit > 0 && w_limit >= (COMPUTE_COUNT_ALIGN / input_bytes)) {
w_limit = std::min(w_limit, shape_core_w);
w_limit = w_limit / (COMPUTE_COUNT_ALIGN / input_bytes) *
(COMPUTE_COUNT_ALIGN / input_bytes);
limit_w_seg = w_limit;
limit_h_seg = 1;
limit_n_seg = 1;
} else {
CNLOG(INFO) << "The size of input channel is too large.";
return false;
}
}
}
*limit_n_seg_ptr = limit_n_seg;
*limit_h_seg_ptr = limit_h_seg;
*limit_w_seg_ptr = limit_w_seg;
return true;
}
void PSAMaskForwardMLUKernelLauncher(const int psa_type, const Tensor x,
Tensor y, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
// params check
TORCH_CHECK(x.scalar_type() == at::kFloat, "x type should be Float, got ",
x.scalar_type());
TORCH_CHECK(y.scalar_type() == x.scalar_type(),
"y should have the same type as x");
TORCH_CHECK(x.dim() == 4, "x should be a 4d tensor, got ", x.dim(), "D");
TORCH_CHECK(y.dim() == 4, "y should be a 4d tensor, got ", y.dim(), "D");
int x_c = x.size(1);
int y_c = y.size(1);
TORCH_CHECK(h_mask * w_mask == x_c,
"channel of x should be the same as h_mask * w_mask");
TORCH_CHECK(h_feature * w_feature == y_c,
"channel of y should be the same as h_feature * w_feature");
TORCH_CHECK(psa_type == 0 || psa_type == 1,
"psa_type only suppurts 'COLLECT' and 'DISTRIBUTE' currently");
if (x.numel() == 0) {
CNLOG(INFO) << "skip zero-element tensor";
return;
}
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
PartitionSeg partition_info;
policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
int n_limit_seg, h_limit_seg, w_limit_seg;
bool ret =
findLimit(partition_info.n_per_core, partition_info.h_per_core, w_feature,
x_c, y_c, &n_limit_seg, &h_limit_seg, &w_limit_seg, psa_type);
  if (!ret) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(x.dim());
auto x_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(x, memory_format);
at::Tensor y_tmp =
at::empty({num_, y_c, h_feature, w_feature}, x.options(), memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto x_impl = torch_mlu::getMluTensorImpl(x_tensor);
auto x_ptr = x_impl->cnnlMalloc();
auto y_impl = torch_mlu::getMluTensorImpl(y_tmp);
auto y_ptr = y_impl->cnnlMalloc();
KernelPsamaskForward(
k_dim, k_type, queue, x_ptr, y_ptr, (PsamaskType)psa_type,
partition_info.core_partition, partition_info.cluster_partition, num_,
h_feature, w_feature, h_mask, w_mask, x_c, y_c, half_h_mask, half_w_mask,
partition_info.n_per_core, partition_info.h_per_core,
partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
h_limit_seg, w_limit_seg);
y.copy_(y_tmp);
}
void PSAMaskBackwardMLUKernelLauncher(const int psa_type, const Tensor dy,
Tensor dx, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask,
const int half_w_mask) {
// params check
TORCH_CHECK(dy.scalar_type() == at::kFloat, "dy type should be Float, got ",
dy.scalar_type());
TORCH_CHECK(dx.scalar_type() == dy.scalar_type(),
"dx should have the same type as dy");
TORCH_CHECK(dy.dim() == 4, "dy should be a 4d tensor, got ", dy.dim(), "D");
TORCH_CHECK(dx.dim() == 4, "dx should be a 4d tensor, got ", dx.dim(), "D");
int dy_c = dy.size(1);
int dx_c = dx.size(1);
TORCH_CHECK(h_feature * w_feature == dy_c,
"channel of dy should be the same as h_feature * w_feature");
TORCH_CHECK(h_mask * w_mask == dx_c,
"channel of dx should be the same as h_mask * w_mask");
TORCH_CHECK(psa_type == 0 || psa_type == 1,
"psa_type only suppurts 'COLLECT' and 'DISTRIBUTE' currently");
if (dx.numel() == 0) {
CNLOG(INFO) << "skip zero-element tensor";
return;
}
cnrtFunctionType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
PartitionSeg partition_info;
policyFunc(&k_dim, &k_type, &partition_info, num_, h_feature);
int n_limit_seg, h_limit_seg, w_limit_seg;
bool ret =
findLimit(partition_info.n_per_core, partition_info.h_per_core, w_feature,
dx_c, dy_c, &n_limit_seg, &h_limit_seg, &w_limit_seg, psa_type);
  if (!ret) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(dy.dim());
auto dy_tensor = torch_mlu::cnnl::ops::cnnl_contiguous(dy, memory_format);
at::Tensor dx_tmp = at::empty({num_, dx_c, h_feature, w_feature},
dy.options(), memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto dx_impl = torch_mlu::getMluTensorImpl(dx_tmp);
auto dx_ptr = dx_impl->cnnlMalloc();
auto dy_impl = torch_mlu::getMluTensorImpl(dy_tensor);
auto dy_ptr = dy_impl->cnnlMalloc();
KernelPsamaskBackward(
k_dim, k_type, queue, dy_ptr, dx_ptr, (PsamaskType)psa_type,
partition_info.core_partition, partition_info.cluster_partition, num_,
h_feature, w_feature, h_mask, w_mask, dx_c, dy_c, half_h_mask,
half_w_mask, partition_info.n_per_core, partition_info.h_per_core,
partition_info.n_per_cluster, partition_info.h_per_cluster, n_limit_seg,
h_limit_seg, w_limit_seg);
dx.copy_(dx_tmp);
}
void psamask_forward_mlu(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask) {
PSAMaskForwardMLUKernelLauncher(psa_type, input, output, num_, h_feature,
w_feature, h_mask, w_mask, half_h_mask,
half_w_mask);
}
void psamask_backward_mlu(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask) {
PSAMaskBackwardMLUKernelLauncher(psa_type, grad_output, grad_input, num_,
h_feature, w_feature, h_mask, w_mask,
half_h_mask, half_w_mask);
}
void psamask_forward_impl(const int psa_type, const Tensor input, Tensor output,
const int num_, const int h_feature,
const int w_feature, const int h_mask,
const int w_mask, const int half_h_mask,
const int half_w_mask);
void psamask_backward_impl(const int psa_type, const Tensor grad_output,
Tensor grad_input, const int num_,
const int h_feature, const int w_feature,
const int h_mask, const int w_mask,
const int half_h_mask, const int half_w_mask);
REGISTER_DEVICE_IMPL(psamask_forward_impl, MLU, psamask_forward_mlu);
REGISTER_DEVICE_IMPL(psamask_backward_impl, MLU, psamask_backward_mlu);
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *rois, const int channels,
const bool aligned, const int pooled_height,
const int pooled_width, const int input_height,
const int input_width, const int sampling_ratio,
const float spatial_scale, const int num_rois,
void *output);
void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t dtype,
const void *grads, const void *boxes,
void *grads_image, const int boxes_num,
const int hi, const int wi, const int c,
const int no, const int ho, const int wo,
const float spatial_scale, const int sampling_ratio,
const bool aligned);
void ROIAlignForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
// params check
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type());
TORCH_CHECK(rois.scalar_type() == input.scalar_type(),
"rois should have the same type as input");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "D");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_tensor =
torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
auto num_rois = rois.size(0);
auto channels = input.size(1);
int height = input.size(2);
int width = input.size(3);
if (output.numel() == 0) {
output = at::zeros({num_rois, channels, aligned_height, aligned_width},
input.options());
return;
}
at::Tensor output_tmp =
at::empty({num_rois, channels, aligned_height, aligned_width},
input.options(), memory_format);
// get tensor impl
auto self_impl = torch_mlu::getMluTensorImpl(input_tensor);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto self_ptr = self_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_ptr = output_impl->cnnlMalloc();
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
cnrtDim3_t k_dim;
k_dim.x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim.y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim.z = 1;
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input.dtype());
KernelRoiAlign(k_dim, k_type, queue, data_type, self_ptr, rois_ptr, channels,
aligned, aligned_height, aligned_width, height, width,
sampling_ratio, spatial_scale, num_rois, output_ptr);
output.copy_(output_tmp);
}
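// Round x up to the next power of two (bit-smearing trick); used below to
// choose how many cores the backward launch requests.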
static int nearestPower2(int x) {
x--;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x++;
return x;
}
void ROIAlignBackwardMLUKernelLauncher(Tensor grad, Tensor rois,
Tensor argmax_y, Tensor argmax_x,
Tensor grad_input, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode,
bool aligned) {
// params check
TORCH_CHECK(
grad.scalar_type() == at::kFloat || grad.scalar_type() == at::kHalf,
"grad type should be Float or Half, got ", grad.scalar_type());
TORCH_CHECK(rois.scalar_type() == grad.scalar_type(),
"rois should have the same type as grad");
TORCH_CHECK(grad.dim() == 4, "grad should be a 4d tensor, got ", grad.dim(),
"D");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D");
  TORCH_CHECK(pool_mode == 1, "pool_mode only supports 'avg' currently");
int batch_size = grad_input.size(0);
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(grad.dim());
auto grad_ = torch_mlu::cnnl::ops::cnnl_contiguous(grad, memory_format);
auto grad_input_ = at::empty({batch_size, channels, height, width},
grad.options(), memory_format)
.zero_();
int boxes_num = rois.size(0);
int hi = grad.size(2);
int wi = grad.size(3);
int c = grad.size(1);
int no = grad_input.size(0);
int ho = grad_input.size(2);
int wo = grad_input.size(3);
// get tensor impl
auto grad_impl = torch_mlu::getMluTensorImpl(grad_);
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto grad_ptr = grad_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
cnrtJobType_t k_type = CNRT_FUNC_TYPE_UNION1;
int need_core = nearestPower2(boxes_num);
int union_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
uint32_t dim_x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
uint32_t dim_y = (need_core - 1) / dim_x + 1;
dim_y = (dim_y > union_number) ? union_number : dim_y;
cnrtDim3_t k_dim = {dim_x, dim_y, 1};
cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad.dtype());
KernelRoiAlignBackward(k_dim, k_type, queue, k_dtype, grad_ptr, rois_ptr,
grad_input_ptr, boxes_num, hi, wi, c, no, ho, wo,
spatial_scale, sampling_ratio, aligned);
grad_input.copy_(grad_input_);
}
void roi_align_forward_mlu(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, int pool_mode, bool aligned) {
ROIAlignForwardMLUKernelLauncher(input, rois, output, argmax_y, argmax_x,
aligned_height, aligned_width, spatial_scale,
sampling_ratio, pool_mode, aligned);
}
void roi_align_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned) {
ROIAlignBackwardMLUKernelLauncher(
grad_output, rois, argmax_y, argmax_x, grad_input, aligned_height,
aligned_width, spatial_scale, sampling_ratio, pool_mode, aligned);
}
void roi_align_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax_y, Tensor argmax_x,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
void roi_align_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax_y,
Tensor argmax_x, Tensor grad_input,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
int pool_mode, bool aligned);
REGISTER_DEVICE_IMPL(roi_align_forward_impl, MLU, roi_align_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_backward_impl, MLU, roi_align_backward_mlu);
/*************************************************************************
* Copyright (C) 2022 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"
namespace {
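// Launch one UNION1 task per cluster: x spans the cores of a cluster and y
// grows with the number of output bins, capped at the cluster count.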
void policyFunc(int bin_num, cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
unsigned int core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
unsigned int cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
unsigned int use_cluster = (bin_num + core_num - 1) / core_num;
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
} // namespace
void KernelRoiAlignRotatedForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *features, const void *rois,
void *output, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams);
void KernelRoiAlignRotatedBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const void *top_grad, const void *rois,
void *bottom_grad, const int batch, const int height, const int width,
const int channel, const int rois_num,
const RoiAlignRotatedParams roiAlignRotatedParams);
void ROIAlignRotatedForwardMLUKernelLauncher(Tensor input, Tensor rois,
Tensor output, int pooled_height,
int pooled_width,
float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
TORCH_CHECK(((input.scalar_type() == output.scalar_type()) &&
(output.scalar_type() == rois.scalar_type())),
"data types of input, rois and output should be the same, ",
"but now input type is ", input.scalar_type(), ", rois type is ",
rois.scalar_type(), ", output type is ", output.scalar_type(),
".");
TORCH_CHECK(
(input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf),
"input type should be Float or Half, got ", input.scalar_type(), ".");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "D.");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D.");
TORCH_CHECK(output.dim() == 4, "output should be a 4d tensor, got ",
output.dim(), "D.");
TORCH_CHECK((rois.size(0) == output.size(0)),
"the 1st dimensions of rois and output should be the same, ",
"but now the 1st dimension of rois is ", rois.size(0),
", and output is ", output.size(0), ".");
TORCH_CHECK((input.size(1) == output.size(1)),
"the 2nd dimensions of input and output should be the same, ",
"but now the 2nd dimension of input is ", input.size(1),
", and output is ", output.size(1), ".");
int channel = input.size(1);
int width = input.size(3);
int height = input.size(2);
int batch = input.size(0);
int rois_nums = rois.size(0);
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(input.dtype());
// return if zero-elements
if (input.numel() == 0) {
CNLOG(INFO) << "Skip the zero-elements case.";
return;
}
RoiAlignRotatedParams roiAlignRotatedParams{pooled_height, pooled_width,
sampling_ratio, spatial_scale,
aligned, clockwise};
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_tensor =
torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
at::Tensor output_tmp =
at::empty({batch, channel, pooled_height, pooled_width}, input.options(),
memory_format);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input_tensor);
auto input_ptr = input_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output_tmp);
auto output_ptr = output_impl->cnnlMalloc();
KernelRoiAlignRotatedForward(k_dim, k_type, queue, d_type, input_ptr,
rois_ptr, output_ptr, batch, height, width,
channel, rois_nums, roiAlignRotatedParams);
output.copy_(output_tmp);
}
void ROIAlignRotatedBackwardMLUKernelLauncher(
Tensor top_grad, Tensor rois, Tensor bottom_grad, int pooled_height,
int pooled_width, float spatial_scale, int sampling_ratio, bool aligned,
bool clockwise) {
TORCH_CHECK(((top_grad.scalar_type() == bottom_grad.scalar_type()) &&
(bottom_grad.scalar_type() == rois.scalar_type())),
"data types of top_grad, rois and bottom_grad should be ",
"the same, but now top_grad type is ", top_grad.scalar_type(),
", rois type is ", rois.scalar_type(), ", bottom_grad type is ",
bottom_grad.scalar_type(), ".");
TORCH_CHECK((bottom_grad.scalar_type() == at::kFloat ||
bottom_grad.scalar_type() == at::kHalf),
"Data type of bottom_grad should be Float ro Half, got ",
bottom_grad.scalar_type(), ".");
TORCH_CHECK(bottom_grad.dim() == 4, "bottom_grad should be a 4d tensor, got ",
top_grad.dim(), "D.");
TORCH_CHECK(rois.dim() == 2, "rois should be a 2d tensor, got ", rois.dim(),
"D.");
TORCH_CHECK(top_grad.dim() == 4, "top_grad should be a 4d tensor, got ",
bottom_grad.dim(), "D.");
TORCH_CHECK((rois.size(0) == top_grad.size(0)),
"the 1st dimensions of rois and top_grad should be the same, ",
"but now the 1st dimension of rois is ", rois.size(0),
", and top_grad is ", top_grad.size(0), ".");
TORCH_CHECK((bottom_grad.size(1) == top_grad.size(1)),
"the 2nd dimensions of bottom_grad and top_grad should be ",
"the same, but now the 2nd dimension of bottom_grad is ",
bottom_grad.size(1), ", and top_grad is ", top_grad.size(1), ".");
int channel = bottom_grad.size(1);
int width = bottom_grad.size(3);
int height = bottom_grad.size(2);
int batch = bottom_grad.size(0);
int rois_nums = rois.size(0);
cnrtDataType_t d_type = torch_mlu::toCnrtDtype(bottom_grad.dtype());
// return if zero-elements
if (bottom_grad.numel() == 0) {
CNLOG(INFO) << "Skip the zero-elements case.";
return;
}
RoiAlignRotatedParams roiAlignRotatedParams{pooled_height, pooled_width,
sampling_ratio, spatial_scale,
aligned, clockwise};
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFunc(rois_nums * pooled_height * pooled_width, &k_dim, &k_type);
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(top_grad.dim());
auto top_grad_tensor =
torch_mlu::cnnl::ops::cnnl_contiguous(top_grad, memory_format);
at::Tensor bottom_grad_tmp = at::empty({batch, channel, height, width},
top_grad.options(), memory_format)
.zero_();
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto bottom_grad_impl = torch_mlu::getMluTensorImpl(bottom_grad_tmp);
auto bottom_grad_ptr = bottom_grad_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_ptr = rois_impl->cnnlMalloc();
auto top_grad_impl = torch_mlu::getMluTensorImpl(top_grad_tensor);
auto top_grad_ptr = top_grad_impl->cnnlMalloc();
KernelRoiAlignRotatedBackward(k_dim, k_type, queue, d_type, top_grad_ptr,
rois_ptr, bottom_grad_ptr, batch, height, width,
channel, rois_nums, roiAlignRotatedParams);
bottom_grad.copy_(bottom_grad_tmp);
}
void roi_align_rotated_forward_mlu(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise) {
ROIAlignRotatedForwardMLUKernelLauncher(input, rois, output, aligned_height,
aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_backward_mlu(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise) {
ROIAlignRotatedBackwardMLUKernelLauncher(
top_grad, rois, bottom_grad, aligned_height, aligned_width, spatial_scale,
sampling_ratio, aligned, clockwise);
}
void roi_align_rotated_forward_impl(Tensor input, Tensor rois, Tensor output,
int aligned_height, int aligned_width,
float spatial_scale, int sampling_ratio,
bool aligned, bool clockwise);
void roi_align_rotated_backward_impl(Tensor top_grad, Tensor rois,
Tensor bottom_grad, int aligned_height,
int aligned_width, float spatial_scale,
int sampling_ratio, bool aligned,
bool clockwise);
REGISTER_DEVICE_IMPL(roi_align_rotated_forward_impl, MLU,
roi_align_rotated_forward_mlu);
REGISTER_DEVICE_IMPL(roi_align_rotated_backward_impl, MLU,
roi_align_rotated_backward_mlu);
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input_data, const void *input_rois,
const int batch, const int channels, const int height,
const int width, const int pooled_height,
const int pooled_width, const int rois_num,
const float spatial_scale, void *output_data,
int *argmax);
void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t k_dtype,
const void *grad_output_ptr, const void *rois_ptr,
const int *argmax_ptr, void *grad_input_ptr,
const int box_num, const int pooled_height,
const int pooled_width, const int channels,
const int batch, const int height, const int width,
const float spatial_scale);
// policy function for forward
static void policyFuncForward(const int bin_num, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type) {
auto core_num = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto cluster_num = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = core_num;
unsigned int use_cluster = bin_num / core_num + (bin_num % core_num > 0);
k_dim->y = use_cluster > cluster_num ? cluster_num : use_cluster;
k_dim->z = 1;
}
void ROIPoolForwardMLUKernelLauncher(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height,
int pooled_width, float spatial_scale) {
// Check dtype.
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type());
TORCH_CHECK(input.scalar_type() == rois.scalar_type(),
"rois should have the same type as input");
// Check dtype relationship.
TORCH_CHECK(
argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
"argmax type should be Int or Long, got ", argmax.scalar_type());
// Check shape.
TORCH_CHECK(input.dim() == 4, "input should be 4d tensor, got ", input.dim(),
"D");
TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
"D");
TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
argmax.dim(), "D");
TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
"spatial_scale should be within (0, 1], got ", spatial_scale);
// compute kernel params
auto batch = input.size(0);
auto height = input.size(2);
auto width = input.size(3);
auto channels = input.size(1);
auto rois_num = output.size(0);
if (output.numel() == 0) {
output = at::zeros({rois_num, channels, pooled_height, pooled_width},
input.options());
return;
}
if (argmax.numel() == 0) {
argmax = at::zeros({rois_num, channels, pooled_height, pooled_width},
argmax.options());
return;
}
// zero element check
if (input.numel() == 0 || rois.numel() == 0 || output.numel() == 0 ||
argmax.numel() == 0) {
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(input.dim());
auto input_ = torch_mlu::cnnl::ops::cnnl_contiguous(input, memory_format);
at::Tensor output_ =
at::empty({rois_num, channels, pooled_height, pooled_width},
input.options(), memory_format);
at::Tensor argmax_ =
at::empty({rois_num, channels, pooled_height, pooled_width},
argmax.options(), memory_format);
// calculate task dimension
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncForward(rois_num * pooled_height * pooled_width, &k_dim, &k_type);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get ptr of tensors
auto input_impl = torch_mlu::getMluTensorImpl(input_);
auto input_ptr = input_impl->cnnlMalloc();
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto rois_ptr = rois_impl->cnnlMalloc();
auto output_impl = torch_mlu::getMluTensorImpl(output_);
auto output_ptr = output_impl->cnnlMalloc();
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
auto argmax_ptr = argmax_impl->cnnlMalloc();
  // get compute dtype of input
cnrtDataType_t data_type = torch_mlu::toCnrtDtype(input_.dtype());
// launch kernel
CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolForward<<<" << k_dim.x << ", "
<< k_dim.y << ", " << k_dim.z << ">>>";
KernelRoiPoolForward(k_dim, k_type, queue, data_type, input_ptr, rois_ptr,
batch, channels, height, width, pooled_height,
pooled_width, rois_num, spatial_scale, output_ptr,
(int *)argmax_ptr);
output.copy_(output_);
argmax.copy_(argmax_);
}
// policy function for backward
static void policyFuncBackward(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type) {
*k_type = CNRT_FUNC_TYPE_UNION1;
k_dim->x = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
k_dim->y = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
k_dim->z = 1;
}
void ROIPoolBackwardMLUKernelLauncher(Tensor grad_output, Tensor rois,
Tensor argmax, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale) {
// Check dtype.
TORCH_CHECK(
argmax.scalar_type() == at::kLong || argmax.scalar_type() == at::kInt,
"argmax type should be Int or Long, got ", argmax.scalar_type());
TORCH_CHECK((grad_output.scalar_type() == at::kFloat ||
grad_output.scalar_type() == at::kHalf),
"grad_output type should be FLoat or Half, got ",
grad_output.scalar_type());
// Check dtype relationship.
TORCH_CHECK((rois.scalar_type() == grad_output.scalar_type()),
"rois should have the same type as grad_output");
// Check shape.
TORCH_CHECK(grad_output.dim() == 4, "grad_output should be 4d tensor, got ",
grad_output.dim(), "D");
TORCH_CHECK(rois.dim() == 2, "rois should be 2d tensor, got ", rois.dim(),
"D");
TORCH_CHECK(argmax.dim() == 4, "argmax should be 4d tensor, got ",
argmax.dim(), "D");
TORCH_CHECK(spatial_scale > 0 && spatial_scale <= 1,
"spatial_scale should be within (0, 1], got ", spatial_scale);
// Check relationship between tensor.
// Check the relationship of n.
TORCH_CHECK(grad_output.size(0) == rois.size(0),
"grad_output.size(0) = ", grad_output.size(0),
", while rois.size(0) = ", rois.size(0),
". They should be the same.");
// Check the relationship of channels.
TORCH_CHECK(grad_output.size(1) == argmax.size(1),
"grad_output.size(1) = ", grad_output.size(1),
", while argmax.size(1) = ", argmax.size(1),
". They should be the same.");
// Check the relationship of height and width.
TORCH_CHECK(grad_output.size(2) == argmax.size(2),
"argmax.size(2) = ", argmax.size(2),
", while grad_output.size(2) = ", grad_output.size(2),
". They should be the same.");
TORCH_CHECK(grad_output.size(3) == argmax.size(3),
"argmax.size(3) = ", argmax.size(3),
", while grad_output.size(3) = ", grad_output.size(3),
". They should be the same.");
// Check zero element.
if (grad_output.numel() == 0 || rois.numel() == 0 || argmax.numel() == 0 ||
grad_input.numel() == 0) {
// return if zero-element
return;
}
auto memory_format =
torch_mlu::cnnl::ops::get_channels_last_memory_format(grad_output.dim());
auto grad_output_ =
torch_mlu::cnnl::ops::cnnl_contiguous(grad_output, memory_format);
auto argmax_ = torch_mlu::cnnl::ops::cnnl_contiguous(argmax, memory_format);
int boxes_num = grad_output.size(0);
int no = grad_input.size(0);
int channels = grad_input.size(1);
int height = grad_input.size(2);
int width = grad_input.size(3);
auto grad_input_ = at::empty({no, channels, height, width},
grad_input.options(), memory_format)
.zero_();
// get tensor impl
auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output_);
auto rois_impl = torch_mlu::getMluTensorImpl(rois);
auto argmax_impl = torch_mlu::getMluTensorImpl(argmax_);
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input_);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get mlu ptr
auto grad_output_ptr = grad_output_impl->cnnlMalloc();
auto rois_ptr = rois_impl->cnnlMalloc();
auto argmax_ptr = argmax_impl->cnnlMalloc();
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
// calculate task dimension
cnrtDataType_t k_dtype = torch_mlu::toCnrtDtype(grad_input.dtype());
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
policyFuncBackward(&k_dim, &k_type);
CNLOG(INFO) << "Launch Kernel MLUKernelRoiPoolBackward<<<" << k_dim.x << ", "
<< k_dim.y << ", " << k_dim.z << ">>>";
KernelRoiPoolBackward(k_dim, k_type, queue, k_dtype, grad_output_ptr,
rois_ptr, (int *)argmax_ptr, grad_input_ptr, boxes_num,
pooled_height, pooled_width, channels, no, height,
width, spatial_scale);
grad_input.copy_(grad_input_);
}
void roi_pool_forward_mlu(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale) {
ROIPoolForwardMLUKernelLauncher(input, rois, output, argmax, pooled_height,
pooled_width, spatial_scale);
}
void roi_pool_backward_mlu(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
ROIPoolBackwardMLUKernelLauncher(grad_output, rois, argmax, grad_input,
pooled_height, pooled_width, spatial_scale);
}
void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
Tensor argmax, int pooled_height, int pooled_width,
float spatial_scale);
void roi_pool_backward_impl(Tensor grad_output, Tensor rois, Tensor argmax,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale);
REGISTER_DEVICE_IMPL(roi_pool_forward_impl, MLU, roi_pool_forward_mlu);
REGISTER_DEVICE_IMPL(roi_pool_backward_impl, MLU, roi_pool_backward_mlu);
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "pytorch_device_registry.hpp"
#include "pytorch_mlu_helper.hpp"
void KernelTinShiftForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *input, const void *shifts, void *output, const int batch_size,
const int time_size, const int channel_size, const int hw_size,
const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core);
void KernelTinShiftBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const void *grad_output, const void *shifts, void *grad_input,
const int batch_size, const int time_size, const int channel_size,
const int hw_size, const int group_size, const int group_channel,
const cnrtDataType_t data_dtype, const int channel_per_core,
const int max_number_hw_per_core, const int max_length_per_core);
// policy function
static void policyFunc(const Tensor &input, cnrtDim3_t *k_dim,
cnrtFunctionType_t *k_type, int *channel_per_core,
int *max_number_hw_per_core, int *max_length_per_core) {
const int32_t cluster_limit = torch_mlu::getDeviceAttr(cnrtAttrClusterCount);
const int32_t core_limit = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
auto nram_size = torch_mlu::getDeviceAttr(cnrtAttrNramSizePerMcore);
const int core_num = core_limit * cluster_limit;
const int batch_size = input.size(0);
const int time_size = input.size(1);
const int channel_size = input.size(2);
const int hw_size = input.size(3);
const size_t size_per_channel = time_size * hw_size * input.itemsize();
*channel_per_core = nram_size / size_per_channel;
int task_dim = 0;
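  // if NRAM cannot hold one full (time, hw) channel, fall back to splitting a
  // channel by hw rows, or by raw elements if even a single row is too large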
if (*channel_per_core == 0) {
const size_t size_per_hw = hw_size * input.itemsize();
*max_number_hw_per_core = nram_size / size_per_hw;
if (*max_number_hw_per_core <= 0) {
*max_length_per_core = nram_size / input.itemsize();
}
int tmp_max_number_hw_per_core =
*max_number_hw_per_core > 0 ? *max_number_hw_per_core : 1;
const int loop_time =
(time_size / (tmp_max_number_hw_per_core)) +
((time_size % (tmp_max_number_hw_per_core)) > 0 ? 1 : 0);
task_dim = batch_size * channel_size * loop_time < core_num
? batch_size * channel_size * loop_time
: core_num;
} else {
task_dim = batch_size * channel_size < core_num ? batch_size * channel_size
: core_num;
}
k_dim->x = core_limit;
k_dim->y = (task_dim / core_limit) > 0 ? (task_dim / core_limit) : 1;
k_dim->z = 1;
*k_type = CNRT_FUNC_TYPE_UNION1;
}
void TINShiftForwardMLUKernelLauncher(Tensor input, Tensor shift,
Tensor output) {
// params check
TORCH_CHECK(
input.scalar_type() == at::kFloat || input.scalar_type() == at::kHalf,
"input type should be Float or Half, got ", input.scalar_type(), ".");
TORCH_CHECK(input.dim() == 4, "input should be a 4d tensor, got ",
input.dim(), "d.");
TORCH_CHECK(shift.dim() == 2, "shift should be a 2d tensor, got ",
shift.dim(), "d.");
TORCH_CHECK(
input.size(0) == shift.size(0),
"input batch size should be the same as shift's, input batch size is ",
input.size(0), " and shift batch size is ", shift.size(0), ".");
TORCH_CHECK(input.size(0) != 0, "Input batch size should not be zero.");
TORCH_CHECK(input.size(3) != 0,
"The last dim size of input should not be zero.");
if (input.size(1) == 0) {
return;
}
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
int channel_per_core = 0;
int max_number_hw_per_core = 0;
int max_length_per_core = 0;
policyFunc(input, &k_dim, &k_type, &channel_per_core, &max_number_hw_per_core,
&max_length_per_core);
const int batch_size = input.size(0);
const int time_size = input.size(1);
const int channel_size = input.size(2);
const int hw_size = input.size(3);
const int group_size = shift.size(1);
int group_channel = channel_size / group_size;
// get tensor impl
auto input_impl = torch_mlu::getMluTensorImpl(input);
auto shift_impl = torch_mlu::getMluTensorImpl(shift);
auto output_impl = torch_mlu::getMluTensorImpl(output);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto input_ptr = input_impl->cnnlMalloc();
auto shift_ptr = shift_impl->cnnlMalloc();
auto output_ptr = output_impl->cnnlMalloc();
cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(input.dtype());
KernelTinShiftForward(k_dim, k_type, queue, input_ptr, shift_ptr, output_ptr,
batch_size, time_size, channel_size, hw_size,
group_size, group_channel, data_dtype, channel_per_core,
max_number_hw_per_core, max_length_per_core);
}
void TINShiftBackwardMLUKernelLauncher(Tensor grad_output, Tensor shift,
Tensor grad_input) {
// params check
TORCH_CHECK(grad_output.scalar_type() == at::kFloat ||
grad_output.scalar_type() == at::kHalf,
"grad_output type should be Float or Half, got ",
grad_output.scalar_type(), ".");
TORCH_CHECK(grad_output.dim() == 4, "grad_output should be a 4d tensor, got ",
grad_output.dim(), "d.");
TORCH_CHECK(shift.dim() == 2, "shift should be a 2d tensor, got ",
shift.dim(), "d.");
TORCH_CHECK(grad_output.size(0) == shift.size(0),
"grad_output batch size should be the same as shift's, "
"grad_output batch size is ",
grad_output.size(0), ", shift batch size is ", shift.size(0),
".");
TORCH_CHECK(grad_output.size(0) != 0,
"grad_output batch size should not be zero.");
TORCH_CHECK(grad_output.size(3) != 0,
"The last dim size of grad_output should not be zero.");
if (grad_output.size(1) == 0) {
return;
}
cnrtDim3_t k_dim;
cnrtFunctionType_t k_type;
int channel_per_core = 0;
int max_number_hw_per_core = 0;
int max_length_per_core = 0;
policyFunc(grad_output, &k_dim, &k_type, &channel_per_core,
&max_number_hw_per_core, &max_length_per_core);
const int batch_size = grad_output.size(0);
const int time_size = grad_output.size(1);
const int channel_size = grad_output.size(2);
const int hw_size = grad_output.size(3);
const int group_size = shift.size(1);
int group_channel = channel_size / group_size;
// get tensor impl
auto grad_output_impl = torch_mlu::getMluTensorImpl(grad_output);
auto shift_impl = torch_mlu::getMluTensorImpl(shift);
auto grad_input_impl = torch_mlu::getMluTensorImpl(grad_input);
// get compute queue
auto queue = torch_mlu::getCurQueue();
// get the mlu ptr
auto grad_output_ptr = grad_output_impl->cnnlMalloc();
auto shift_ptr = shift_impl->cnnlMalloc();
auto grad_input_ptr = grad_input_impl->cnnlMalloc();
cnrtDataType_t data_dtype = torch_mlu::toCnrtDtype(grad_output.dtype());
KernelTinShiftBackward(k_dim, k_type, queue, grad_output_ptr, shift_ptr,
grad_input_ptr, batch_size, time_size, channel_size,
hw_size, group_size, group_channel, data_dtype,
channel_per_core, max_number_hw_per_core,
max_length_per_core);
}
void tin_shift_forward_mlu(Tensor input, Tensor shift, Tensor output) {
TINShiftForwardMLUKernelLauncher(input, shift, output);
}
void tin_shift_backward_mlu(Tensor grad_output, Tensor shift,
Tensor grad_input) {
TINShiftBackwardMLUKernelLauncher(grad_output, shift, grad_input);
}
void tin_shift_forward_impl(Tensor input, Tensor shift, Tensor output);
void tin_shift_backward_impl(Tensor grad_output, Tensor shift,
Tensor grad_input);
REGISTER_DEVICE_IMPL(tin_shift_forward_impl, MLU, tin_shift_forward_mlu);
REGISTER_DEVICE_IMPL(tin_shift_backward_impl, MLU, tin_shift_backward_mlu);
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include "pytorch_device_registry.hpp"
#include "MPSLibrary.h"
#include "MPSStream.h"
#include "MPSUtils.h"
using at::Tensor;
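// Metal source for the overlap kernel: each thread handles one box pair and
// writes interS / baseS, where baseS is the union area (mode == 0) or the
// area of the first box (mode == 1).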
const static std::string kSourceCode = R"(
#include <metal_math>
#include <metal_stdlib>
using namespace metal;
kernel void bbox_overlap_mps_kernel(constant const float4* bboxes1,
constant const float4* bboxes2,
device float* ious,
constant int& num_bbox1,
constant int& num_bbox2,
constant int& mode,
constant bool& aligned,
constant int& offset,
uint index [[thread_position_in_grid]])
{
int base1 = index;
int base2 = index;
if(!aligned){
base1 = index / num_bbox2;
base2 = index % num_bbox2;
}
const float f_offset = float(offset);
const float4 b1 = bboxes1[base1];
const float b1_area = (b1[2]-b1[0]+f_offset)*(b1[3]-b1[1]+f_offset);
const float4 b2 = bboxes2[base2];
const float b2_area = (b2[2]-b2[0]+f_offset)*(b2[3]-b2[1]+f_offset);
const float2 left_top = fmax(b1.xy, b2.xy);
const float2 right_bottom = fmin(b1.zw, b2.zw);
const float2 wh = fmax(right_bottom - left_top + f_offset, 0.0f);
const float interS = wh.x * wh.y;
const float baseS =
fmax(mode == 0 ? b1_area + b2_area - interS : b1_area, f_offset);
ious[index] = interS / baseS;
}
)";
void BBoxOverlapsMPSKernelLauncher(const Tensor bboxes1, const Tensor bboxes2, Tensor ious,
const int mode, const bool aligned, const int offset) {
// get stream
auto stream = at::mps::getCurrentMPSStream();
auto library_manager = MPSLibraryManager::getInstance();
MPSLibrary* library;
const static std::string kLibraryName = "bbox_overlap";
if (library_manager->hasLibrary(kLibraryName))
library = library_manager->getLibrary(kLibraryName);
else
library = library_manager->createLibraryFromSouce(kLibraryName, kSourceCode);
auto func_pso = library->getComputePipelineState("bbox_overlap_mps_kernel");
// create command buffer and encoder
MTLCommandBuffer_t command_buffer = stream->commandBuffer();
MTLComputeCommandEncoder_t compute_encoder = [command_buffer computeCommandEncoder];
// set pso and buffer
int output_size = ious.numel();
int num_bbox1 = bboxes1.size(0);
int num_bbox2 = bboxes2.size(0);
int num_elements = output_size;
setMTLArgs(compute_encoder, func_pso, bboxes1, bboxes2, ious, num_bbox1, num_bbox2, mode, aligned,
offset);
// set grid size
MTLSize grid_size = MTLSizeMake(num_elements, 1, 1);
NSUInteger thread_group_size_x = func_pso.maxTotalThreadsPerThreadgroup;
if (thread_group_size_x > num_elements) {
thread_group_size_x = num_elements;
}
MTLSize thread_group_size = MTLSizeMake(thread_group_size_x, 1, 1);
// encoding
[compute_encoder dispatchThreads:grid_size threadsPerThreadgroup:thread_group_size];
[compute_encoder endEncoding];
// commit, not sure if flush is required
stream->commit(false);
}
void bbox_overlaps_mps(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset) {
BBoxOverlapsMPSKernelLauncher(bboxes1, bboxes2, ious, mode, aligned, offset);
}
void bbox_overlaps_impl(const Tensor bboxes1, const Tensor bboxes2, Tensor ious, const int mode,
const bool aligned, const int offset);
REGISTER_DEVICE_IMPL(bbox_overlaps_impl, MPS, bbox_overlaps_mps);
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void points_in_polygons_forward_impl(const Tensor points, const Tensor polygons,
Tensor output, const int rows,
const int cols) {
DISPATCH_DEVICE_IMPL(points_in_polygons_forward_impl, points, polygons,
output, rows, cols);
}
void points_in_polygons_forward(Tensor points, Tensor polygons, Tensor output) {
int rows = points.size(0);
int cols = polygons.size(0);
points_in_polygons_forward_impl(points, polygons, output, rows, cols);
}
// Copyright (c) OpenMMLab. All rights reserved
#include "pytorch_cpp_helper.hpp"
#include "pytorch_device_registry.hpp"
void prroi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_forward_impl, input, rois, output,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_backward_impl(Tensor grad_output, Tensor rois,
Tensor grad_input, int pooled_height,
int pooled_width, float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_backward_impl, grad_output, rois, grad_input,
pooled_height, pooled_width, spatial_scale);
}
void prroi_pool_coor_backward_impl(Tensor output, Tensor grad_output,
Tensor input, Tensor rois, Tensor grad_rois,
int pooled_height, int pooled_width,
float spatial_scale) {
DISPATCH_DEVICE_IMPL(prroi_pool_coor_backward_impl, output, grad_output,
input, rois, grad_rois, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_forward(Tensor input, Tensor rois, Tensor output,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_forward_impl(input, rois, output, pooled_height, pooled_width,
spatial_scale);
}
void prroi_pool_backward(Tensor grad_output, Tensor rois, Tensor grad_input,
int pooled_height, int pooled_width,
float spatial_scale) {
prroi_pool_backward_impl(grad_output, rois, grad_input, pooled_height,
pooled_width, spatial_scale);
}
void prroi_pool_coor_backward(Tensor output, Tensor grad_output, Tensor input,
Tensor rois, Tensor grad_rois, int pooled_height,
int pooled_width, float spatial_scale) {
prroi_pool_coor_backward_impl(output, grad_output, input, rois, grad_rois,
pooled_height, pooled_width, spatial_scale);
}