OpenDAS / MMCV / Commits

Commit 6f3c5f1c, authored Jul 11, 2024 by limm

    support v1.4.0

Parent: 6f674c7e
Changes: 339
Showing 20 changed files with 128 additions and 5983 deletions (+128 -5983):
mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh           +63  -53
mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh      +0  -129
mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh             +2  -2
mmcv/ops/csrc/common/cuda/spconv/indice.cuh                          +0  -236
mmcv/ops/csrc/common/cuda/spconv/reordering.cuh                      +0  -160
mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh           +0  -68
mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh         +0  -97
mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh         +21  -21
mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh                  +37  -38
mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh               +5  -52
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu                +0  -322
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu                       +0  -552
mmcv/ops/csrc/common/mlu/carafe_utils.hpp                            +0  -95
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp                       +0  -398
mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu              +0  -712
mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu           +0  -888
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu                        +0  -431
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp                             +0  -695
mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu                +0  -181
mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu               +0  -853
mmcv/ops/csrc/common/cuda/roipoint_pool3d_cuda_kernel.cuh
@@ -42,23 +42,23 @@ __global__ void assign_pts_to_box3d(int batch_size, int pts_num, int boxes_num,
   // params boxes3d: (B, M, 7)
   // params pts_assign: (B, N, M): idx of the corresponding box3d, -1 means
   // background points
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
   int bs_idx = blockIdx.z;
-  CUDA_1D_KERNEL_LOOP(pt_idx, pts_num) {
-    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
-    int assign_idx =
-        bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
-    pts_assign[assign_idx] = 0;
-    int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
-    int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
-    T local_x = 0, local_y = 0;
-    int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
-                                        local_x, local_y);
-    pts_assign[assign_idx] = cur_in_flag;
-  }
+  if (pt_idx >= pts_num || box_idx >= boxes_num || bs_idx >= batch_size) {
+    return;
+  }
+  int assign_idx = bs_idx * pts_num * boxes_num + pt_idx * boxes_num + box_idx;
+  pts_assign[assign_idx] = 0;
+  int box_offset = bs_idx * boxes_num * 7 + box_idx * 7;
+  int pt_offset = bs_idx * pts_num * 3 + pt_idx * 3;
+  T local_x = 0, local_y = 0;
+  int cur_in_flag = check_pt_in_box3d(xyz + pt_offset, boxes3d + box_offset,
+                                      local_x, local_y);
+  pts_assign[assign_idx] = cur_in_flag;
 }

 __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
@@ -69,32 +69,35 @@ __global__ void get_pooled_idx(int batch_size, int pts_num, int boxes_num,
   // params pts_assign: (B, N)
   // params pts_idx: (B, M, 512)
   // params pooled_empty_flag: (B, M)
-  CUDA_1D_KERNEL_LOOP(boxes_idx, boxes_num) {
-    int bs_idx = blockIdx.y;
-    int cnt = 0;
-    for (int k = 0; k < pts_num; k++) {
-      if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num +
-                     boxes_idx]) {
-        if (cnt < sampled_pts_num) {
-          pts_idx[bs_idx * boxes_num * sampled_pts_num +
-                  boxes_idx * sampled_pts_num + cnt] = k;
-          cnt++;
-        } else
-          break;
-      }
-    }
+  int boxes_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (boxes_idx >= boxes_num) {
+    return;
+  }
+  int bs_idx = blockIdx.y;
+  int cnt = 0;
+  for (int k = 0; k < pts_num; k++) {
+    if (pts_assign[bs_idx * pts_num * boxes_num + k * boxes_num + boxes_idx]) {
+      if (cnt < sampled_pts_num) {
+        pts_idx[bs_idx * boxes_num * sampled_pts_num +
+                boxes_idx * sampled_pts_num + cnt] = k;
+        cnt++;
+      } else
+        break;
+    }
+  }
-    if (cnt == 0) {
-      pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
-    } else if (cnt < sampled_pts_num) {
-      // duplicate same points for sampling
-      for (int k = cnt; k < sampled_pts_num; k++) {
-        int duplicate_idx = k % cnt;
-        int base_offset = bs_idx * boxes_num * sampled_pts_num +
-                          boxes_idx * sampled_pts_num;
-        pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
-      }
-    }
-  }
+  if (cnt == 0) {
+    pooled_empty_flag[bs_idx * boxes_num + boxes_idx] = 1;
+  } else if (cnt < sampled_pts_num) {
+    // duplicate same points for sampling
+    for (int k = cnt; k < sampled_pts_num; k++) {
+      int duplicate_idx = k % cnt;
+      int base_offset =
+          bs_idx * boxes_num * sampled_pts_num + boxes_idx * sampled_pts_num;
+      pts_idx[base_offset + k] = pts_idx[base_offset + duplicate_idx];
+    }
+  }
 }
@@ -109,26 +112,33 @@ __global__ void roipoint_pool3d_forward(
   // params pts_feature: (B, N, C)
   // params pooled_features: (B, M, 512, 3+C)
   // params pooled_empty_flag: (B, M)
+  int sample_pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
   int box_idx = blockIdx.y;
   int bs_idx = blockIdx.z;
-  CUDA_1D_KERNEL_LOOP(sample_pt_idx, sampled_pts_num) {
-    if (box_idx >= boxes_num || bs_idx >= batch_size) return;
-    if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) return;
-    int temp_idx = bs_idx * boxes_num * sampled_pts_num +
-                   box_idx * sampled_pts_num + sample_pt_idx;
-    int src_pt_idx = pts_idx[temp_idx];
-    int dst_feature_offset = temp_idx * (3 + feature_in_len);
-    for (int j = 0; j < 3; j++)
-      pooled_features[dst_feature_offset + j] =
-          xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
-    int src_feature_offset =
-        bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
-    memcpy(pooled_features + dst_feature_offset + 3,
-           pts_feature + src_feature_offset, feature_in_len * sizeof(T));
-  }
+  if (sample_pt_idx >= sampled_pts_num || box_idx >= boxes_num ||
+      bs_idx >= batch_size) {
+    return;
+  }
+  if (pooled_empty_flag[bs_idx * boxes_num + box_idx]) {
+    return;
+  }
+  int temp_idx = bs_idx * boxes_num * sampled_pts_num +
+                 box_idx * sampled_pts_num + sample_pt_idx;
+  int src_pt_idx = pts_idx[temp_idx];
+  int dst_feature_offset = temp_idx * (3 + feature_in_len);
+  for (int j = 0; j < 3; j++)
+    pooled_features[dst_feature_offset + j] =
+        xyz[bs_idx * pts_num * 3 + src_pt_idx * 3 + j];
+  int src_feature_offset =
+      bs_idx * pts_num * feature_in_len + src_pt_idx * feature_in_len;
+  memcpy(pooled_features + dst_feature_offset + 3,
+         pts_feature + src_feature_offset, feature_in_len * sizeof(T));
 }
#endif // ROIPOINT_POOL3D_CUDA_KERNEL_CUH
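The recurring edit in this file (and in three_interpolate and three_nn below) replaces MMCV's grid-stride loop macro with the flat per-thread indexing used up to v1.4.0. For reference, a sketch of CUDA_1D_KERNEL_LOOP as it is defined in upstream MMCV's common_cuda_helper.hpp, written from memory rather than quoted from this repository:

// Sketch of the grid-stride loop macro these hunks remove; the exact
// definition lives in mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp.
#define CUDA_1D_KERNEL_LOOP(i, n)                                \
  for (int i = (blockIdx.x * blockDim.x + threadIdx.x); i < (n); \
       i += blockDim.x * gridDim.x)

// With the macro, one launched thread may process several elements
// (i, i + stride, ...), so a modest grid covers any n. With the restored
// v1.4.0 style, each thread handles exactly one index and bails out when
// it is out of range:
//   int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
//   if (pt_idx >= pts_num) return;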
mmcv/ops/csrc/common/cuda/rotated_feature_align_cuda_kernel.cuh (deleted, 100644 → 0)
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/SJTU-Thinklab-Det/r3det-on-mmdetection/blob/master/mmdet/ops/fr/src/feature_refine_kernel.cu
#ifndef ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
#define ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename scalar_t>
__global__ void rotated_feature_align_forward_kernel(
    const int nthreads, const int points, const scalar_t* bottom_data,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* top_data) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;
    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;
    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};
    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];
      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;
      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }
    const scalar_t* offset_bottom_data =
        bottom_data + (n * channels + c) * height * width;
    scalar_t output_val = bottom_data[index];
    for (int i = 0; i < points; i++) {
      output_val += bilinear_interpolate<scalar_t>(offset_bottom_data, height,
                                                   width, py[i], px[i], i);
    }
    top_data[index] = output_val;
  }
}

template <typename scalar_t>
__global__ void rotated_feature_align_backward_kernel(
    const int nthreads, const int points, const scalar_t* top_diff,
    const scalar_t* best_bboxes, const scalar_t spatial_scale,
    const int channels, const int height, const int width,
    scalar_t* bottom_diff) {
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int w = index % width;
    int h = (index / width) % height;
    int c = (index / width / height) % channels;
    int n = index / width / height / channels;
    const scalar_t* bbox_offset =
        best_bboxes + ((n * height + h) * width + w) * 5;
    scalar_t roi_y = bbox_offset[0] * spatial_scale;
    scalar_t roi_x = bbox_offset[1] * spatial_scale;
    scalar_t px[5] = {roi_x, 0, 0, 0, 0};
    scalar_t py[5] = {roi_y, 0, 0, 0, 0};
    if (points > 1) {
      scalar_t roi_w = bbox_offset[2] * spatial_scale;
      scalar_t roi_h = bbox_offset[3] * spatial_scale;
      scalar_t roi_a = bbox_offset[4];
      scalar_t w_2 = roi_w / 2, h_2 = roi_h / 2;
      scalar_t cosa = cosf(roi_a), sina = sinf(roi_a);
      scalar_t wx = cosa * w_2, wy = sina * w_2;
      scalar_t hx = -sina * h_2, hy = cosa * h_2;
      px[1] = roi_x + wx + hx;
      py[1] = roi_y + wy + hy;
      px[2] = roi_x - wx + hx;
      py[2] = roi_y - wy + hy;
      px[3] = roi_x - wx - hx;
      py[3] = roi_y - wy - hy;
      px[4] = roi_x + wx - hx;
      py[4] = roi_y + wy - hy;
    }
    scalar_t* offset_bottom_diff =
        bottom_diff + (n * channels + c) * height * width;
    scalar_t value_top_diff = top_diff[index];
    atomicAdd(bottom_diff + index, value_top_diff);
    for (int i = 0; i < points; i++) {
      scalar_t w1, w2, w3, w4;
      int x_low, x_high, y_low, y_high;
      bilinear_interpolate_gradient<scalar_t>(height, width, py[i], px[i], w1,
                                              w2, w3, w4, x_low, x_high,
                                              y_low, y_high, i);
      scalar_t g1 = value_top_diff * w1;
      scalar_t g2 = value_top_diff * w2;
      scalar_t g3 = value_top_diff * w3;
      scalar_t g4 = value_top_diff * w4;
      if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
        atomicAdd(offset_bottom_diff + y_low * width + x_low, g1);
        atomicAdd(offset_bottom_diff + y_low * width + x_high, g2);
        atomicAdd(offset_bottom_diff + y_high * width + x_low, g3);
        atomicAdd(offset_bottom_diff + y_high * width + x_high, g4);
      }
    }
  }
}
#endif // ROTATED_FEATURE_ALIGN_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/cuda/scatter_points_cuda_kernel.cuh
@@ -34,7 +34,7 @@ __device__ __forceinline__ static void reduceMax(double *address, double val) {
 }

 // get rid of meaningless warnings when compiling host code
-#ifdef MMCV_WITH_HIP
+#ifdef HIP_DIFF
 __device__ __forceinline__ static void reduceAdd(float *address, float val) {
   atomicAdd(address, val);
 }

@@ -86,7 +86,7 @@ __device__ __forceinline__ static void reduceAdd(double *address, double val) {
 #endif
 }
 #endif  // __CUDA_ARCH__
-#endif  // MMCV_WITH_HIP
+#endif  // HIP_DIFF

 template <typename T>
 __global__ void feats_reduce_kernel(
mmcv/ops/csrc/common/cuda/spconv/indice.cuh (deleted, 100644 → 0)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef INDICE_CU_H_
#define INDICE_CU_H_
#include <utils/spconv/spconv/geometry.h>
#include <utils/spconv/tensorview/tensorview.h>
#include <utils/spconv/tensorview/helper_kernel.cuh>
template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data(), validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void prepareDeConvIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<Index> indicesOut,
    tv::TensorView<IndexGrid> gridsOut, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indiceNum, tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index kernelVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    kernelVolume *= kernelSize[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  auto indicePairsDim2 = indicePairs.dim(2);
  Index index;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPosTranspose<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data(), validPoints);
    for (Index i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
      indicePairs(offset, 0, oldNum) = ix;
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      indicePairs(offset, 1, oldNum) = index;
      indicePairUnique[offset * indicePairsDim2 + oldNum] = index;
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignGridAndIndiceOutKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numAct, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int batchSize) {
  Index index;
  auto indicesOutPtr = indicesOut.data();
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    index = indicePairUnique[ix];
    gridsOut[index] = ix;
    index = tv::rowArrayIdxInv<Index, NDim>(
        index, indicesOutPtr + ix * (NDim + 1) + 1, outSpatialShape.data());
    indicesOut[ix * (NDim + 1)] = index % batchSize;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void assignIndicePairsKernel(
    tv::TensorView<Index> indicesOut, tv::TensorView<IndexGrid> gridsOut,
    int numActIn, tv::TensorView<Index> indicePairs,
    tv::TensorView<Index> indicePairUnique,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  Index index;
  int kernelVolume = indicePairs.dim(0);
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    for (int i = 0; i < kernelVolume; ++i) {
      index = indicePairs(i, 1, ix);
      if (index > -1) {
        indicePairs(i, 1, ix) = gridsOut[index];
      }
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void prepareSubMGridKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    index = tv::rowArrayIdx<Index, NDim>(indicesIn.data() + ix * (NDim + 1) + 1,
                                         outSpatialShape.data()) +
            spatialVolume * indicesIn(ix, 0);
    gridsOut[index] = ix;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim,
          int KernelMaxVolume = 256>
__global__ void getSubMIndicePairsKernel(
    tv::TensorView<const Index> indicesIn, tv::TensorView<IndexGrid> gridsOut,
    tv::TensorView<Index> indicePairs, tv::TensorView<Index> indiceNum,
    const tv::SimpleVector<Index, NDim> kernelSize,
    const tv::SimpleVector<Index, NDim> stride,
    const tv::SimpleVector<Index, NDim> padding,
    const tv::SimpleVector<Index, NDim> dilation,
    const tv::SimpleVector<Index, NDim> outSpatialShape) {
  auto numActIn = indicesIn.dim(0);
  Index spatialVolume = 1;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index numValidPoints = 0;
  Index validPoints[KernelMaxVolume * (NDim + 1)];
  Index *pointPtr = nullptr;
  Index index = 0;
  for (int ix : tv::KernelLoopX<int>(numActIn)) {
    numValidPoints = getValidOutPos<Index, NDim>(
        indicesIn.data() + ix * (NDim + 1) + 1, kernelSize.data(),
        stride.data(), padding.data(), dilation.data(),
        outSpatialShape.data(), validPoints);
    for (int i = 0; i < numValidPoints; ++i) {
      pointPtr = validPoints + i * (NDim + 1);
      auto offset = pointPtr[NDim];
      index = tv::rowArrayIdx<Index, NDim>(pointPtr, outSpatialShape.data()) +
              spatialVolume * indicesIn(ix, 0);
      if (gridsOut[index] > -1) {
        auto oldNum = atomicAdd(indiceNum.data() + offset, Index(1));
        indicePairs(offset, 1, oldNum) = gridsOut[index];
        indicePairs(offset, 0, oldNum) = ix;
      }
    }
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridKernel(const Index *indicePairUnique,
                                tv::TensorView<IndexGrid> gridsOut,
                                int numAct) {
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    gridsOut[indicePairUnique[ix]] = -1;
  }
}

template <typename Index, typename IndexGrid, unsigned NDim>
__global__ void resetGridSubMKernel(
    const Index *indices, tv::TensorView<IndexGrid> gridsOut,
    const tv::SimpleVector<Index, NDim> outSpatialShape, int numAct) {
  int outSpatialShapeReg[NDim];
  for (int i = 0; i < NDim; ++i) {
    outSpatialShapeReg[i] = outSpatialShape[i];
  }
  Index spatialVolume = 1;
  auto indsPtr = indices;
#pragma unroll
  for (int i = 0; i < NDim; ++i) {
    spatialVolume *= outSpatialShape[i];
  }
  Index index;
  for (int ix : tv::KernelLoopX<int>(numAct)) {
    indsPtr = indices + ix * (NDim + 1);
    index = tv::rowArrayIdx<Index, NDim>(indsPtr + 1, outSpatialShapeReg);
    gridsOut[index + spatialVolume * indsPtr[0]] = -1;
  }
}
#endif
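These spconv kernels iterate with tv::KernelLoopX / tv::KernelLoopY from the included tensorview helpers. To the best of my knowledge those helpers are range objects implementing a grid-stride loop; a minimal sketch under that assumption (names and details are mine, not from the deleted headers):

// Sketch only: a range-for equivalent of CUDA_1D_KERNEL_LOOP over the x
// dimension. The real tv::KernelLoopX also takes an optional NumILP
// template argument that multiplies the stride for ILP-unrolled loops.
namespace tv_sketch {
struct KernelLoopX {
  int n;  // number of elements to cover
  struct It {
    int i, step;
    __device__ int operator*() const { return i; }
    __device__ It &operator++() { i += step; return *this; }
    __device__ bool operator!=(const It &o) const { return i < o.i; }
  };
  __device__ It begin() const {
    return {static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x),
            static_cast<int>(gridDim.x * blockDim.x)};
  }
  __device__ It end() const { return {n, 0}; }
};
}  // namespace tv_sketch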
mmcv/ops/csrc/common/cuda/spconv/reordering.cuh (deleted, 100644 → 0)
// Copyright 2019 Yan Yan
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef REORDERING_CU_H_
#define REORDERING_CU_H_
#include <utils/spconv/tensorview/helper_kernel.cuh>
template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void gatherGenericKernel(scalar_t *buffer, const scalar_t *features,
                                    const Index *indices, int size,
                                    int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              features[inds[ilp] + iy];
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType>
__global__ void gatherVecKernel(scalar_t *buffer, const scalar_t *features,
                                const Index *indices, int size,
                                int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size)
          reinterpret_cast<VecType *>(
              buffer)[(ix + ILPStrideX[ilp]) * numPlanes + iy] =
              reinterpret_cast<const VecType *>(features)[inds[ilp] + iy];
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void gatherVecBlockKernel(scalar_t *buffer,
                                     const scalar_t *features,
                                     const Index *indices, int size,
                                     int numPlanes) {
  int ILPStrideY[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  features += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      reinterpret_cast<VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x] =
          reinterpret_cast<const VecType *>(
              features)[indices[iy + ILPStrideY[ilp]] * numPlanes +
                        threadIdx.x];
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP>
__global__ void scatterAddGenericKernel(scalar_t *outFeatures,
                                        const scalar_t *buffer,
                                        const Index *indices, int size,
                                        int numPlanes) {
  int ILPStrideX[NumILP];
  Index inds[NumILP];
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideX[ilp] = ilp * gridDim.x * blockDim.x;
  for (int ix : tv::KernelLoopX<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ilp++) {
      if (ix + ILPStrideX[ilp] < size)
        inds[ilp] = indices[ix + ILPStrideX[ilp]] * numPlanes;
    }
    for (int iy : tv::KernelLoopY<int>(numPlanes)) {
#pragma unroll
      for (int ilp = 0; ilp < NumILP; ++ilp) {
        if (ix + ILPStrideX[ilp] < size) {
          outFeatures[inds[ilp] + iy] +=
              buffer[(ix + ILPStrideX[ilp]) * numPlanes + iy];
        }
      }
    }
  }
}

template <typename scalar_t, typename Index, int NumTLP, int NumILP,
          typename VecType = int4>
__global__ void scatterAddVecBlockKernel(scalar_t *outFeatures,
                                         const scalar_t *buffer,
                                         const Index *indices, int size,
                                         int numPlanes) {
  int ILPStrideY[NumILP];
  constexpr int vecloadFactor = sizeof(VecType) / sizeof(scalar_t);
#pragma unroll
  for (int ilp = 0; ilp < NumILP; ilp++)
    ILPStrideY[ilp] = ilp * gridDim.y * blockDim.y;
  outFeatures += blockIdx.x * NumTLP;
  buffer += blockIdx.x * NumTLP;
  scalar_t buf[vecloadFactor];
  scalar_t buf2[vecloadFactor];
  Index idx;
  for (int iy : tv::KernelLoopY<int, NumILP>(size)) {
#pragma unroll
    for (int ilp = 0; ilp < NumILP; ++ilp) {
      idx = indices[iy + ILPStrideY[ilp]] * numPlanes + threadIdx.x;
      reinterpret_cast<VecType *>(buf)[0] =
          reinterpret_cast<VecType *>(outFeatures)[idx];
      reinterpret_cast<VecType *>(buf2)[0] = reinterpret_cast<const VecType *>(
          buffer)[(iy + ILPStrideY[ilp]) * numPlanes + threadIdx.x];
#pragma unroll
      for (int i = 0; i < vecloadFactor; i++) {
        buf[i] += buf2[i];
      }
      reinterpret_cast<VecType *>(outFeatures)[idx] =
          reinterpret_cast<VecType *>(buf)[0];
    }
  }
}
#endif
mmcv/ops/csrc/common/cuda/stack_ball_query_cuda_kernel.cuh (deleted, 100644 → 0)
// Copyright (c) OpenMMLab. All rights reserved
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
#ifndef STACK_BALL_QUERY_CUDA_KERNEL_CUH
#define STACK_BALL_QUERY_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
template <typename T>
__global__ void stack_ball_query_forward_cuda_kernel(
    int B, int M, float radius, int nsample, const T *new_xyz,
    const int *new_xyz_batch_cnt, const T *xyz, const int *xyz_batch_cnt,
    int *idx) {
  // :param xyz: (N1 + N2 ..., 3) xyz coordinates of the features
  // :param xyz_batch_cnt: (batch_size), [N1, N2, ...]
  // :param new_xyz: (M1 + M2 ..., 3) centers of the ball query
  // :param new_xyz_batch_cnt: (batch_size), [M1, M2, ...]
  // output:
  //      idx: (M, nsample)
  const T *cur_xyz = xyz;
  int *cur_idx = idx;
  CUDA_1D_KERNEL_LOOP(pt_idx, M) {
    int bs_idx = 0;
    for (int pt_cnt = 0; bs_idx < B; bs_idx++) {
      pt_cnt += new_xyz_batch_cnt[bs_idx];
      if (pt_idx < pt_cnt) break;
    }
    int xyz_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++) xyz_batch_start_idx += xyz_batch_cnt[k];

    const T *new_xyz_p = new_xyz + pt_idx * 3;
    cur_xyz += xyz_batch_start_idx * 3;
    cur_idx += pt_idx * nsample;

    float radius2 = radius * radius;
    T new_x = new_xyz_p[0];
    T new_y = new_xyz_p[1];
    T new_z = new_xyz_p[2];
    int n = xyz_batch_cnt[bs_idx];

    int cnt = 0;
    for (int k = 0; k < n; ++k) {
      T x = cur_xyz[k * 3 + 0];
      T y = cur_xyz[k * 3 + 1];
      T z = cur_xyz[k * 3 + 2];
      T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
             (new_z - z) * (new_z - z);
      if (d2 < radius2) {
        if (cnt == 0) {
          for (int l = 0; l < nsample; ++l) {
            cur_idx[l] = k;
          }
        }
        cur_idx[cnt] = k;
        ++cnt;
        if (cnt >= nsample) break;
      }
    }
    if (cnt == 0) cur_idx[0] = -1;
  }
}
#endif // STACK_BALL_QUERY_CUDA_KERNEL_CUH
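To make the batch bookkeeping in this deleted kernel concrete: suppose B = 2, new_xyz_batch_cnt = [3, 5], and xyz_batch_cnt = [4, 6]. A query with pt_idx = 4 falls in batch 1 (the running count only exceeds 4 after the second entry is added), so xyz_batch_start_idx becomes 4 and cur_xyz skips the 4 support points of batch 0 before the radius search starts. (The numbers are illustrative, not taken from the commit.)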
mmcv/ops/csrc/common/cuda/stack_group_points_cuda_kernel.cuh (deleted, 100644 → 0)
// Copyright (c) OpenMMLab. All rights reserved.
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#ifndef STACK_GROUP_POINTS_CUDA_KERNEL_CUH
#define STACK_GROUP_POINTS_CUDA_KERNEL_CUH
#ifdef MMCV_USE_PARROTS
#include "parrots_cuda_helper.hpp"
#else
#include "pytorch_cuda_helper.hpp"
#endif
#include <stdio.h>
template <typename T>
__global__ void stack_group_points_forward_cuda_kernel(
    int b, int c, int m, int nsample, const T *features,
    const int *features_batch_cnt, const int *idx, const int *idx_batch_cnt,
    T *out) {
  // :param features: (N1 + N2 ..., C) tensor of features to group
  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the
  // indices of features to group with :param idx: (M1 + M2 ..., nsample) tensor
  // containing the indices of features to group with :param idx_batch_cnt:
  // (batch_size) [M1 + M2 ...] tensor containing the indices of features to
  // group with :return:
  //     output: (M1 + M2, C, nsample) tensor
  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
    const T *cur_features = features;
    const int *cur_idx = idx;
    int sample_idx = index % nsample;
    int c_idx = (index / nsample) % c;
    int pt_idx = (index / nsample / c);

    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;

    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b; k++) {
      if (pt_idx < pt_cnt) break;
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    int features_batch_start_idx = 0;
    int features_batch_end_idx = features_batch_cnt[0];
    for (int k = 0; k < bs_idx; k++) {
      features_batch_start_idx += features_batch_cnt[k];
      features_batch_end_idx =
          features_batch_start_idx + features_batch_cnt[k + 1];
    }
    cur_features += features_batch_start_idx * c;

    cur_idx += pt_idx * nsample + sample_idx;
    int in_idx = cur_idx[0] * c + c_idx;
    int out_idx = pt_idx * c * nsample + c_idx * nsample + sample_idx;
    if (in_idx < features_batch_end_idx * c) {
      out[out_idx] = cur_features[in_idx];
    }
  }
}

template <typename T>
__global__ void stack_group_points_backward_cuda_kernel(
    int b, int c, int m, int n, int nsample, const T *grad_out, const int *idx,
    const int *idx_batch_cnt, const int *features_batch_cnt,
    T *grad_features) {
  // :param grad_out: (M1 + M2 ..., C, nsample) tensor of the gradients of the
  // output from forward :param idx: (M1 + M2 ..., nsample) tensor containing
  // the indices of features to group with :param idx_batch_cnt: (batch_size)
  // [M1 + M2 ...] tensor containing the indices of features to group with
  // :param features_batch_cnt: (batch_size) [N1 + N2 ...] tensor containing the
  // indices of features to group with :return:
  //     grad_features: (N1 + N2 ..., C) gradient of the features
  CUDA_1D_KERNEL_LOOP(index, m * c * nsample) {
    const T *cur_grad_out = grad_out;
    const int *cur_idx = idx;
    T *cur_grad_features = grad_features;
    int sample_idx = index % nsample;
    int c_idx = (index / nsample) % c;
    int pt_idx = (index / nsample / c);

    if (pt_idx >= m || c_idx >= c || sample_idx >= nsample) return;

    int bs_idx = 0, pt_cnt = idx_batch_cnt[0];
    for (int k = 1; k < b; k++) {
      if (pt_idx < pt_cnt) break;
      pt_cnt += idx_batch_cnt[k];
      bs_idx = k;
    }

    int features_batch_start_idx = 0;
    for (int k = 0; k < bs_idx; k++)
      features_batch_start_idx += features_batch_cnt[k];

    cur_grad_out += pt_idx * c * nsample + c_idx * nsample + sample_idx;
    cur_idx += pt_idx * nsample + sample_idx;
    cur_grad_features +=
        (features_batch_start_idx + cur_idx[0]) * c + c_idx;

    atomicAdd(cur_grad_features, cur_grad_out[0]);
  }
}
#endif // GROUP_POINTS_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/cuda/three_interpolate_cuda_kernel.cuh
@@ -20,17 +20,17 @@ __global__ void three_interpolate_forward_cuda_kernel(
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
-    if (bs_idx >= b || c_idx >= c) return;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

-    weight += bs_idx * n * 3 + pt_idx * 3;
-    points += bs_idx * c * m + c_idx * m;
-    idx += bs_idx * n * 3 + pt_idx * 3;
-    out += bs_idx * c * n + c_idx * n;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

-    out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
-                  weight[2] * points[idx[2]];
-  }
+  weight += bs_idx * n * 3 + pt_idx * 3;
+  points += bs_idx * c * m + c_idx * m;
+  idx += bs_idx * n * 3 + pt_idx * 3;
+  out += bs_idx * c * n + c_idx * n;
+
+  out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
+                weight[2] * points[idx[2]];
 }

 template <typename T>
@@ -44,18 +44,18 @@ __global__ void three_interpolate_backward_cuda_kernel(
   int bs_idx = blockIdx.z;
   int c_idx = blockIdx.y;
-  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
-    if (bs_idx >= b || c_idx >= c) return;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;

-    grad_out += bs_idx * c * n + c_idx * n + pt_idx;
-    weight += bs_idx * n * 3 + pt_idx * 3;
-    grad_points += bs_idx * c * m + c_idx * m;
-    idx += bs_idx * n * 3 + pt_idx * 3;
+  if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;

-    atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
-    atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
-    atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
-  }
+  grad_out += bs_idx * c * n + c_idx * n + pt_idx;
+  weight += bs_idx * n * 3 + pt_idx * 3;
+  grad_points += bs_idx * c * m + c_idx * m;
+  idx += bs_idx * n * 3 + pt_idx * 3;
+
+  atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
+  atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
+  atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
 }
#endif // THREE_INTERPOLATE_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/cuda/three_nn_cuda_kernel.cuh
@@ -19,49 +19,48 @@ __global__ void three_nn_forward_cuda_kernel(int b, int n, int m,
   // idx: (B, N, 3)

   int bs_idx = blockIdx.y;
-  CUDA_1D_KERNEL_LOOP(pt_idx, n) {
-    if (bs_idx >= b) return;
+  int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (bs_idx >= b || pt_idx >= n) return;

-    unknown += bs_idx * n * 3 + pt_idx * 3;
-    known += bs_idx * m * 3;
-    dist2 += bs_idx * n * 3 + pt_idx * 3;
-    idx += bs_idx * n * 3 + pt_idx * 3;
+  unknown += bs_idx * n * 3 + pt_idx * 3;
+  known += bs_idx * m * 3;
+  dist2 += bs_idx * n * 3 + pt_idx * 3;
+  idx += bs_idx * n * 3 + pt_idx * 3;

-    T ux = unknown[0];
-    T uy = unknown[1];
-    T uz = unknown[2];
+  T ux = unknown[0];
+  T uy = unknown[1];
+  T uz = unknown[2];

-    double best1 = 1e40, best2 = 1e40, best3 = 1e40;
-    int besti1 = 0, besti2 = 0, besti3 = 0;
-    for (int k = 0; k < m; ++k) {
-      T x = known[k * 3 + 0];
-      T y = known[k * 3 + 1];
-      T z = known[k * 3 + 2];
-      T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
-      if (d < best1) {
-        best3 = best2;
-        besti3 = besti2;
-        best2 = best1;
-        besti2 = besti1;
-        best1 = d;
-        besti1 = k;
-      } else if (d < best2) {
-        best3 = best2;
-        besti3 = besti2;
-        best2 = d;
-        besti2 = k;
-      } else if (d < best3) {
-        best3 = d;
-        besti3 = k;
-      }
-    }
+  double best1 = 1e40, best2 = 1e40, best3 = 1e40;
+  int besti1 = 0, besti2 = 0, besti3 = 0;
+  for (int k = 0; k < m; ++k) {
+    T x = known[k * 3 + 0];
+    T y = known[k * 3 + 1];
+    T z = known[k * 3 + 2];
+    T d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
+    if (d < best1) {
+      best3 = best2;
+      besti3 = besti2;
+      best2 = best1;
+      besti2 = besti1;
+      best1 = d;
+      besti1 = k;
+    } else if (d < best2) {
+      best3 = best2;
+      besti3 = besti2;
+      best2 = d;
+      besti2 = k;
+    } else if (d < best3) {
+      best3 = d;
+      besti3 = k;
+    }
+  }
-    dist2[0] = best1;
-    dist2[1] = best2;
-    dist2[2] = best3;
-    idx[0] = besti1;
-    idx[1] = besti2;
-    idx[2] = besti3;
-  }
+  dist2[0] = best1;
+  dist2[1] = best2;
+  dist2[2] = best3;
+  idx[0] = besti1;
+  idx[1] = besti2;
+  idx[2] = besti3;
 }
#endif // THREE_NN_CUDA_KERNEL_CUH
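The restored per-thread kernels only work if the host launches a matching 2-D grid: points over x, batches over blockIdx.y. A hedged sketch of such a launch wrapper follows; the DIVUP rounding and THREADS_PER_BLOCK constant mirror MMCV's usual helpers but are written out here as assumptions, and the kernel's parameter order is inferred from its body above:

#include <cuda_runtime.h>

// Hypothetical host wrapper, not taken from this commit.
constexpr int THREADS_PER_BLOCK = 256;
inline int DIVUP(int a, int b) { return (a + b - 1) / b; }

void launch_three_nn(int b, int n, int m, const float *unknown,
                     const float *known, float *dist2, int *idx,
                     cudaStream_t stream) {
  // One thread per query point (x), one grid slice per batch (y); the
  // kernel's `if (bs_idx >= b || pt_idx >= n) return;` guards the rest.
  dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), b);
  dim3 threads(THREADS_PER_BLOCK);
  three_nn_forward_cuda_kernel<float>
      <<<blocks, threads, 0, stream>>>(b, n, m, unknown, known, dist2, idx);
}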
mmcv/ops/csrc/common/cuda/voxelization_cuda_kernel.cuh
@@ -23,20 +23,20 @@ __global__ void dynamic_voxelize_kernel(
     // To save some computation
     auto points_offset = points + index * num_features;
     auto coors_offset = coors + index * NDim;
-    int c_x = floorf((points_offset[0] - coors_x_min) / voxel_x);
+    int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
     if (c_x < 0 || c_x >= grid_x) {
       coors_offset[0] = -1;
       continue;
     }

-    int c_y = floorf((points_offset[1] - coors_y_min) / voxel_y);
+    int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
     if (c_y < 0 || c_y >= grid_y) {
       coors_offset[0] = -1;
       coors_offset[1] = -1;
       continue;
     }

-    int c_z = floorf((points_offset[2] - coors_z_min) / voxel_z);
+    int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
     if (c_z < 0 || c_z >= grid_z) {
       coors_offset[0] = -1;
       coors_offset[1] = -1;
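The floorf-to-floor change matters because this kernel is templated over the point type: floorf is the float-only libm function, while floor has device overloads for float and double, so double inputs keep double precision. A minimal illustration with assumed names, not from this commit:

// floorf((double)x) would silently round through float precision;
// the overloaded ::floor keeps double inputs in double.
template <typename T>
__device__ int voxel_coord(T p, T origin, T voxel_size) {
  return static_cast<int>(floor((p - origin) / voxel_size));
}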
@@ -101,7 +101,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
   CUDA_1D_KERNEL_LOOP(index, num_points) {
     auto coor_offset = coor + index * NDim;
     // skip invalid points
-    if (coor_offset[0] == -1) continue;
+    if ((index >= num_points) || (coor_offset[0] == -1)) return;

     int num = 0;
     int coor_x = coor_offset[0];
@@ -122,7 +122,7 @@ __global__ void point_to_voxelidx_kernel(const T_int* coor,
         point_to_pointidx[index] = i;
       } else if (num >= max_points) {
         // out of boundary
-        break;
+        return;
       }
     }
   }
@@ -166,51 +166,4 @@ __global__ void determin_voxel_num(
     }
   }
 }
-
-__global__ void nondeterministic_get_assign_pos(
-    const int nthreads, const int32_t *coors_map, int32_t *pts_id,
-    int32_t *coors_count, int32_t *reduce_count, int32_t *coors_order) {
-  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
-    int coors_idx = coors_map[thread_idx];
-    if (coors_idx > -1) {
-      int32_t coors_pts_pos = atomicAdd(&reduce_count[coors_idx], 1);
-      pts_id[thread_idx] = coors_pts_pos;
-      if (coors_pts_pos == 0) {
-        coors_order[coors_idx] = atomicAdd(coors_count, 1);
-      }
-    }
-  }
-}
-
-template <typename T>
-__global__ void nondeterministic_assign_point_voxel(
-    const int nthreads, const T *points, const int32_t *coors_map,
-    const int32_t *pts_id, const int32_t *coors_in,
-    const int32_t *reduce_count, const int32_t *coors_order, T *voxels,
-    int32_t *coors, int32_t *pts_count, const int max_voxels,
-    const int max_points, const int num_features, const int NDim) {
-  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
-    int coors_idx = coors_map[thread_idx];
-    int coors_pts_pos = pts_id[thread_idx];
-    if (coors_idx > -1 && coors_pts_pos < max_points) {
-      int coors_pos = coors_order[coors_idx];
-      if (coors_pos < max_voxels) {
-        auto voxels_offset =
-            voxels + (coors_pos * max_points + coors_pts_pos) * num_features;
-        auto points_offset = points + thread_idx * num_features;
-        for (int k = 0; k < num_features; k++) {
-          voxels_offset[k] = points_offset[k];
-        }
-        if (coors_pts_pos == 0) {
-          pts_count[coors_pos] = min(reduce_count[coors_idx], max_points);
-          auto coors_offset = coors + coors_pos * NDim;
-          auto coors_in_offset = coors_in + coors_idx * NDim;
-          for (int k = 0; k < NDim; k++) {
-            coors_offset[k] = coors_in_offset[k];
-          }
-        }
-      }
-    }
-  }
-}
#endif // VOXELIZATION_CUDA_KERNEL_CUH
mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu (deleted, 100644 → 0)
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define COORD_NUM 4
__nram__ char nmem_buf[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void computeDiv(void *nram_dst, void *nram_src0, void *nram_src1,
void *nram_addition, const int32_t deal_num) {
__bang_active_reciphp((T *)nram_dst, (T *)nram_src1, deal_num);
__bang_mul((T *)nram_dst, (T *)nram_src0, (T *)nram_dst, deal_num);
}
template <>
__mlu_func__ void computeDiv<half>(void *nram_dst, void *nram_src0,
void *nram_src1, void *nram_addition,
const int32_t deal_num) {
__bang_half2float((float *)nram_addition, (half *)nram_src1, deal_num);
__bang_active_reciphp((float *)nram_addition, (float *)nram_addition,
deal_num);
__bang_float2half_rd((half *)nram_src1, (float *)nram_addition, deal_num);
__bang_mul((half *)nram_dst, (half *)nram_src0, (half *)nram_src1, deal_num);
}
template <typename T>
__mlu_func__ void bboxOverlapsWorkflow(
T *vec_b1_x1, T *vec_b1_y1, T *vec_b1_x2, T *vec_b1_y2, T *vec_b2_x1,
T *vec_b2_y1, T *vec_b2_x2, T *vec_b2_y2, T *vec_left, T *vec_right,
T *vec_top, T *vec_bottom, const T *bbox1, const T *bbox2, void *ious,
const int32_t offset, const int32_t mode, const int32_t batches_stride,
const int32_t num_bbox1, const int32_t num_bbox2, const bool aligned) {
int32_t task_batch_stride = (num_bbox1 + taskDim - 1) / taskDim;
int32_t batch_start = taskId * task_batch_stride;
int32_t batch_per_task = batch_start + task_batch_stride < num_bbox1
? task_batch_stride
: num_bbox1 - batch_start;
batch_per_task = batch_per_task > 0 ? batch_per_task : (0);
if (aligned) {
int32_t num_loop_cpy = batch_per_task / batches_stride;
int32_t num_rem_cpy_batches = batch_per_task % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < num_loop_cpy; i++) {
int32_t index = batch_start + i * batches_stride;
int32_t handle_batches = index + batches_stride > num_bbox1
? num_rem_cpy_batches
: batches_stride;
int32_t b1 = index;
int32_t b2 = index;
int32_t base1 = b1 * COORD_NUM;
__memcpy(vec_b1_x1, &bbox1[base1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y1, &bbox1[base1 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_x2, &bbox1[base1 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b1_y2, &bbox1[base1 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
int32_t base2 = b2 * COORD_NUM;
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, 0.f);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
__memcpy((T *)ious + index, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
} else {
int32_t num_loop_cpy = num_bbox2 / batches_stride;
int32_t num_rem_cpy_batches = num_bbox2 % batches_stride;
num_loop_cpy = num_rem_cpy_batches > 0 ? num_loop_cpy + 1 : num_loop_cpy;
for (int32_t i = 0; i < batch_per_task; i++) {
int32_t index1 = batch_start + i;
int32_t b1 = index1;
int32_t base1 = b1 * COORD_NUM;
// set bbox1 and bbox2 to nram
__bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
__bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
__bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
__bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
for (int32_t j = 0; j < num_loop_cpy; j++) {
int32_t index2 = j * batches_stride;
int32_t handle_batches = index2 + batches_stride > num_bbox2
? num_rem_cpy_batches
: batches_stride;
int32_t b2 = index2;
int32_t base2 = b2 * COORD_NUM;
// copy bbox2 to nram
__memcpy(vec_b2_x1, &bbox2[base2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y1, &bbox2[base2 + 1], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_x2, &bbox2[base2 + 2], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
__memcpy(vec_b2_y2, &bbox2[base2 + 3], sizeof(T), GDRAM2NRAM, sizeof(T),
COORD_NUM * sizeof(T), handle_batches - 1);
// get the width and height
__bang_maxequal(vec_left, vec_b1_x1, vec_b2_x1, batches_stride);
__bang_minequal(vec_right, vec_b1_x2, vec_b2_x2, batches_stride);
__bang_maxequal(vec_top, vec_b1_y1, vec_b2_y1, batches_stride);
__bang_minequal(vec_bottom, vec_b1_y2, vec_b2_y2, batches_stride);
// right - left + offset ---> left
__bang_sub(vec_left, vec_right, vec_left, batches_stride);
__bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
// bottom - top + offset ---> right
__bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
__bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);
// zero vector ---> bottom
__bang_write_value(vec_bottom, batches_stride, (T)0);
// width --> vec_left
__bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
T *width = vec_left;
// height --> vec_right
__bang_maxequal(vec_right, vec_bottom, vec_right, batches_stride);
T *height = vec_right;
// get the b1_area
// (b1_x2 - b1_x1 + offset) ---> vec_top
__bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
__bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
// (b1_y2 - b1_y1 + offset) ---> vec_bottom
__bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
__bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
// b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
// ---> vec_top;
__bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
T *b1_area = vec_top;
// get the b2_area
// (b2_x2 - b2_x1 + offset) ---> b2_x1
__bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
__bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
// (b2_y2 - b2_y1 + offset) ---> b2_y1
__bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
__bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
// b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
// ---> b2_x1;
__bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
T *b2_area = vec_b2_x1;
// inter_s = width * height
__bang_mul(height, width, height, batches_stride);
T *inter_s = height;
// offset vector ---> vec_b2_y1
__bang_write_value(vec_b2_y1, batches_stride, T(offset));
T *vec_offset = vec_b2_y1;
if (mode == 0) {
__bang_add(b1_area, b1_area, b2_area, batches_stride);
__bang_sub(b1_area, b1_area, inter_s, batches_stride);
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
} else {
__bang_maxequal(b1_area, vec_offset, b1_area, batches_stride);
}
T *base_s = b1_area;
// ious = inter_s / base_s
computeDiv<T>(width, inter_s, base_s, vec_b2_x2, batches_stride);
int32_t gdram_offset = index1 * num_bbox2 + index2;
__memcpy((T *)ious + gdram_offset, width, handle_batches * sizeof(T),
NRAM2GDRAM);
}
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelBBoxOverlaps(
const void *bbox1, const void *bbox2, void *ious, const int32_t num_bbox1,
const int32_t num_bbox2, const int32_t mode, const bool aligned,
const int32_t offset) {
/*
* NRAM partition
* |-------------------------------------------------------------|
* | vec_b1_x1 | vec_b1_y1 | vec_b1_x2 | vec_b1_y2 |
* |-------------------------------------------------------------|
* | vec_b2_x1 | vec_b2_y1 | vec_b2_x2 | vec_b2_y2 |
* |-------------------------------------------------------------|
* | vec_left | vec_right | vec_top | vec_bottom |
* |-------------------------------------------------------------|
*
*/
const int32_t align_bytes = PAD_DOWN(MAX_NRAM_SIZE, NFU_ALIGN_SIZE);
const int32_t split_nram_num = 12;
const int32_t nram_stride =
align_bytes / NFU_ALIGN_SIZE / split_nram_num * NFU_ALIGN_SIZE;
void *vec_b1_x1 = nmem_buf;
void *vec_b1_y1 = nmem_buf + nram_stride;
void *vec_b1_x2 = nmem_buf + 2 * nram_stride;
void *vec_b1_y2 = nmem_buf + 3 * nram_stride;
void *vec_b2_x1 = nmem_buf + 4 * nram_stride;
void *vec_b2_y1 = nmem_buf + 5 * nram_stride;
void *vec_b2_x2 = nmem_buf + 6 * nram_stride;
void *vec_b2_y2 = nmem_buf + 7 * nram_stride;
void *vec_left = nmem_buf + 8 * nram_stride;
void *vec_right = nmem_buf + 9 * nram_stride;
void *vec_top = nmem_buf + 10 * nram_stride;
void *vec_bottom = nmem_buf + 11 * nram_stride;
const int32_t vec_length = nram_stride / sizeof(T);
bboxOverlapsWorkflow((T *)vec_b1_x1, (T *)vec_b1_y1, (T *)vec_b1_x2,
(T *)vec_b1_y2, (T *)vec_b2_x1, (T *)vec_b2_y1,
(T *)vec_b2_x2, (T *)vec_b2_y2, (T *)vec_left,
(T *)vec_right, (T *)vec_top, (T *)vec_bottom,
(T *)bbox1, (T *)bbox2, (T *)ious, offset, mode,
vec_length, num_bbox1, num_bbox2, aligned);
}
void KernelBBoxOverlaps(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *bbox1, const void *bbox2, void *ious,
const int32_t num_bbox1, const int32_t num_bbox2,
const int32_t mode, const bool aligned,
const int32_t offset) {
if (d_type == CNRT_FLOAT16) {
MLUUnion1KernelBBoxOverlaps<half><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
} else {
MLUUnion1KernelBBoxOverlaps<float><<<k_dim, k_type, queue>>>(
bbox1, bbox2, ious, num_bbox1, num_bbox2, mode, aligned, offset);
}
}
mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu (deleted, 100644 → 0)
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "carafe_utils.hpp"
#include "common_mlu_helper.hpp"
#define INDEX3(n, h, w, c, strN, strH, strW) \
(strN) * (n) + (strH) * (h) + (strW) * (w) + (c)
#define NRAM_BLOCK PAD_DOWN(MAX_NRAM_SIZE / 5, NRAM_ALIGN_SIZE)
__nram__ char nram_buf[MAX_NRAM_SIZE];
namespace forward {
struct BlockId {
int Ho;
int Wo;
int G;
int Cg;
int Kh;
int Kw;
int Hi;
int Wi;
};
// start indices of block
struct BlockStart {
int Ho;
int Wo;
int G;
int Cg;
int Kh;
int Kw;
int Hi;
int Wi;
int C;
};
struct BlockEnd {
int Ho;
int Wo;
int Kh;
int Kw;
int Hi;
int Wi;
};
struct BlockSize {
int Ho;
int Wo;
int G;
int Cg;
int Kh;
int Kw;
int Hi;
int Wi;
};
template <typename T>
__mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
const CarafeForwardParam param,
const CarafeForwardBlockDim block_dim,
const CarafeForwardGridDim grid_dim,
T *output) {
// data block info
BlockId blkId;
BlockStart blkStart;
BlockEnd blkEnd;
BlockSize blkSize;
// set pointers on NRAM arrays
// input_nram[blkDim_(Hi+Kh)-1, blkDim_(Wi+Kw)-1, blkDim_(G*Cg)]
T *input_nram = (T *)nram_buf;
// mask_nram[blkDim_Ho, blkDim_Wo, blkDim_(G*Kh*Kw)]
T *mask_nram = input_nram + param.input_nram_size;
// output_nram[blkDim_Ho, blkDim_Wo, blkDim_(G*Cg)]
T *output_nram = mask_nram + param.mask_nram_size;
// sum_array[blkDim_(G*Cg)]
T *sum_array = output_nram + param.output_nram_size;
/* ===== loop over N, grid_dim(Ho,Wo,G,Cg)
* iterations are distributed over computing cores
*/
for (int loop_index = taskId; loop_index < param.job_num;
loop_index += taskDim) {
// block idx
blkId.Cg = loop_index;
blkId.G = blkId.Cg / grid_dim.Cg;
blkId.Wo = blkId.G / grid_dim.G;
blkId.Ho = blkId.Wo / grid_dim.Wo;
int sample_idx = blkId.Ho / grid_dim.Ho;
blkId.Cg %= grid_dim.Cg;
blkId.G %= grid_dim.G;
blkId.Wo %= grid_dim.Wo;
blkId.Ho %= grid_dim.Ho;
// block starting indices
blkStart.Ho = blkId.Ho * block_dim.Ho;
blkStart.Wo = blkId.Wo * block_dim.Wo;
blkStart.G = blkId.G * block_dim.G;
blkStart.Cg = blkId.Cg * block_dim.Cg;
blkStart.C = blkStart.G * param.Cg + blkStart.Cg;
// block size
blkSize.Ho = block_dim.Ho;
blkSize.Wo = block_dim.Wo;
blkSize.G = block_dim.G;
blkSize.Cg = block_dim.Cg;
// take care of blocks near the end of each dimension
if (blkId.Ho == (grid_dim.Ho - 1)) {
blkSize.Ho = param.Ho - (grid_dim.Ho - 1) * block_dim.Ho;
}
if (blkId.Wo == (grid_dim.Wo - 1)) {
blkSize.Wo = param.Wo - (grid_dim.Wo - 1) * block_dim.Wo;
}
if (blkId.G == (grid_dim.G - 1)) {
blkSize.G = param.group_size - (grid_dim.G - 1) * block_dim.G;
}
if (blkId.Cg == (grid_dim.Cg - 1)) {
blkSize.Cg = param.Cg - (grid_dim.Cg - 1) * block_dim.Cg;
}
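    // Example of the tail clamping above (editor's illustration): if
    // param.Ho == 10 and block_dim.Ho == 4 (so grid_dim.Ho == 3), the last
    // block only covers output rows 8..9, hence
    //   blkSize.Ho = 10 - (3 - 1) * 4 = 2.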
// block end indices
blkEnd.Ho = blkStart.Ho + blkSize.Ho - 1;
blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;
// set output_nram to zero
__bang_write_value(output_nram, param.output_nram_size, T(0));
// loop blocks of kernel window: grid_dim.(Kh, Kw)
for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
blkStart.Kh = blkId.Kh * block_dim.Kh;
blkSize.Kh = block_dim.Kh;
if (blkId.Kh == (grid_dim.Kh - 1)) {
blkSize.Kh = param.kernel_size - (grid_dim.Kh - 1) * block_dim.Kh;
}
blkEnd.Kh = blkStart.Kh + blkSize.Kh - 1;
blkStart.Hi = blkStart.Ho / param.scale_factor - param.kernel_size_half +
blkStart.Kh;
blkEnd.Hi =
blkEnd.Ho / param.scale_factor - param.kernel_size_half + blkEnd.Kh;
blkSize.Hi = blkEnd.Hi - blkStart.Hi + 1;
for (blkId.Kw = 0; blkId.Kw < grid_dim.Kw; ++blkId.Kw) {
blkStart.Kw = blkId.Kw * block_dim.Kw;
blkSize.Kw = block_dim.Kw;
if (blkId.Kw == (grid_dim.Kw - 1)) {
blkSize.Kw = param.kernel_size - (grid_dim.Kw - 1) * block_dim.Kw;
}
blkEnd.Kw = blkStart.Kw + blkSize.Kw - 1;
blkStart.Wi = blkStart.Wo / param.scale_factor -
param.kernel_size_half + blkStart.Kw;
blkEnd.Wi =
blkEnd.Wo / param.scale_factor - param.kernel_size_half + blkEnd.Kw;
blkSize.Wi = blkEnd.Wi - blkStart.Wi + 1;
// load input block from gdram2nram
//
// input_nram[ | input[ sample_idx,
// 0:blkSize.Hi-1, | blkStart.Hi + 0:blkSize.Hi-1,
// 0:blkSize.Wi-1, | blkStart.Wi + 0:blkSize.Wi-1,
// 0:blkSize.G-1 | blkStart.G + 0:blkSize.G-1
// 0:blkSize.Cg-1] | blkStart.Cg + 0:blkSize.Cg-1]
//
// To skip out of bound indices:
//
// input_nram[
// hi_start_local:hi_end_local,
// wi_start_local:wi_end_local, ...]
// = input[n,
// hi_start_global:hi_end_global,
// wi_start_global:wi_end_global, ...]
//
int hi_start_local = 0;
int hi_start_global = blkStart.Hi;
if (blkStart.Hi < 0) {
hi_start_local = -blkStart.Hi;
hi_start_global = 0;
}
int wi_start_local = 0;
int wi_start_global = blkStart.Wi;
if (blkStart.Wi < 0) {
wi_start_local = -blkStart.Wi;
wi_start_global = 0;
}
int hi_end_local = blkSize.Hi - 1;
int hi_end_global = blkEnd.Hi;
if (blkEnd.Hi > param.Hi - 1) {
hi_end_global = param.Hi - 1;
hi_end_local -= blkEnd.Hi - hi_end_global;
}
int wi_end_local = blkSize.Wi - 1;
int wi_end_global = blkEnd.Wi;
if (blkEnd.Wi > param.Wi - 1) {
wi_end_global = param.Wi - 1;
wi_end_local -= blkEnd.Wi - wi_end_global;
}
int dst_offset = param.input_nram_stride_h * hi_start_local +
param.input_nram_stride_w * wi_start_local;
T *dst = input_nram + dst_offset;
int src_offset = INDEX3(sample_idx, hi_start_global, wi_start_global,
blkStart.C, param.input_stride_n,
param.input_stride_h, param.input_stride_w);
T *src = input + src_offset;
int input_seg_num_h = hi_end_local - hi_start_local + 1;
int input_seg_num_w = wi_end_local - wi_start_local + 1;
for (int i = 0; i < input_seg_num_h; ++i) {
loadStr3D(dst, src, blkSize.Cg, blkSize.G, input_seg_num_w,
param.input_nram_stride_g, param.input_nram_stride_w,
param.input_stride_g, param.input_stride_w);
dst += param.input_nram_stride_h;
src += param.input_stride_h;
}
/* load mask block from gdram2nram
*
* mask_nram[ | mask[sample_idx,
* 0:blkSize.Ho-1 , | blkStart.Ho + 0:blkSize.Ho-1,
* 0:blkSize.Wo-1, | blkStart.Wo + 0:blkSize.Wo-1,
* 0:blkSize.G-1, | blkStart.G + 0:blkSize.G-1,
* 0:blkSize.Kh-1, | blkStart.Kh + 0:blkSize.Kh-1,
* 0:blkSize.Kw-1] | blkStart.Kw + 0:blkSize.Kw-1]
*/
src_offset = INDEX3(blkStart.Wo, blkStart.G, blkStart.Kh, blkStart.Kw,
param.mask_stride_w, param.mask_stride_g,
param.mask_stride_kh);
src_offset += sample_idx * param.mask_stride_n +
blkStart.Ho * param.mask_stride_h;
for (int ho = 0; ho < blkSize.Ho; ++ho) {
dst = mask_nram + ho * param.mask_nram_stride_h;
src = mask + src_offset + ho * param.mask_stride_h;
for (int wo = 0; wo < blkSize.Wo; ++wo) {
loadStr3D(dst, src, blkSize.Kw, blkSize.Kh, blkSize.G,
param.mask_nram_stride_kh, param.mask_nram_stride_g,
param.mask_stride_kh, param.mask_stride_g);
dst += param.mask_nram_stride_w;
src += param.mask_stride_w;
}
}
// loop each pixel of the output block
for (int ho = 0; ho < blkSize.Ho; ++ho) {
int kernel_hi_start_global = (blkStart.Ho + ho) / param.scale_factor -
param.kernel_size_half + blkStart.Kh;
int kernel_hi_start_local = kernel_hi_start_global - blkStart.Hi;
// int kernel_hi_end_global = kernel_hi_start_global + blkSize.Kh - 1;
// int kernel_hi_end_local = kernel_hi_end_global - blkStart.Hi;
// exclude out of bound indices which should be ignored
int kh_min = hi_start_local - kernel_hi_start_local > 0
? hi_start_local - kernel_hi_start_local
: 0;
int kh_max = hi_end_local - kernel_hi_start_local < blkSize.Kh - 1
? hi_end_local - kernel_hi_start_local
: blkSize.Kh - 1;
for (int wo = 0; wo < blkSize.Wo; ++wo) {
int kernel_wi_start_global =
(blkStart.Wo + wo) / param.scale_factor -
param.kernel_size_half + blkStart.Kw;
int kernel_wi_start_local = kernel_wi_start_global - blkStart.Wi;
// exclude out-of-bound indices, which should be ignored
int kw_min = wi_start_local - kernel_wi_start_local > 0
? wi_start_local - kernel_wi_start_local
: 0;
int kw_max = wi_end_local - kernel_wi_start_local < blkSize.Kw - 1
? wi_end_local - kernel_wi_start_local
: blkSize.Kw - 1;
// output_nram[ho, wo, g, c] = sum(mask_nram[ho, wo, g, kh, kw]
// * input_nram[hi+kh, wi+kw, g, c],
// for (kh,kw) in [0:blkSize.Kw-1] x [0:blkSize.Kh-1])
//
// sum(mask_nram[ho, wo, g, kh, kw]
// * input_nram[hi+kh, wi+kw, g, c], (kh,kw))
//
T *mask_array = mask_nram + param.mask_nram_stride_h * ho +
param.mask_nram_stride_w * wo;
for (int kh = kh_min; kh <= kh_max; ++kh) {
for (int kw = kw_min; kw <= kw_max; ++kw) {
T *src =
input_nram +
param.input_nram_stride_h * (kernel_hi_start_local + kh) +
param.input_nram_stride_w * (kernel_wi_start_local + kw);
int mask_index = param.mask_nram_stride_kh * kh + kw;
// multiply mask weight with channels for each channel group
T *sum = sum_array;
for (int g = 0; g < blkSize.G; ++g) {
__bang_mul_scalar(sum, src, mask_array[mask_index],
param.block_Cg_NFU);
//
// NOTE: Since block_Cg_NFU >= block_Cg_stride,
// overlapped writing may occur on sum_array.
// So this loop must be executed in order to
// avoid data contamination, as shown below.
//
// |-----block_Cg_NFU---------|
// xxxxxxxxxxxxxxxxxxxxyyyzzzzz------------
// |---block_Cg_stride---|^^^^^will be overwritten
// in the next iteration.
//
// x: actual data used, y: not used, z: overwritten
//
sum += param.input_nram_stride_g;
src += param.input_nram_stride_g;
mask_index += param.mask_nram_stride_g;
} // loop blk_G
// add array[blk_G * blk_C] to output_nram
dst = output_nram + param.output_nram_stride_h * ho +
param.output_nram_stride_w * wo;
__bang_add(dst, dst, sum_array, param.output_nram_stride_w);
} // end loop blk_Kw
} // end loop blk_Kh
} // end loop blk_Wo
} // end loop blk_Ho
} // end loop grid_dim.Kw
} // end loop grid_dim.Kh
/* write output from nram2gdram
*
* output_nram[ | output[sample_idx,
* 0:blkSize.Ho-1, | blkStart.Ho + 0:blkSize.Ho-1,
* 0:blkSize.Wo-1, | blkStart.Wo + 0:blkSize.Wo-1,
* 0:blkSize.G-1, | blkStart.G + 0:blkSize.G-1,
* 0:blkSize.Cg-1] | blkStart.Cg + 0:blkSize.Cg-1]
*/
int dst_offset = INDEX3(sample_idx, blkStart.Ho, blkStart.Wo, blkStart.C,
param.output_stride_n, param.output_stride_h,
param.output_stride_w);
T *dst = output + dst_offset;
T *src = output_nram;
for (int i = 0; i < blkSize.Ho; ++i) {
storeStr3D(dst, src, blkSize.Cg, blkSize.G, blkSize.Wo,
param.output_stride_g, param.output_stride_w,
param.output_nram_stride_g, param.output_nram_stride_w);
dst += param.output_stride_h;
src += param.output_nram_stride_h;
}
} // end loop N, grid_dim.(Hi,Wi,G,Cg)
}
template <typename T>
__mlu_global__ void MLUBLOCKKernelCarafeForward(
const void *input, const void *mask, const CarafeForwardParam param,
const CarafeForwardBlockDim block_dim, const CarafeForwardGridDim grid_dim,
void *output) {
carafeForwardBLOCK((T *)input, (T *)mask, param, block_dim, grid_dim,
(T *)output);
}
} // namespace forward
namespace backward {
template <typename T>
__mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
T *grad_input, T *grad_mask, const int n,
const int hi, const int wi, const int c,
const int k_up, const int group,
const int scale) {
char *input_buff = nram_buf;
char *mask_buff = input_buff + NRAM_BLOCK;
char *grad_input_buff = mask_buff + NRAM_BLOCK;
char *grad_output_buff = grad_input_buff + NRAM_BLOCK;
char *grad_mask_buff = grad_output_buff + NRAM_BLOCK;
int wo = wi * scale;
int ho = hi * scale;
int out_num = n * ho * wo * group;
int group_size = c / group;
int repeat = out_num / taskDim + (int)(taskId < out_num % taskDim);
int num_align = PAD_DOWN(NRAM_BLOCK / sizeof(T), NFU_ALIGN_SIZE / sizeof(T));
int num_per_loop = group_size / num_align;
int rem_for_loop = group_size % num_align;
int rem_for_loop_align = PAD_UP(rem_for_loop, NFU_ALIGN_SIZE / sizeof(T));
for (int k = 0; k < repeat; k++) {
int iter = k * taskDim + taskId;
int group_k = iter % group;
int w_k = (iter / group) % wo;
int h_k = (iter / wo / group) % ho;
int n_k = (iter / ho / wo / group) % n;
int h_i = h_k / scale;
int w_i = w_k / scale;
int start_h = h_i - ((k_up - 1) / 2);
int end_h = h_i + ((k_up - 1) / 2) + 1;
int start_w = w_i - ((k_up - 1) / 2);
int end_w = w_i + ((k_up - 1) / 2) + 1;
T *base_mask = (T *)mask + n_k * ho * wo * group * k_up * k_up +
h_k * wo * group * k_up * k_up + w_k * group * k_up * k_up +
group_k * k_up * k_up;
T *base_grad_mask = (T *)grad_mask + n_k * ho * wo * group * k_up * k_up +
h_k * wo * group * k_up * k_up +
w_k * group * k_up * k_up + group_k * k_up * k_up;
__bang_write_zero((T *)grad_input_buff, NRAM_BLOCK / sizeof(T));
__bang_write_zero((T *)grad_mask_buff, NRAM_BLOCK / sizeof(T));
__bang_write_zero((T *)grad_output_buff, NRAM_BLOCK / sizeof(T));
__memcpy((T *)mask_buff, (T *)base_mask, k_up * k_up * sizeof(T),
GDRAM2NRAM);
for (int i = 0; i < num_per_loop; i++) {
__bang_write_zero((T *)input_buff, NRAM_BLOCK / sizeof(T));
T *base_grad_output = (T *)grad_output + n_k * ho * wo * c +
h_k * wo * c + w_k * c + group_k * group_size +
i * num_align;
__memcpy((T *)grad_output_buff, (T *)base_grad_output,
num_align * sizeof(T), GDRAM2NRAM);
for (int ih = start_h; ih < end_h; ih++) {
for (int iw = start_w; iw < end_w; iw++) {
if (ih < 0 || ih > hi - 1 || iw < 0 || iw > wi - 1) {
continue;
}
int mask_ih = ih - h_i + (k_up - 1) / 2;
int mask_iw = iw - w_i + (k_up - 1) / 2;
int mask_index = mask_ih * k_up + mask_iw;
int input_index = n_k * hi * wi * c + ih * wi * c + iw * c +
group_k * group_size + i * num_align;
T *base_input = (T *)input + input_index;
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
GDRAM2NRAM);
__bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], num_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, num_align);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
num_align);
__bang_sumpool((T *)input_buff, (T *)input_buff,
NFU_ALIGN_SIZE / sizeof(T),
num_align / (NFU_ALIGN_SIZE / sizeof(T)), 1,
num_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, 1, 1);
__bang_reduce_sum((T *)input_buff, (T *)input_buff,
NFU_ALIGN_SIZE / sizeof(T));
((T *)grad_mask_buff)[mask_index] += ((T *)input_buff)[0];
}
}
}
if (rem_for_loop) {
__bang_write_zero((T *)input_buff, NRAM_BLOCK / sizeof(T));
T *base_grad_output = (T *)grad_output + n_k * ho * wo * c +
h_k * wo * c + w_k * c + group_k * group_size +
num_per_loop * num_align;
__memcpy((T *)grad_output_buff, (T *)base_grad_output,
rem_for_loop * sizeof(T), GDRAM2NRAM);
for (int ih = start_h; ih < end_h; ih++) {
for (int iw = start_w; iw < end_w; iw++) {
if (ih < 0 || ih > hi - 1 || iw < 0 || iw > wi - 1) {
continue;
}
int mask_ih = ih - h_i + (k_up - 1) / 2;
int mask_iw = iw - w_i + (k_up - 1) / 2;
int mask_index = mask_ih * k_up + mask_iw;
int input_index = n_k * hi * wi * c + ih * wi * c + iw * c +
group_k * group_size + num_per_loop * num_align;
T *base_input = (T *)input + input_index;
T *base_grad_input = (T *)grad_input + input_index;
__memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
GDRAM2NRAM);
__bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
((T *)mask_buff)[mask_index], rem_for_loop_align);
__bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
(T *)grad_input_buff, rem_for_loop);
__bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
rem_for_loop_align);
__bang_sumpool(
(T *)input_buff, (T *)input_buff, NFU_ALIGN_SIZE / sizeof(T),
rem_for_loop_align / (NFU_ALIGN_SIZE / sizeof(T)), 1,
rem_for_loop_align / (NFU_ALIGN_SIZE / sizeof(T)), 1, 1, 1);
__bang_reduce_sum((T *)input_buff, (T *)input_buff,
NFU_ALIGN_SIZE / sizeof(T));
((T *)grad_mask_buff)[mask_index] += ((T *)input_buff)[0];
}
}
}
__memcpy((T *)base_grad_mask, (T *)grad_mask_buff, k_up * k_up * sizeof(T),
NRAM2GDRAM);
}
}
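// In scalar form, CarafeCompute above realises, for each output pixel
// (n, h, w) and group g (editor's summary of the loop nest, not part of the
// original file):
//   grad_input[n, ih, iw, c] += mask[n, h, w, g, kh, kw] * grad_output[n, h, w, c]
//   grad_mask[n, h, w, g, kh, kw] += sum_c input[n, ih, iw, c] * grad_output[n, h, w, c]
// where (ih, iw) = (h / scale + kh - (k_up - 1) / 2,
//                   w / scale + kw - (k_up - 1) / 2) ranges over the
// k_up x k_up window and c ranges over the channels of group g.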
template <typename T>
__mlu_global__ void MLUUnion1KernelCarafeBackward(
const void *input, const void *mask, const void *grad_output,
void *grad_input, void *grad_mask, const int n, const int hi, const int wi,
const int c, const int k_up, const int group, const int scale) {
CarafeCompute((T *)input, (T *)mask, (T *)grad_output, (T *)grad_input,
(T *)grad_mask, n, hi, wi, c, k_up, group, scale);
}
} // namespace backward
void KernelCarafeForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, const cnrtDataType_t d_type,
const void *input, const void *mask,
const CarafeForwardParam ¶m,
const CarafeForwardBlockDim &block_dim,
const CarafeForwardGridDim &grid_dim, void *output) {
if (d_type == CNRT_FLOAT16) {
forward::MLUBLOCKKernelCarafeForward<half><<<k_dim, k_type, queue>>>(
input, mask, param, block_dim, grid_dim, output);
} else {
forward::MLUBLOCKKernelCarafeForward<float><<<k_dim, k_type, queue>>>(
input, mask, param, block_dim, grid_dim, output);
}
}
void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t dtype,
const void *input, const void *mask,
const void *grad_output, void *grad_input,
void *grad_mask, const int n, const int hi,
const int wi, const int c, const int k_up,
const int group, const int scale) {
if (dtype == CNRT_FLOAT16) {
backward::MLUUnion1KernelCarafeBackward<half><<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
group, scale);
} else {
backward::MLUUnion1KernelCarafeBackward<float><<<k_dim, k_type, queue>>>(
input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
group, scale);
}
}
mmcv/ops/csrc/common/mlu/carafe_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef CARAFE_UTILS_HPP_
#define CARAFE_UTILS_HPP_
#define NRAM_ALIGN_SIZE 64
struct CarafeForwardParam {
  int N;   // batch size
  int Hi;  // input height
  int Wi;  // input width
  int Ci;  // input channels
  int Ho;  // output height
  int Wo;  // output width
  int Cg;  // channels per group

  int kernel_size;       // kernel_size
  int group_size;        // group_size
  int scale_factor;      // scale_factor
  int kernel_size_half;  // kernel half size (K-1)/2
  int kernel_size_sq;    // square of kernel size
  int dtype_size;        // size of tensor data type

  // Host arrays' geometry
  int input_stride_g;
  int input_stride_w;
  int input_stride_h;
  int input_stride_n;
  int input_size;
  int mask_stride_kh;
  int mask_stride_g;
  int mask_stride_w;
  int mask_stride_h;
  int mask_stride_n;
  int mask_size;
  int output_stride_g;
  int output_stride_w;
  int output_stride_h;
  int output_stride_n;
  int output_size;

  // NRAM arrays' geometry
  int input_nram_stride_g;
  int input_nram_stride_w;
  int input_nram_stride_h;
  int input_nram_size;
  int mask_nram_stride_kh;
  int mask_nram_stride_g;
  int mask_nram_stride_w;
  int mask_nram_stride_h;
  int mask_nram_size;
  int output_nram_stride_g;
  int output_nram_stride_w;
  int output_nram_stride_h;
  int output_nram_size;

  // for address/compute alignment
  int align_size_NRAM;  // for addressing on NRAM
  int align_size_NFU;   // for NFU operation length
  int block_Cg_NFU;     // for bang_mul_const

  int job_num;  // total job number
};

struct CarafeForwardBlockDim {
  int Ho;  // block size of output height
  int Wo;  // block size of output width
  int Kh;  // block size of kernel height
  int Kw;  // block size of kernel width
  int G;   // block size of groups
  int Cg;  // block size of channels within a group
  int Hi;  // block size of input height
  int Wi;  // block size of input width
};

struct CarafeForwardGridDim {
  int Ho;  // number of blocks of output height
  int Wo;
  int Kh;
  int Kw;
  int G;
  int Cg;
};
#endif // CARAFE_UTILS_HPP_
mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef COMMON_MLU_HELPER_HPP_
#define COMMON_MLU_HELPER_HPP_
#define NFU_ALIGN_SIZE 128 // Byte
#define REM_FOR_STACK (128 * 1024) // 128KB reserved for cncc
#ifdef __BANG_ARCH__
#define MAX_NRAM_SIZE \
(__MLU_NRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc
#define MAX_SRAM_SIZE \
(__MLU_SRAM_SIZE__ * 1024 - REM_FOR_STACK) // 128KB reserved for cncc
#else
#define MAX_NRAM_SIZE (384 * 1024) // 384KB, initialization value
#define MAX_SRAM_SIZE (1920 * 1024) // 1920KB, initialization value
#endif
#ifndef PAD_UP
#define PAD_UP(x, y) (((x) / (y) + (int)((x) % (y) > 0)) * (y))
#endif
#ifndef PAD_DOWN
#define PAD_DOWN(x, y) (((x) / (y)) * (y))
#endif
#define CEIL_ALIGN(x, y) (((x) + (y)-1) / (y) * (y))
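// Quick sanity examples for the alignment macros (editor's illustration,
// not part of the original header): PAD_UP(10, 8) == 16,
// PAD_DOWN(10, 8) == 8, CEIL_ALIGN(10, 8) == 16; for exact multiples of the
// alignment all three leave the value unchanged, e.g. PAD_UP(16, 8) == 16.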
template <typename scalar_t>
__mlu_func__ inline scalar_t min(scalar_t a, scalar_t b) {
  return a < b ? a : b;
}

template <typename scalar_t>
__mlu_func__ inline scalar_t max(scalar_t a, scalar_t b) {
  return a > b ? a : b;
}
/*!
* @brief loads data from global DRAM to NRAM with 2D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void loadStr2D(T *dst, T *src, const int size, const int dst_str,
                            const int src_str, const int seg_num) {
  if (dst_str == src_str && size == src_str) {
    __memcpy(dst, src, src_str * seg_num * sizeof(T), GDRAM2NRAM);
  } else if ((size == src_str || src_str <= dst_str) &&
             src_str * sizeof(T) <= 512) {
    // gather data less than 512Bytes to improve IO efficiency
    T *tmp = (T *)dst + (dst_str - src_str) * seg_num;
    __memcpy(tmp, src, (src_str * (seg_num - 1) + size) * sizeof(T),
             GDRAM2NRAM);
    if (dst_str != src_str) {
      __memcpy(dst, tmp, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
               src_str * sizeof(T), seg_num - 1);
    }
  } else {
    __memcpy(dst, src, size * sizeof(T), GDRAM2NRAM, dst_str * sizeof(T),
             src_str * sizeof(T), seg_num - 1);
  }
}
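// Usage sketch (editor's illustration, not part of the original header):
// copying seg_num = 3 segments of size = 8 elements from a source whose
// segments sit src_str = 10 elements apart into a destination whose segments
// sit dst_str = 16 elements apart:
//   loadStr2D(dst, src, 8, 16, 10, 3);
// Small strided segments (<= 512 bytes each) are first gathered with one bulk
// copy and then redistributed on NRAM, which is cheaper than issuing three
// separate strided GDRAM reads.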
/*!
* @brief loads data from global DRAM to NRAM with 3D pattern.
*
* @param[out] dst
* Pointer to NRAM that stores dst data.
* @param[in] src
* Pointer to global DRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void loadStr3D(T *dst, T *src, const int size,
                            const int seg_num_in, const int seg_num_out,
                            const int dst_str_in, const int dst_str_out,
                            const int src_str_in, const int src_str_out) {
  T *tmp_dst = dst;
  T *tmp_src = src;
  for (int i = 0; i < seg_num_out; ++i) {
    loadStr2D(tmp_dst, tmp_src, size, dst_str_in, src_str_in, seg_num_in);
    tmp_src += src_str_out;
    tmp_dst += dst_str_out;
  }
}
/*!
* @brief stores data from NRAM to global DRAM with 2D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lower dimension.
* @param[in] dst_str
* The data stride in bytes between segments in the lower dimension of dst.
* @param[in] src_str
* The data stride in bytes between segments in the lower dimension of src.
* @param[in] seg_num
* The total count of data segments in the lower dimension.
*/
template <typename T>
__mlu_func__ void storeStr2D(T *dst, T *src, const int size, const int seg_num,
                             const int dst_str, const int src_str) {
  if ((size == dst_str && dst_str <= src_str) && dst_str * sizeof(T) <= 512) {
    // gather data less than 512Bytes to improve IO efficiency
    if (dst_str != src_str) {
      __memcpy(src, src, size * sizeof(T), NRAM2NRAM, dst_str * sizeof(T),
               src_str * sizeof(T), seg_num - 1);
    }
    __memcpy(dst, src, size * seg_num * sizeof(T), NRAM2GDRAM);
  } else {
    __memcpy(dst, src, size * sizeof(T), NRAM2GDRAM, dst_str * sizeof(T),
             src_str * sizeof(T), seg_num - 1);
  }
}
/*!
* @brief stores data from NRAM to global DRAM with 3D pattern.
*
* @param[out] dst
* Pointer to global DRAM that stores dst data.
* @param[in] src
* Pointer to NRAM that stores src data.
* @param[in] size
* The byte size of segment in the lowest dimension.
* @param[in] seg_num_in
* The total count of data segments in the lowest dimension.
* @param[in] seg_num_out
* The total count of data segments in the middle dimension.
* @param[in] dst_str_in
* The data stride in bytes between segments in the lowest dimension of dst.
* @param[in] dst_str_out
* The data stride in bytes between segments in the middle dimension of dst.
* @param[in] src_str_in
* The data stride in bytes between segments in the lowest dimension of src.
* @param[in] src_str_out
* The data stride in bytes between segments in the middle dimension of src.
*/
template <typename T>
__mlu_func__ void storeStr3D(T *dst, T *src, const int size,
                             const int seg_num_in, const int seg_num_out,
                             const int dst_str_in, const int dst_str_out,
                             const int src_str_in, const int src_str_out) {
  T *tmp_dst = dst;
  T *tmp_src = src;
  for (int i = 0; i < seg_num_out; ++i) {
    storeStr2D(tmp_dst, tmp_src, size, seg_num_in, dst_str_in, src_str_in);
    tmp_src += src_str_out;
    tmp_dst += dst_str_out;
  }
}
/*!
* @brief Converts int32 to float32 data type.
*
* @param[out] dst
* Pointer to NRAM that stores float32 type data.
* @param[in,out] dst_addition
* Pointer to NRAM as the workspace of dst, which has the same size as dst.
* It allows empty pointer on MLU300 series.
* @param[in] src
* Pointer to NRAM that stores int32 type data.
* @param[in,out] src_addition
* Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
* It allows empty pointer on MLU300 series.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_int2float((float *)dst, (int32_t *)src, src_count, 0);
#else
  // get sign bit
  const float move_23bit = 8388608.0;
  // 0x80000000 = 1,000000000,0000000000000000000000000000
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  // get 1 or 0 from sign bit
  // judge if odd
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x00000001);
  __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
                   (char *)src_addition, src_count * sizeof(float),
                   NFU_ALIGN_SIZE);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000001);
  __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  // minus xor, positive num invariant
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xffffffff);
  __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
  // convert int32 to float32
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x7fffff);
  __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x4b000000);
  __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
                   src_count * sizeof(float), NFU_ALIGN_SIZE);
  __bang_sub_scalar(dst, dst, move_23bit, src_count);
  // add one
  __bang_add(dst, dst, dst_addition, src_count);
  // set sign for float32
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xffffffff);
  __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x00000001);
  __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * 4, 128);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
#endif  // __BANG_ARCH__ >= 300
}
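// Editor's illustration (not part of the original header): a host-side
// sketch of the magic-number trick used by the fallback branch above.
// For 0 <= x < 2^23, OR-ing x into the mantissa of 2^23 (0x4b000000)
// yields the float value 2^23 + x, so subtracting 2^23 recovers float(x)
// exactly. Requires <stdint.h>.
#include <stdint.h>
static inline float int2float_demo(uint32_t x) {
  union { uint32_t u; float f; } v;
  v.u = 0x4b000000u | (x & 0x007fffffu);  // exponent of 2^23, mantissa = x
  return v.f - 8388608.0f;                // 8388608.0f == 2^23
}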
/*!
* @brief Converts float32 to int32 data type with to_zero round mode.
*
* @param[out] dst
* Pointer to NRAM that stores int32 type data.
* @param[in,out] dst_addition
* Pointer to NRAM as the workspace of dst, which has the same size as dst.
* It allows empty pointer on MLU300 series.
* @param[in] src
* Pointer to NRAM that stores float32 type data.
* @param[in,out] src_addition
* Pointer to NRAM as the workspace of src, which has a size of 128 Bytes.
* It allows empty pointer on MLU300 series.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
                                   float *src_addition, const int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_float2int_tz((int32_t *)dst, (float *)src, src_count, 0);
#else
  // sign ===> src_addition
  // dst = -1.0 : when src[i] is a negative number
  // dst = +1.0 : when src[i] is a positive number
  const int floatDchar = sizeof(float) / sizeof(char);
  __bang_active_sign((float *)dst, src, src_count);
  // dst_addition = abs(src)
  __bang_mul(dst_addition, src, (float *)dst, src_count);
  // if dst_addition < 1.0, then src_addition + 1, to fix add error
  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     1.0f);
  __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     0xbf800000);
  // set negative flag -1.0 = 0xbf800000
  // to mark all src in [x < -1.0]
  __bang_cycle_eq((float *)dst, (float *)dst, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_active_abs(dst_addition, src, src_count);
  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     8388608.0f);
  // mask shift move 23
  // right shift move 23bit
  __bang_cycle_add_tz(dst_addition, dst_addition, src_addition, src_count,
                      NFU_ALIGN_SIZE / sizeof(float));
  // two's complement for negative
  // dst = 1.0, when src < -1.0
  // dst = 0.0, when src >= -1.0
  __bang_sub(dst_addition, dst_addition, (float *)dst, src_count);
  // to fix max value
  // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
  // means max value.
  __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
  __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
              src_count * floatDchar);
  // get low 23bit
  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
                     (unsigned)0x007fffff);
  // mask low 23bit is 1
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * floatDchar,
                    NFU_ALIGN_SIZE / sizeof(char));
  // set 9 high bit ===> dst
  // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
  //  1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
  __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
  __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  // src or dst_addition
  __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
  __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
#endif  // __BANG_ARCH__ >= 300
}
/*!
* @brief Converts float32 to half data type,
* the rounding mode on MLU200 is rd, on MLU300 is rn.
*
* @param[out] dst
* Pointer to NRAM that stores half type data.
* @param[in] src
* Pointer to NRAM that stores float32 type data.
* @param[in] src_count
* The count of elements in src.
*/
__mlu_func__ inline void convertFloat2half(half *dst, float *src,
                                           int src_count) {
#if __BANG_ARCH__ >= 300
  __bang_float2half_rn(dst, src, src_count);
#else
  __bang_float2half_rd(dst, src, src_count);
#endif
}
/*!
 * @brief recursiveSumPool: sums high_dim segments of low_dim elements down to
 *        a single segment, reducing at most kernel_limit segments per
 *        __bang_sumpool call.
 * @param[in,out] dst
 *        Pointer to NRAM that stores the input and output data.
 * @param[in] low_dim
 *        The number of elements in the low (innermost) dimension.
 * @param[in] high_dim
 *        The number of high-dimension segments to be summed.
 * @param[in] kernel_limit
 *        The maximum number of high-dimension segments reduced by one
 *        sumpool call.
 ******************************************************************************/
template <typename T>
__mlu_func__ void recursiveSumPool(T *dst, int low_dim, int high_dim,
                                   int kernel_limit) {
  for (; high_dim > 1;) {
    int repeat_s = high_dim / kernel_limit;
    int remain_s = high_dim % kernel_limit;
    if (remain_s) {
      __bang_sumpool((T *)dst, (T *)dst, low_dim, 1, remain_s, 1, remain_s, 1,
                     1);
    }
    if (repeat_s) {
      __bang_sumpool((T *)dst + (remain_s > 0 ? low_dim : 0),
                     (T *)dst + remain_s * low_dim, low_dim,
                     kernel_limit * repeat_s, 1, kernel_limit, 1, 1,
                     kernel_limit);
    }
    high_dim = repeat_s + (bool)remain_s;
  }
  return;
}
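// Worked example (editor's illustration, not part of the original header):
// with low_dim = 64, high_dim = 5 and kernel_limit = 2, successive passes
// shrink the high dimension 5 -> 3 -> 2 -> 1; after the loop, the
// element-wise sum of all five 64-element segments sits in dst[0..63].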
#endif // COMMON_MLU_HELPER_HPP_
mmcv/ops/csrc/common/mlu/deform_roi_pool_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <iostream>
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 5
#define FOURSPLIT 4
#define FIVESPLIT 5
#define NINESPLIT 9
#define THIRTEENSPLIT 13
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
static __mlu_func__ void bilinearInterpolate(const int input_width, T y, T x,
T *w1, T *w2, T *w3, T *w4,
int *x_low, int *x_high,
const int y_low, bool *is_empty) {
if (x < -1.0 || x > input_width) {
*is_empty = true;
return;
}
if (x <= 0) x = 0;
*x_low = int(x);
if (*x_low >= input_width - 1) {
*x_high = *x_low = input_width - 1;
x = T(*x_low);
} else {
*x_high = *x_low + 1;
}
T ly = y - y_low;
T lx = x - *x_low;
T hy = 1.0 - ly;
T hx = 1.0 - lx;
*w1 = hy * hx;
*w2 = hy * lx;
*w3 = ly * hx;
*w4 = ly * lx;
}
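// Worked example (editor's illustration, not part of the original file):
// with y = 1.25 (so y_low = 1) and x = 2.5 inside the image, the function
// yields x_low = 2, x_high = 3, ly = 0.25, lx = 0.5, and the corner weights
// w1 = 0.375, w2 = 0.375, w3 = 0.125, w4 = 0.125, which sum to 1 as expected
// for bilinear interpolation.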
template <typename T>
__mlu_func__ void MLUUnion1DeformRoIPoolForward(
const T *input, const T *rois, const T *offset, T *output,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const T spatial_scale,
const int sampling_ratio, const T gamma) {
for (int bin_index = taskId;
bin_index < num_rois * pooled_width * pooled_height;
bin_index += taskDim) {
int out_batch = bin_index / pooled_width / pooled_height;
int out_height = bin_index / pooled_width % pooled_height;
int out_width = bin_index % pooled_width;
const T *cur_roi = rois + out_batch * ROI_OFFSET;
T *nram_rois = (T *)nram_buffer;
__memcpy((void *)nram_rois, (void *)cur_roi, ROI_OFFSET * sizeof(T),
GDRAM2NRAM);
const int roi_batch = nram_rois[0];
T roi_x_min = nram_rois[1] * spatial_scale - 0.5;
T roi_y_min = nram_rois[2] * spatial_scale - 0.5;
const T roi_x_max = nram_rois[3] * spatial_scale - 0.5;
const T roi_y_max = nram_rois[4] * spatial_scale - 0.5;
const T roi_width = roi_x_max - roi_x_min;
const T roi_height = roi_y_max - roi_y_min;
const T bin_width = roi_width / static_cast<T>(pooled_width);
const T bin_height = roi_height / static_cast<T>(pooled_height);
const T *offset_input = input + roi_batch * height * width * channels;
int roi_bin_grid_height =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_height / pooled_height));
int roi_bin_grid_width =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_width / pooled_width));
if (offset != NULL) {
const T *offset_cur = offset +
out_batch * pooled_width * pooled_height * 2 +
out_height * pooled_width + out_width;
roi_x_min += gamma * roi_width * offset_cur[0];
roi_y_min +=
gamma * roi_height * offset_cur[pooled_width * pooled_height];
}
int type_align = NFU_ALIGN_SIZE / sizeof(T);
int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T);
int channels_nram_split =
channels_max_num_nram / NINESPLIT / type_align * type_align;
int channel_rem = channels % channels_nram_split;
int channel_loops =
channels / channels_nram_split + (channel_rem != 0 ? 1 : 0);
for (int channel_loop_index = 0; channel_loop_index < channel_loops;
++channel_loop_index) {
int channels_num =
channels_nram_split >= channels ? channels : channels_nram_split;
const int channel_offset = channel_loop_index * channels_num;
if (channel_loop_index + 1 == channel_loops && channel_rem != 0) {
channels_num = channel_rem;
}
int channels_align = CEIL_ALIGN(channels_num, type_align);
int nram_limit = (MAX_NRAM_SIZE / sizeof(T) - channels_align) >> 1;
int c_slice = nram_limit / FOURSPLIT / type_align * type_align;
int c_slice_align = 0;
/* NRAM partition
*
* | | ping | pong |
* |----------|-------------------|-------------------|
* | nram_out | p1 | p2 | p3 | p4 | p1 | p2 | p3 | p4 |
*
*/
T *nram_out = (T *)nram_buffer;
T *nram_ping = nram_out + channels_align;
T *nram_pong = nram_ping + nram_limit;
__bang_write_value((T *)nram_out, channels_align, (T)0);
__bang_write_value((T *)nram_ping, FOURSPLIT * c_slice, (T)0);
__bang_write_value((T *)nram_pong, FOURSPLIT * c_slice, (T)0);
const T num_bins =
static_cast<T>(max(roi_bin_grid_height * roi_bin_grid_width, 1));
const T value_div = 1.0f / num_bins;
bool is_ping_empty = true;
for (int iy = 0; iy < roi_bin_grid_height; ++iy) {
T y = roi_y_min + out_height * bin_height +
static_cast<T>(iy + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
if (y < -1.0 || y > height) {
is_ping_empty = true;
continue;
}
if (y <= 0) {
y = 0;
}
int y_low = 0, y_high = 0;
y_low = int(y);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = T(y_low);
} else {
y_high = y_low + 1;
}
for (int ix = 0; ix < roi_bin_grid_width; ++ix) {
T x = roi_x_min + out_width * bin_width +
static_cast<T>(ix + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
const int sample_index = iy * roi_bin_grid_width + ix;
int c_rem = channels_num;
c_slice = nram_limit / FOURSPLIT / type_align * type_align;
c_slice_align = 0;
bool is_empty = false;
T w1, w2, w3, w4;
int x_low = 0, x_high = 0;
bilinearInterpolate(width, y, x, &w1, &w2, &w3, &w4, &x_low, &x_high,
y_low, &is_empty);
if (is_empty) {
is_ping_empty = true;
continue;
}
if (is_ping_empty) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
__bang_write_value(nram_ping, FOURSPLIT * c_slice_align, (T)0);
__asm__ volatile("sync;");
__memcpy(nram_ping,
offset_input + y_low * width * channels +
x_low * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping + c_slice_align,
offset_input + y_low * width * channels +
x_high * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping + 2 * c_slice_align,
offset_input + y_high * width * channels +
x_low * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping + 3 * c_slice_align,
offset_input + y_high * width * channels +
x_high * channels + channel_offset,
c_slice * sizeof(T), GDRAM2NRAM);
is_ping_empty = false;
}
int c_offset = 0;
int pongc_slice = 0;
int pongc_slice_align = 0;
while (c_rem > 0) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
if (sample_index + 1 < roi_bin_grid_height * roi_bin_grid_width) {
int iy_tmp = (sample_index + 1) / roi_bin_grid_width;
int ix_tmp = (sample_index + 1) % roi_bin_grid_width;
y = roi_y_min + out_height * bin_height +
static_cast<T>(iy_tmp + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
x = roi_x_min + out_width * bin_width +
static_cast<T>(ix_tmp + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
if (y < -1.0 || y > height) {
is_empty = true;
} else {
T w1_tmp, w2_tmp, w3_tmp, w4_tmp;
if (y <= 0) {
y = 0;
}
y_low = int(y);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = T(y_low);
} else {
y_high = y_low + 1;
}
bilinearInterpolate(width, y, x, &w1_tmp, &w2_tmp, &w3_tmp,
&w4_tmp, &x_low, &x_high, y_low, &is_empty);
}
pongc_slice = nram_limit / FOURSPLIT / type_align * type_align;
pongc_slice =
pongc_slice > channels_num ? channels_num : pongc_slice;
pongc_slice_align = CEIL_ALIGN(pongc_slice, type_align);
__bang_write_value(nram_pong, FOURSPLIT * pongc_slice_align,
(T)0);
__asm__ volatile("sync;");
if (!is_empty) {
__memcpy_async(nram_pong,
offset_input + y_low * width * channels +
x_low * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong + pongc_slice_align,
offset_input + y_low * width * channels +
x_high * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong + 2 * pongc_slice_align,
offset_input + y_high * width * channels +
x_low * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong + 3 * pongc_slice_align,
offset_input + y_high * width * channels +
x_high * channels + channel_offset,
pongc_slice * sizeof(T), GDRAM2NRAM);
}
}
__bang_mul_scalar(nram_ping, nram_ping, w1, c_slice_align);
__bang_mul_scalar(nram_ping + c_slice_align,
nram_ping + c_slice_align, w2, c_slice_align);
__bang_add(nram_ping, nram_ping, nram_ping + c_slice_align,
c_slice_align);
__bang_mul_scalar(nram_ping + 2 * c_slice_align,
nram_ping + 2 * c_slice_align, w3, c_slice_align);
__bang_add(nram_ping, nram_ping, nram_ping + 2 * c_slice_align,
c_slice_align);
__bang_mul_scalar(nram_ping + 3 * c_slice_align,
nram_ping + 3 * c_slice_align, w4, c_slice_align);
__bang_add(nram_ping, nram_ping, nram_ping + 3 * c_slice_align,
c_slice_align);
__bang_add(nram_out + c_offset, nram_out + c_offset, nram_ping,
c_slice_align);
T *nram_tmp = nram_ping;
nram_ping = nram_pong;
nram_pong = nram_tmp;
c_rem -= c_slice;
c_offset += c_slice;
__asm__ volatile("sync;");
}
}
}
__bang_mul_scalar(nram_out, nram_out, value_div, channels_align);
__memcpy(output + channels * bin_index + channel_offset, nram_out,
channels_num * sizeof(T), NRAM2GDRAM);
}
}
}
__mlu_global__ void MLUKernelDeformRoIPoolForward(
cnrtDataType_t data_type, const void *input, const void *rois,
const void *offset, void *output, const int channels, const int height,
const int width, const int num_rois, const int pooled_height,
const int pooled_width, const float spatial_scale, const int sampling_ratio,
const float gamma) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1DeformRoIPoolForward((half *)input, (half *)rois, (half *)offset,
(half *)output, channels, height, width,
num_rois, pooled_height, pooled_width,
static_cast<half>(spatial_scale),
sampling_ratio, static_cast<half>(gamma));
}; break;
case CNRT_FLOAT32: {
MLUUnion1DeformRoIPoolForward(
(float *)input, (float *)rois, (float *)offset, (float *)output,
channels, height, width, num_rois, pooled_height, pooled_width,
static_cast<float>(spatial_scale), sampling_ratio,
static_cast<float>(gamma));
}; break;
default: {
break;
}
}
}
void KernelDeformRoIPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t data_type,
const void *input, const void *rois,
const void *offset, void *output,
const int channels, const int height,
const int width, const int num_rois,
const int pooled_height, const int pooled_width,
const float spatial_scale,
const int sampling_ratio, const float gamma) {
MLUKernelDeformRoIPoolForward<<<k_dim, k_type, queue>>>(
data_type, input, rois, offset, output, channels, height, width, num_rois,
pooled_height, pooled_width, spatial_scale, sampling_ratio, gamma);
}
template <typename T>
__mlu_func__ void MLUUnion1DeformRoIPoolBackward(
const T *grad_output, const T *input, const T *rois, const T *offset,
T *grad_input, T *grad_offset, const int channels, const int height,
const int width, const int num_rois, const int pooled_height,
const int pooled_width, const T spatial_scale, const int sampling_ratio,
const T gamma) {
for (int bin_index = taskId;
bin_index < num_rois * pooled_width * pooled_height;
bin_index += taskDim) {
int out_batch = bin_index / pooled_width / pooled_height;
int out_height = bin_index / pooled_width % pooled_height;
int out_width = bin_index % pooled_width;
const T *cur_roi = rois + out_batch * ROI_OFFSET;
T *nram_rois = (T *)nram_buffer;
__memcpy((void *)nram_rois, (void *)cur_roi, ROI_OFFSET * sizeof(T),
GDRAM2NRAM);
const int roi_batch = nram_rois[0];
T roi_x_min = nram_rois[1] * spatial_scale - 0.5;
T roi_y_min = nram_rois[2] * spatial_scale - 0.5;
const T roi_x_max = nram_rois[3] * spatial_scale - 0.5;
const T roi_y_max = nram_rois[4] * spatial_scale - 0.5;
const T roi_width = roi_x_max - roi_x_min;
const T roi_height = roi_y_max - roi_y_min;
const T bin_width = roi_width / static_cast<T>(pooled_width);
const T bin_height = roi_height / static_cast<T>(pooled_height);
const T *offset_input = input + roi_batch * height * width * channels;
T *offset_grad_input = grad_input + roi_batch * height * width * channels;
int roi_bin_grid_height =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_height / pooled_height));
int roi_bin_grid_width =
(sampling_ratio > 0)
? sampling_ratio
: static_cast<int>(ceilf(roi_width / pooled_width));
if (offset != NULL) {
const T *offset_cur = offset +
out_batch * pooled_width * pooled_height * 2 +
out_height * pooled_width + out_width;
roi_x_min += gamma * roi_width * offset_cur[0];
roi_y_min +=
gamma * roi_height * offset_cur[pooled_width * pooled_height];
}
/* NRAM partition
*
* If offset != NULL, the NRAM partition is as below.
* | |
* ping | pong |
* |---------------------------------------------------------------------|-----------|-----------|
* |nram_tmp1|nram_tmp2|nram_tmp3|nram_tmp4|nram_grad_output|nram_sum_tmp|p1|p2|p3|p4|p1|p2|p3|p4|
*
* If offset == NULL, the ping and pong buffers are not needed.
* | |
* |----------------------------------------------------------------------------------|
* | nram_tmp1 | nram_tmp2 | nram_tmp3 | nram_tmp4 | nram_grad_output |
*
*/
int type_align = NFU_ALIGN_SIZE / sizeof(T);
int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T);
int channels_nram_split =
channels_max_num_nram / FIVESPLIT / type_align * type_align;
int channel_rem = channels % channels_nram_split;
int channel_loops =
channels / channels_nram_split + (channel_rem != 0 ? 1 : 0);
if (offset != NULL) {
channels_nram_split =
channels_max_num_nram / THIRTEENSPLIT / type_align * type_align;
channel_rem = channels % channels_nram_split;
channel_loops =
channels / channels_nram_split + (channel_rem != 0 ? 1 : 0);
}
for (int channel_loop_index = 0; channel_loop_index < channel_loops;
++channel_loop_index) {
int channels_num =
channels_nram_split >= channels ? channels : channels_nram_split;
const int channel_offset = channel_loop_index * channels_num;
if (channel_loop_index + 1 == channel_loops && channel_rem != 0) {
channels_num = channel_rem;
}
int channels_align = CEIL_ALIGN(channels_num, type_align);
const int32_t nram_sum_tmp_channel = NFU_ALIGN_SIZE / sizeof(T);
int nram_limit = (MAX_NRAM_SIZE / sizeof(T) - 5 * channels_align -
nram_sum_tmp_channel) >>
1;
int c_slice = 0;
int c_slice_align = 0;
T *nram_tmp1 = (T *)nram_buffer;
T *nram_tmp2 = (T *)nram_buffer + channels_align;
T *nram_tmp3 = (T *)nram_buffer + 2 * channels_align;
T *nram_tmp4 = (T *)nram_buffer + 3 * channels_align;
T *nram_grad_output = nram_tmp4 + channels_align;
T *nram_sum_tmp = NULL;
T *nram_ping_input = NULL;
T *nram_pong_input = NULL;
__bang_write_value((T *)nram_grad_output, channels_align, (T)0);
__asm__ volatile("sync;");
if (offset != NULL) {
c_slice = nram_limit / FOURSPLIT / type_align * type_align;
nram_sum_tmp = nram_grad_output + channels_align;
nram_ping_input = nram_sum_tmp + nram_sum_tmp_channel;
nram_pong_input = nram_ping_input + FOURSPLIT * c_slice;
__bang_write_value((T *)nram_sum_tmp, nram_sum_tmp_channel, (T)0);
__bang_write_value((T *)nram_ping_input, FOURSPLIT * c_slice, (T)0);
__bang_write_value((T *)nram_pong_input, FOURSPLIT * c_slice, (T)0);
__asm__ volatile("sync;");
}
const T num_bins =
static_cast<T>(max(roi_bin_grid_height * roi_bin_grid_width, 1));
const T value_div = 1.0f / num_bins;
bool is_ping_empty = true;
__memcpy(nram_grad_output,
grad_output + channels * bin_index + channel_offset,
channels_num * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(nram_grad_output, nram_grad_output, value_div,
channels_align);
for (int iy = 0; iy < roi_bin_grid_height; ++iy) {
T y = roi_y_min + out_height * bin_height +
static_cast<T>(iy + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
T y_tmp = y;
if (y_tmp < -1.0 || y_tmp > height) {
is_ping_empty = true;
continue;
}
if (y_tmp <= 0) {
y_tmp = 0;
}
int y_low = 0, y_high = 0;
y_low = int(y_tmp);
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y_tmp = T(y_low);
} else {
y_high = y_low + 1;
}
for (int ix = 0; ix < roi_bin_grid_width; ++ix) {
T x = roi_x_min + out_width * bin_width +
static_cast<T>(ix + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
const int sample_index = iy * roi_bin_grid_width + ix;
int c_rem = channels_num;
bool is_empty = false;
T w1, w2, w3, w4;
int x_low = 0, x_high = 0;
bilinearInterpolate(width, y_tmp, x, &w1, &w2, &w3, &w4, &x_low,
&x_high, y_low, &is_empty);
if (is_empty) {
is_ping_empty = true;
continue;
}
__bang_mul_scalar((T *)nram_tmp1, (T *)nram_grad_output, w1,
channels_align);
__bang_mul_scalar((T *)nram_tmp2, (T *)nram_grad_output, w2,
channels_align);
__bang_mul_scalar((T *)nram_tmp3, (T *)nram_grad_output, w3,
channels_align);
__bang_mul_scalar((T *)nram_tmp4, (T *)nram_grad_output, w4,
channels_align);
__asm__ volatile("sync;");
__bang_atomic_add(
(T *)nram_tmp1,
(T *)(offset_grad_input + (y_low * width + x_low) * channels +
channel_offset),
(T *)nram_tmp1, channels_num);
__bang_atomic_add(
(T *)nram_tmp2,
(T *)(offset_grad_input + (y_low * width + x_high) * channels +
channel_offset),
(T *)nram_tmp2, channels_num);
__bang_atomic_add(
(T *)nram_tmp3,
(T *)(offset_grad_input + (y_high * width + x_low) * channels +
channel_offset),
(T *)nram_tmp3, channels_num);
__bang_atomic_add(
(T *)nram_tmp4,
(T *)(offset_grad_input + (y_high * width + x_high) * channels +
channel_offset),
(T *)nram_tmp4, channels_num);
if (offset != NULL) {
c_slice = nram_limit / FOURSPLIT / type_align * type_align;
c_slice_align = 0;
if (is_ping_empty) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
__bang_write_value(nram_ping_input, FOURSPLIT * c_slice_align,
(T)0);
__asm__ volatile("sync;");
const T *src_offset1 = offset_input + y_low * width * channels +
x_low * channels + channel_offset;
const T *src_offset2 = offset_input + y_low * width * channels +
x_high * channels + channel_offset;
const T *src_offset3 = offset_input + y_high * width * channels +
x_low * channels + channel_offset;
const T *src_offset4 = offset_input + y_high * width * channels +
x_high * channels + channel_offset;
__memcpy(nram_ping_input, src_offset1, c_slice * sizeof(T),
GDRAM2NRAM);
__memcpy(nram_ping_input + c_slice_align, src_offset2,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping_input + 2 * c_slice_align, src_offset3,
c_slice * sizeof(T), GDRAM2NRAM);
__memcpy(nram_ping_input + 3 * c_slice_align, src_offset4,
c_slice * sizeof(T), GDRAM2NRAM);
is_ping_empty = false;
}
int c_offset = 0;
int pongc_slice = 0;
int pongc_slice_align = 0;
while (c_rem > 0) {
c_slice = c_slice > c_rem ? c_rem : c_slice;
c_slice_align = CEIL_ALIGN(c_slice, type_align);
if (sample_index + 1 < roi_bin_grid_height * roi_bin_grid_width) {
int iy_tmp = (sample_index + 1) / roi_bin_grid_width;
int ix_tmp = (sample_index + 1) % roi_bin_grid_width;
T y_tmp = roi_y_min + out_height * bin_height +
static_cast<T>(iy_tmp + .5f) * bin_height /
static_cast<T>(roi_bin_grid_height);
T x_tmp = roi_x_min + out_width * bin_width +
static_cast<T>(ix_tmp + .5f) * bin_width /
static_cast<T>(roi_bin_grid_width);
int x_low_tmp = 0, x_high_tmp = 0, y_low_tmp = 0,
y_high_tmp = 0;
if (y_tmp < -1.0 || y_tmp > height) {
is_empty = true;
} else {
T w1_tmp, w2_tmp, w3_tmp, w4_tmp;
if (y_tmp <= 0) {
y_tmp = 0;
}
y_low_tmp = int(y_tmp);
if (y_low_tmp >= height - 1) {
y_high_tmp = y_low_tmp = height - 1;
y_tmp = T(y_low_tmp);
} else {
y_high_tmp = y_low_tmp + 1;
}
bilinearInterpolate(width, y_tmp, x_tmp, &w1_tmp, &w2_tmp,
&w3_tmp, &w4_tmp, &x_low_tmp, &x_high_tmp,
y_low_tmp, &is_empty);
}
pongc_slice = nram_limit / FOURSPLIT / type_align * type_align;
pongc_slice =
pongc_slice > channels_num ? channels_num : pongc_slice;
pongc_slice_align = CEIL_ALIGN(pongc_slice, type_align);
__bang_write_value(nram_pong_input,
FOURSPLIT * pongc_slice_align, (T)0);
__asm__ volatile("sync;");
if (!is_empty) {
const T *src_offset1 = offset_input +
y_low_tmp * width * channels +
x_low_tmp * channels + channel_offset;
const T *src_offset2 = offset_input +
y_low_tmp * width * channels +
x_high_tmp * channels + channel_offset;
const T *src_offset3 = offset_input +
y_high_tmp * width * channels +
x_low_tmp * channels + channel_offset;
const T *src_offset4 = offset_input +
y_high_tmp * width * channels +
x_high_tmp * channels + channel_offset;
__memcpy_async(nram_pong_input, src_offset1,
pongc_slice * sizeof(T), GDRAM2NRAM);
__memcpy_async(nram_pong_input + pongc_slice_align,
src_offset2, pongc_slice * sizeof(T),
GDRAM2NRAM);
__memcpy_async(nram_pong_input + 2 * pongc_slice_align,
src_offset3, pongc_slice * sizeof(T),
GDRAM2NRAM);
__memcpy_async(nram_pong_input + 3 * pongc_slice_align,
src_offset4, pongc_slice * sizeof(T),
GDRAM2NRAM);
}
}
__bang_mul_scalar(nram_tmp1, nram_ping_input + 3 * c_slice_align,
y - y_low, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + c_slice_align,
y_high - y, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + 2 * c_slice_align,
y_low - y, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input, y - y_high,
c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp1, nram_tmp1, gamma * roi_width,
c_slice_align);
__bang_mul(nram_tmp1, nram_grad_output, nram_tmp1, c_slice_align);
const int32_t kernel_width =
c_slice_align / nram_sum_tmp_channel +
(int32_t)(c_slice_align % nram_sum_tmp_channel > 0);
__bang_sumpool(nram_sum_tmp, nram_tmp1, nram_sum_tmp_channel, 1,
kernel_width, 1, kernel_width, kernel_width, 1);
__bang_reduce_sum(nram_sum_tmp, nram_sum_tmp,
nram_sum_tmp_channel);
__bang_atomic_add(
(T *)nram_sum_tmp,
(T *)(grad_offset +
out_batch * pooled_width * pooled_height * 2 +
out_height * pooled_width + out_width),
(T *)nram_sum_tmp, 1);
__bang_write_value((T *)nram_sum_tmp, nram_sum_tmp_channel, (T)0);
__bang_mul_scalar(nram_tmp1, nram_ping_input + 3 * c_slice_align,
x - x_low, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + 2 * c_slice_align,
x_high - x, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input + c_slice_align,
x_low - x, c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp2, nram_ping_input, x - x_high,
c_slice_align);
__bang_add(nram_tmp1, nram_tmp1, nram_tmp2, c_slice_align);
__bang_mul_scalar(nram_tmp1, nram_tmp1, gamma * roi_height,
c_slice_align);
__bang_mul(nram_tmp1, nram_grad_output, nram_tmp1, c_slice_align);
__bang_sumpool(nram_sum_tmp, nram_tmp1, nram_sum_tmp_channel, 1,
kernel_width, 1, kernel_width, kernel_width, 1);
__bang_reduce_sum(nram_sum_tmp, nram_sum_tmp,
NFU_ALIGN_SIZE / sizeof(T));
__bang_atomic_add(
(T *)nram_sum_tmp,
(T *)(grad_offset +
out_batch * pooled_width * pooled_height * 2 +
pooled_width * pooled_height +
out_height * pooled_width + out_width),
(T *)nram_sum_tmp, 1);
T *nram_tmp = nram_ping_input;
nram_ping_input = nram_pong_input;
nram_pong_input = nram_tmp;
c_rem -= c_slice;
c_offset += c_slice;
__asm__ volatile("sync;");
}
}
}
}
}
}
}
__mlu_global__ void MLUKernelDeformRoIPoolBackward(
cnrtDataType_t data_type, const void *grad_output, const void *input,
const void *rois, const void *offset, void *grad_input, void *grad_offset,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const float spatial_scale,
const int sampling_ratio, const float gamma) {
switch (data_type) {
case CNRT_FLOAT16: {
MLUUnion1DeformRoIPoolBackward(
(half *)grad_output, (half *)input, (half *)rois, (half *)offset,
(half *)grad_input, (half *)grad_offset, channels, height, width,
num_rois, pooled_height, pooled_width,
static_cast<half>(spatial_scale), sampling_ratio,
static_cast<half>(gamma));
}; break;
case CNRT_FLOAT32: {
MLUUnion1DeformRoIPoolBackward(
(float *)grad_output, (float *)input, (float *)rois, (float *)offset,
(float *)grad_input, (float *)grad_offset, channels, height, width,
num_rois, pooled_height, pooled_width,
static_cast<float>(spatial_scale), sampling_ratio,
static_cast<float>(gamma));
}; break;
default: {
break;
}
}
}
void KernelDeformRoIPoolBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
cnrtDataType_t data_type, const void *grad_output, const void *input,
const void *rois, const void *offset, void *grad_input, void *grad_offset,
const int channels, const int height, const int width, const int num_rois,
const int pooled_height, const int pooled_width, const float spatial_scale,
const int sampling_ratio, const float gamma) {
MLUKernelDeformRoIPoolBackward<<<k_dim, k_type, queue>>>(
data_type, grad_output, input, rois, offset, grad_input, grad_offset,
channels, height, width, num_rois, pooled_height, pooled_width,
spatial_scale, sampling_ratio, gamma);
}
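// --- Usage sketch (not from the original file; a minimal host-side launch,
// --- assuming `queue` and all device pointers were prepared by the caller;
// --- the dimension policy below is a hypothetical example):
//   cnrtDim3_t k_dim = {4, 1, 1};  // one UNION1 task group of 4 cores
//   KernelDeformRoIPoolBackward(k_dim, CNRT_FUNC_TYPE_UNION1, queue,
//                               CNRT_FLOAT32, grad_output, input, rois,
//                               offset, grad_input, grad_offset, channels,
//                               height, width, num_rois, pooled_height,
//                               pooled_width, spatial_scale, sampling_ratio,
//                               gamma);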
mmcv/ops/csrc/common/mlu/focal_loss_sigmoid_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include <float.h>
#include "common_mlu_helper.hpp"
#define PING 0
#define PONG 1
__nram__ char nram_buffer[MAX_NRAM_SIZE];
namespace forward {
template <typename T>
__mlu_func__ void loadInput(char *nram_input, T *dram_input, const int32_t size,
const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
if (dst_stride == src_stride) {
__memcpy_async(nram_input, dram_input, size * count, GDRAM2NRAM);
} else {
__memcpy_async(nram_input, dram_input, size, GDRAM2NRAM, dst_stride,
src_stride, count - 1);
}
}
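// Illustrative call (values are hypothetical): gather 4 rows of C floats
// from a densely packed (N, C) tensor into NRAM rows padded to c_align:
//   loadInput<float>(nram, dram, C * sizeof(float),
//                    c_align * sizeof(float),  // dst_stride: padded rows
//                    C * sizeof(float),        // src_stride: packed rows
//                    4);                       // count: number of rows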
template <typename T>
__mlu_func__ void loadWeight(char *nram_input, T *dram_input, const int32_t t,
const int32_t c, const int32_t has_weight,
const int32_t partition_nc) {
if (has_weight && partition_nc && t >= 0 && t < c) {
__memcpy_async(nram_input, (T *)dram_input + t, sizeof(T), GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void storeOutput(T *dram_output, char *nram_output,
const int32_t size, const int32_t dst_stride = 0,
const int32_t src_stride = 0,
const int32_t count = 1) {
if (dst_stride == src_stride) {
__memcpy_async(dram_output, nram_output, size * count, NRAM2GDRAM);
} else {
__memcpy_async(dram_output, nram_output, size, NRAM2GDRAM, dst_stride,
src_stride, count - 1);
}
}
template <typename T>
__mlu_func__ void compute(T *input, const int32_t *target, const T *weight,
const int32_t has_weight, const int32_t partition_nc,
const int32_t deal_num, const int32_t n_seg,
const int32_t c, const int32_t c_seg,
const int32_t c_start_index, const float alpha,
const float gamma, T *compute_a, T *compute_b,
T *output) {
// set params
const int32_t c_num =
has_weight ? PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T)) : c_seg;
const int32_t c_end_index = c_start_index + c_seg;
const int32_t half_epsilon = 0x0400;
const T epsilon_f =
sizeof(T) == sizeof(float) ? FLT_MIN : *((half *)&half_epsilon);
// 0. alpha_t * p_t^r = alpha * (1 - p) ^ gamma if t == c_i
// = (1 - alpha) * p ^ gamma if t != c_i
__nramset((T *)output, deal_num, (T)(1 - alpha));
__bang_active_sigmoid((T *)compute_b, (T *)input, deal_num);
for (int32_t i = 0; i < n_seg; ++i) {
const int32_t t = *((uint32_t *)target + i);
if (t >= c_start_index && t < c_end_index) {
const uint32_t index = i * c_num + t - c_start_index;
*((T *)input + index) = -1.0 * (*((T *)input + index));
*((T *)compute_b + index) = 1.0 - (*((T *)compute_b + index)) + epsilon_f;
*((T *)output + index) = alpha;
}
}
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)compute_a, (half *)compute_b, deal_num);
__bang_active_loghp((float *)compute_a, (float *)compute_a, deal_num);
__bang_mul_const((float *)compute_a, (float *)compute_a, (float)gamma,
deal_num);
__bang_active_exphp((float *)compute_a, (float *)compute_a, deal_num);
__bang_float2half_rd((half *)compute_a, (float *)compute_a, deal_num);
} else {
__bang_active_loghp((T *)compute_a, (T *)compute_b, deal_num);
__bang_mul_const((T *)compute_a, (T *)compute_a, (T)gamma, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
}
__bang_mul((T *)output, (T *)compute_a, (T *)output, deal_num);
// 1. max = max(0, -x) if t == c_i
// = max(0, x) if t != c_i
__nramset((T *)compute_b, deal_num, (T)0);
__bang_maxequal((T *)compute_b, (T *)compute_b, (T *)input, deal_num);
// 2. -log(p_t) = ln(e^(-max)+ e^(-max-x) + max if t == c_i
// = ln(e^(-max)+ e^(-max+x) + max if t != c_i
__bang_mul_const((T *)compute_a, (T *)compute_b, (T)-1.0, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)input, deal_num);
__bang_active_exphp((T *)compute_a, (T *)compute_a, deal_num);
__bang_active_exphp((T *)input, (T *)input, deal_num);
__bang_add((T *)compute_a, (T *)compute_a, (T *)input, deal_num);
__bang_active_loghp((T *)compute_a, (T *)compute_a, deal_num);
__bang_add((T *)input, (T *)compute_a, (T *)compute_b, deal_num);
// 3. output = alpha_t * p_t^r * [-log(p_t)]
__bang_mul((T *)output, (T *)output, (T *)input, deal_num);
// 4. with weight
if (has_weight) {
for (int32_t i = 0; i < n_seg; ++i) {
int32_t t = *((int32_t *)target + i);
if (t >= 0 && t < c) {
t = partition_nc ? 0 : t;
__bang_mul_const((T *)output + i * c_num, (T *)output + i * c_num,
*((T *)weight + t), c_num);
}
}
}
}
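// Scalar reference for the vectorized math above (a clarifying sketch, not
// part of the kernel): with p = sigmoid(x) for logit x of channel c_i and
// target class t,
//   p_t     = (t == c_i) ? p : 1 - p
//   alpha_t = (t == c_i) ? alpha : 1 - alpha
//   loss    = alpha_t * (1 - p_t)^gamma * (-log(p_t))
// where -log(p_t) is evaluated in the log-sum-exp form of steps 1-2,
//   max + log(e^(-max) + e^(-max -/+ x)) with max = max(0, -/+ x),
// to stay numerically stable for large |x|.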
template <typename T>
__mlu_func__ void startPipeline(
const T *input, const int32_t *target, const T *weight,
char *nram_compute_a, char *nram_compute_b, char *nram_input,
char *nram_target, char *nram_weight, char *nram_output,
const int32_t has_weight, const int32_t partition_nc,
const int32_t pingpong_offset, const int32_t pingpong_weight_offset,
const int32_t c_offset_num, const int32_t n, const int32_t n_seg,
const int32_t c, const int32_t c_seg, const float alpha, const float gamma,
T *output) {
// with offset
input = (T *)((char *)input + c_offset_num * sizeof(T));
output = (T *)((char *)output + c_offset_num * sizeof(T));
const int32_t c_seg_align_num = PAD_UP(c_seg, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_num = has_weight ? c_seg_align_num : c_seg;
const int32_t deal_num = PAD_UP(n_seg * c_num, NFU_ALIGN_SIZE / sizeof(T));
const int32_t load_size = c_seg * sizeof(T);
const int32_t dram_stride = c * sizeof(T);
const int32_t nram_stride = c_num * sizeof(T);
if (has_weight && !partition_nc) {
loadInput<T>(nram_weight, (T *)weight, load_size, nram_stride, dram_stride,
1);
__asm__ volatile("sync;\n\t");
}
const int32_t repeat = n / n_seg;
const int32_t remain = n % n_seg;
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
* The allocated memory space of NRAM is divided into two parts:
 * PING and PONG. In a single time slice, PING is used to process
 * the IO stream while PONG is used for computation; the two halves are
 * swapped and synchronized at the end of every slice.
*
* diagram of PINGPONG:
* |------|-----------------------------------------------------------------|
* | | space |
* |------|-----------------------------------------------------------------|
* | time | Ping | Pong | Ping | Pong | Ping | Pong |
* |------|-----------------------------------------------------------------|
* | 0 | L0 | | | | | |
* | 1 | C0 | L1 | | | | |
* | 2 | S0 | C1 | L2 | | | |
* | 3 | | S1 | C2 | L3 | | |
* | 4 | | | S2 | C3 | L4 | |
* | 5 | | | | S3 | C4 | L5 |
* | 6 | | | | | S4 | C5 |
* | 7 | | | | | | S5 |
* |------|-----------------------------------------------------------------|
*/
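  // In the steady-state loop below, iteration i stores S(i) and loads
  // L(i+2) through buffer half (i % 2) * pingpong_offset while C(i+1) runs
  // on the other half, so IO and compute never touch the same half within
  // a slice.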
// diagram of PINGPONG: L0
if (repeat > 0) {
loadInput<T>(nram_input, (T *)input, load_size, nram_stride, dram_stride,
n_seg);
loadInput<int32_t>(nram_target, (int32_t *)target, n_seg * sizeof(int32_t));
loadWeight<T>(nram_weight, (T *)weight, *((int32_t *)target), c, has_weight,
partition_nc);
__asm__ volatile("sync;\n\t");
}
// diagram of PINGPONG: C0 and L1
if (repeat > 1) {
compute((T *)nram_input, (int32_t *)nram_target, (T *)nram_weight,
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)nram_output);
loadInput<T>((char *)nram_input + pingpong_offset, (T *)input + c * n_seg,
load_size, nram_stride, dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + pingpong_offset,
(int32_t *)target + n_seg, n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + pingpong_weight_offset, (T *)weight,
*((int32_t *)target + n_seg), c, has_weight, partition_nc);
__asm__ volatile("sync;\n\t");
}
for (int32_t i = 0; i < repeat - 2; ++i) {
storeOutput<T>((T *)output + i * c * n_seg,
nram_output + (i % 2) * pingpong_offset, load_size,
dram_stride, nram_stride, n_seg);
loadInput<T>((char *)nram_input + (i % 2) * pingpong_offset,
(T *)(input) + (i + 2) * c * n_seg, load_size, nram_stride,
dram_stride, n_seg);
loadInput<int32_t>((char *)nram_target + (i % 2) * pingpong_offset,
(int32_t *)target + (i + 2) * n_seg,
n_seg * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (i % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + (i + 2) * n_seg), c,
has_weight, partition_nc);
compute((T *)(nram_input + ((i + 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((i + 1) % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * ((i + 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((i + 1) % 2) * pingpong_offset));
__asm__ volatile("sync;\n\t");
}
if (repeat > 1) {
storeOutput<T>((T *)output + (repeat - 2) * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
loadInput<T>((char *)nram_input + (repeat % 2) * pingpong_offset,
(T *)input + repeat * c * n_seg, load_size, nram_stride,
dram_stride, remain);
loadInput<int32_t>((char *)nram_target + (repeat % 2) * pingpong_offset,
(int32_t *)target + repeat * n_seg,
remain * sizeof(int32_t));
loadWeight<T>((char *)nram_weight + (repeat % 2) * pingpong_weight_offset,
(T *)weight, *((int32_t *)target + repeat * n_seg), c,
has_weight, partition_nc);
}
if (repeat > 0) {
compute((T *)(nram_input + ((repeat - 1) % 2) * pingpong_offset),
(int32_t *)(nram_target + ((repeat - 1) % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * ((repeat - 1) % 2) * pingpong_weight_offset),
has_weight, partition_nc, deal_num, n_seg, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + ((repeat - 1) % 2) * pingpong_offset));
}
__asm__ volatile("sync;\n\t");
if (repeat > 0) {
storeOutput<T>((T *)output + (repeat - 1) * c * n_seg,
(char *)nram_output + ((repeat - 1) % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, n_seg);
}
if (remain > 0) {
int32_t rem_num = PAD_UP(remain * c_num, NFU_ALIGN_SIZE / sizeof(T));
compute((T *)(nram_input + (repeat % 2) * pingpong_offset),
(int32_t *)(nram_target + (repeat % 2) * pingpong_offset),
(T *)(nram_weight +
partition_nc * (repeat % 2) * pingpong_weight_offset),
has_weight, partition_nc, rem_num, remain, c, c_seg, c_offset_num,
alpha, gamma, (T *)nram_compute_a, (T *)nram_compute_b,
(T *)(nram_output + (repeat % 2) * pingpong_offset));
__asm__ volatile("sync;\n\t");
storeOutput<T>((T *)output + repeat * c * n_seg,
(char *)nram_output + (repeat % 2) * pingpong_offset,
load_size, dram_stride, nram_stride, remain);
}
__asm__ volatile("sync;\n\t");
}
template <typename T>
__mlu_func__ void focalLossSigmoidForwardBlock(
const T *input, const int32_t *target, const T *weight, const int32_t n,
const int32_t c, const float alpha, const float gamma, T *output) {
/*
* NRAM partition
* |-----------------------------------------------------------------------|
* | weight |
* |------------------------------- COMPUTE -------------------------------|
* | | |
* | computeA | computeB |
* | | |
* |------------- PING ------------------------------- PONG ---------------|
* | | |
* | input | input |
* | | |
* |-----------------------------------|-----------------------------------|
* | | |
* | output | output |
* | | |
* |-----------------------------------|-----------------------------------|
* | target | target |
* |-----------------------------------|-----------------------------------|
*
* split_pipeline_num is 6: COMPUTE(computeA,computeB), PING(input,output),
* PONG(input,output).
* split_target_num is 2: PING(target), PONG(target).
* weight is not NULL:
* The nram-size of weight is equal to c_align_size when partition input-N.
* The nram-size of weight is equal to NFU_ALIGN_SIZE when partition
* input-NC.
*/
// calculate threshold of c
const int32_t split_pipeline_num = 6;
const int32_t split_target_num = 2;
const int32_t has_weight = weight != NULL;
const int32_t threshold_c =
PAD_DOWN((MAX_NRAM_SIZE - split_target_num * sizeof(int32_t)) /
(split_pipeline_num + has_weight),
NFU_ALIGN_SIZE) /
sizeof(T);
const int32_t c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
const int32_t c_align_size = c_align * sizeof(T);
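  // Illustrative sizing (assumptions for clarity: 384 KB of NRAM and
  // NFU_ALIGN_SIZE = 128, both platform-dependent): for float data with no
  // weight,
  //   threshold_c = PAD_DOWN((393216 - 2 * 4) / 6, 128) / 4 = 16352,
  // so rows with c <= 16352 take the partition-N branch below and larger
  // rows fall through to the partition-NC branch.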
if (c <= threshold_c) {
// partition inputN
int32_t c_num = c;
int32_t reservered_align_size =
(split_target_num + split_pipeline_num) * NFU_ALIGN_SIZE;
int32_t weight_size = 0;
if (has_weight) {
c_num = c_align;
reservered_align_size = split_target_num * NFU_ALIGN_SIZE;
weight_size = c_align_size;
}
const int32_t remain_size =
MAX_NRAM_SIZE - weight_size - reservered_align_size;
const int32_t n_seg =
remain_size / (split_pipeline_num * c_num * sizeof(T) +
split_target_num * sizeof(int32_t));
const int32_t split_pipeline_size =
PAD_UP(c_num * n_seg * sizeof(T), NFU_ALIGN_SIZE);
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + has_weight * c_align_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, 0, pingpong_offset, 0, 0, n, n_seg, c, c,
alpha, gamma, output);
} else {
// partition inputNC
const int32_t weight_size = has_weight * NFU_ALIGN_SIZE;
const int32_t remain_size = MAX_NRAM_SIZE - weight_size;
const int32_t split_pipeline_size = PAD_DOWN(
(remain_size - split_target_num * NFU_ALIGN_SIZE) / split_pipeline_num,
NFU_ALIGN_SIZE);
const int32_t c_seg = split_pipeline_size / sizeof(T);
const int32_t n_seg = 1;
const int32_t compute_size = 2 * split_pipeline_size;
const int32_t pingpong_offset = (MAX_NRAM_SIZE - weight_size - compute_size) / 2;
const int32_t pingpong_weight_offset = weight_size / 2;
char *nram_weight = (char *)nram_buffer;
char *nram_compute_a = nram_weight + weight_size;
char *nram_compute_b = nram_compute_a + split_pipeline_size;
char *nram_input = nram_compute_b + split_pipeline_size;
char *nram_output = nram_input + split_pipeline_size;
char *nram_target = nram_output + split_pipeline_size;
const int32_t loop_num = (c + c_seg - 1) / c_seg;
const int32_t partition_nc = 1;
for (int32_t i = 0; i < loop_num; ++i) {
const int32_t c_index = i * c_seg;
const int32_t c_seg_curr = i == (loop_num - 1) ? c - c_index : c_seg;
startPipeline<T>(input, target, weight, nram_compute_a, nram_compute_b,
nram_input, nram_target, nram_weight, nram_output,
has_weight, partition_nc, pingpong_offset,
pingpong_weight_offset, c_index, n, n_seg, c, c_seg_curr,
alpha, gamma, output);
}
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelFocalLossSigmoidForward(
const void *input, const void *target, const void *weight, const int32_t N,
const int32_t C, const float alpha, const float gamma, void *output) {
const int32_t n_seg = N / taskDim + (taskId == taskDim - 1) * (N % taskDim);
const T *input_offset = (T *)input + N / taskDim * taskId * C;
const int32_t *target_offset = (int32_t *)target + N / taskDim * taskId;
T *output_offset = (T *)output + N / taskDim * taskId * C;
focalLossSigmoidForwardBlock((T *)input_offset, (int32_t *)target_offset,
(T *)weight, n_seg, C, alpha, gamma,
(T *)output_offset);
}
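// Row split across tasks: every task takes N / taskDim rows and the last
// task also absorbs the remainder. E.g. (illustrative) N = 10, taskDim = 4
// gives per-task row counts {2, 2, 2, 4}.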
} // namespace forward
namespace backward {
template <typename T>
__mlu_func__ void loadInput(char *nram_input, char *nram_target,
                            const T *gdram_input, const int32_t *gdram_target,
                            const int32_t deal_n, const int32_t total_c,
                            const bool pingpong_flag, const bool has_weight,
                            const int32_t nram_offset,
                            const int32_t gdram_offset) {
  if (pingpong_flag == PONG) {
    nram_input += nram_offset;
    nram_target += nram_offset;
  }
__memcpy_async(nram_target, gdram_target + gdram_offset / total_c,
deal_n * sizeof(int32_t), GDRAM2NRAM);
char *nram_input_load = nram_input;
int32_t compute_align_size = 2 * NFU_ALIGN_SIZE;
if (has_weight) {
if (sizeof(T) == sizeof(half)) {
int32_t compute_align_num = compute_align_size / sizeof(float);
int32_t align_c = PAD_UP(total_c, compute_align_num);
int32_t compute_size = deal_n * align_c * sizeof(float);
nram_input_load += compute_size / 2;
}
int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T));
int32_t total_c_size = total_c * sizeof(T);
int32_t align_c_size = align_c * sizeof(T);
__memcpy_async(nram_input_load, gdram_input + gdram_offset, total_c_size,
GDRAM2NRAM, align_c_size, total_c_size, deal_n - 1);
} else {
if (sizeof(T) == sizeof(half)) {
int32_t compute_size =
PAD_UP(deal_n * total_c * sizeof(float), compute_align_size);
nram_input_load += compute_size / 2;
}
int32_t load_size = deal_n * total_c * sizeof(T);
__memcpy_async(nram_input_load, gdram_input + gdram_offset, load_size,
GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void sigmoid(T *dst_data, const T *src_data,
const int32_t elem_count) {
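  // Elementwise sigmoid(x) = 1 / (1 + e^(-x)), composed from the negate,
  // exp, add-one and reciprocal primitives below.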
__bang_mul_const(dst_data, (T *)src_data, T(-1), elem_count);
__bang_active_exphp(dst_data, dst_data, elem_count);
__bang_add_const(dst_data, dst_data, T(1), elem_count);
__bang_active_reciphp(dst_data, dst_data, elem_count);
}
template <typename T>
__mlu_func__ void coreCompute(char *nram_input, const T *nram_weight,
const float *nram_flt_min, char *nram_pt,
char *nram_alpha_t, char *nram_temp,
char *nram_target, const float *nram_gamma,
char *nram_output, const float alpha,
const int32_t compute_num, const int32_t deal_n,
const int32_t total_c, const bool pingpong_flag,
const int32_t nram_offset,
const bool has_weight) {
if (pingpong_flag == PONG) {
nram_input += nram_offset;
nram_pt += nram_offset;
nram_alpha_t += nram_offset;
nram_temp += nram_offset;
nram_output += nram_offset;
nram_target += nram_offset;
}
if (sizeof(T) == sizeof(half)) {
const int32_t compute_size = compute_num * sizeof(float);
char *nram_input_load = nram_input + compute_size / 2;
__bang_half2float((float *)nram_input, (half *)nram_input_load,
compute_num);
}
// 0. alpha_t = alpha - 1
__nramset((float *)nram_alpha_t, compute_num, (float)(alpha - 1.0));
// 1. pt = 1 - sigmoid(x)
sigmoid((float *)nram_pt, (float *)nram_input, compute_num);
__bang_mul_const((float *)nram_pt, (float *)nram_pt, (float)(-1),
compute_num);
__bang_add_const((float *)nram_pt, (float *)nram_pt, (float)1, compute_num);
// 2. pt = target[n] == c ? sigmoid(x) : 1 - sigmoid(x)
// alpha_t = target[n] == c ? alpha : alpha - 1
const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(float);
for (int n = 0; n < deal_n; n++) {
const int32_t target_value = ((int32_t *)nram_target)[n];
if (target_value >= total_c || target_value < 0) continue;
int32_t c_offset = 0;
if (has_weight) {
int32_t c_align_num = nfu_align_num;
if (sizeof(T) == sizeof(half)) {
c_align_num += nfu_align_num;
}
c_offset = PAD_UP(total_c, c_align_num);
} else {
c_offset = total_c;
}
int32_t idx = n * c_offset + target_value;
*((float *)nram_pt + idx) = 1.0 - *((float *)nram_pt + idx);
*((float *)nram_alpha_t + idx) = alpha;
}
// 3. temp = -alpha_t * e^(gamma * log(max(1 - pt, FLT_MIN))
__bang_mul_const((float *)nram_temp, (float *)nram_pt, (float)(-1),
compute_num);
__bang_add_const((float *)nram_temp, (float *)nram_temp, (float)(1),
compute_num);
__bang_cycle_maxequal((float *)nram_temp, (float *)nram_temp,
(float *)nram_flt_min, compute_num, nfu_align_num);
__bang_active_loghp((float *)nram_temp, (float *)nram_temp, compute_num);
__bang_cycle_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_gamma,
compute_num, nfu_align_num);
__bang_active_exphp((float *)nram_temp, (float *)nram_temp, compute_num);
__bang_mul((float *)nram_temp, (float *)nram_temp, (float *)nram_alpha_t,
compute_num);
__bang_mul_const((float *)nram_temp, (float *)nram_temp, (float)(-1),
compute_num);
// 4. output = 1 - pt - gamma * pt * log(max(pt, FLT_MIN))
__bang_cycle_maxequal((float *)nram_output, (float *)nram_pt,
(float *)nram_flt_min, compute_num, nfu_align_num);
__bang_active_loghp((float *)nram_output, (float *)nram_output, compute_num);
__bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_pt,
compute_num);
__bang_cycle_mul((float *)nram_output, (float *)nram_output,
(float *)nram_gamma, compute_num, nfu_align_num);
__bang_add((float *)nram_output, (float *)nram_output, (float *)nram_pt,
compute_num);
__bang_mul_const((float *)nram_output, (float *)nram_output, (float)(-1),
compute_num);
__bang_add_const((float *)nram_output, (float *)nram_output, (float)(1),
compute_num);
// 5. output = output * temp
__bang_mul((float *)nram_output, (float *)nram_output, (float *)nram_temp,
compute_num);
if (sizeof(T) == sizeof(half)) {
__bang_float2half_rd((half *)nram_output, (float *)nram_output,
compute_num);
}
if (has_weight) {
// with weight
for (int n = 0; n < deal_n; n++) {
int32_t c_align_num = nfu_align_num;
if (sizeof(T) == sizeof(half)) {
c_align_num += nfu_align_num;
}
int32_t align_c = PAD_UP(total_c, c_align_num);
int32_t target_value = ((int32_t *)nram_target)[n];
T weight_value = nram_weight[target_value];
__bang_mul_const((T *)nram_output + n * align_c,
(T *)nram_output + n * align_c, weight_value, align_c);
}
}
}
template <typename T>
__mlu_func__ void storeOutput(T *gdram_output, const char *nram_output,
const int32_t deal_n, const int32_t total_c,
const bool pingpong_flag, const bool has_weight,
const int32_t nram_offset,
const int32_t gdram_offset) {
if (pingpong_flag == PONG) {
nram_output += nram_offset;
}
const int32_t store_size = deal_n * total_c * sizeof(T);
if (has_weight) {
int32_t align_c = PAD_UP(total_c, NFU_ALIGN_SIZE / sizeof(T));
int32_t total_c_size = total_c * sizeof(T);
int32_t align_c_size = align_c * sizeof(T);
__memcpy_async(gdram_output + gdram_offset, nram_output, total_c_size,
NRAM2GDRAM, total_c_size, align_c_size, deal_n - 1);
} else {
__memcpy_async(gdram_output + gdram_offset, nram_output, store_size,
NRAM2GDRAM);
}
}
template <typename T>
__mlu_func__ void focalLossSigmoidBackwardBlock(
const T *input, const int32_t *target, const T *weight, const float gamma,
const float alpha, const int32_t total_n, const int32_t deal_n,
const int32_t total_c, T *output) {
// params per time slice
int32_t deal_num = deal_n * total_c;
int32_t deal_size = deal_num * sizeof(float);
int32_t compute_num = 0;
int32_t compute_size = 0;
int32_t compute_align_size = NFU_ALIGN_SIZE;
const int32_t nfu_align_num = NFU_ALIGN_SIZE / sizeof(T);
if (sizeof(T) == sizeof(half)) {
compute_align_size += NFU_ALIGN_SIZE;
}
const int32_t compute_align_num = compute_align_size / sizeof(float);
bool has_weight = false;
if (weight != NULL) {
has_weight = true;
int32_t align_c = PAD_UP(total_c, compute_align_num);
compute_num = deal_n * align_c;
compute_size = compute_num * sizeof(float);
} else {
compute_size = PAD_UP(deal_size, compute_align_size);
compute_num = compute_size / sizeof(float);
}
// params per core
int32_t total_num = total_n * total_c;
int32_t num_per_core = PAD_DOWN(total_num / taskDim, deal_num);
int32_t loop_per_core = num_per_core / deal_num;
/* NRAM partition:
*
* |-----------------ping pong--------------------|
* |input | pt | alpha_t | temp | output | target | flt_min | gamma | weight|
*
* split_pipeline_num is 5: input, pt, alpha_t, temp, output.
* nram_reserved_line_num is 2: flt_min, gamma.
*/
const int32_t split_pipeline_num = 5;
const int32_t nram_reserved_line_num = 2;
int32_t target_deal_size = deal_n * sizeof(int32_t);
int32_t target_deal_size_align = PAD_UP(target_deal_size, NFU_ALIGN_SIZE);
// nram PING/PONG offset
int32_t ping_pong_offset =
compute_size * split_pipeline_num + target_deal_size_align;
// gdram addr
int32_t *base_addr_target =
(int32_t *)target + taskId * loop_per_core * deal_n;
T *base_addr_input = (T *)input + taskId * num_per_core;
T *base_addr_output = output + taskId * num_per_core;
// nram addr
char *nram_input = (char *)nram_buffer;
char *nram_pt = nram_input + compute_size;
char *nram_alpha_t = nram_pt + compute_size;
char *nram_temp = nram_alpha_t + compute_size;
char *nram_output = nram_temp + compute_size;
char *nram_target = nram_output + compute_size;
float *nram_flt_min = NULL;
float *nram_gamma = NULL;
T *nram_weight = NULL;
if (!has_weight) {
nram_flt_min = (float *)(nram_buffer + MAX_NRAM_SIZE -
nram_reserved_line_num * NFU_ALIGN_SIZE);
nram_gamma = nram_flt_min + nfu_align_num;
} else {
int32_t weight_space = PAD_UP(total_c * sizeof(T), NFU_ALIGN_SIZE);
nram_flt_min =
(float *)(nram_buffer + MAX_NRAM_SIZE -
nram_reserved_line_num * NFU_ALIGN_SIZE - weight_space);
nram_gamma = nram_flt_min + nfu_align_num;
nram_weight = (T *)(nram_gamma + nfu_align_num);
__memcpy_async(nram_weight, weight, total_c * sizeof(T), GDRAM2NRAM);
}
// nram set gamma and FLT_MIN
__nramset(nram_gamma, nfu_align_num, gamma);
__nramset(nram_flt_min, nfu_align_num, FLT_MIN);
/*
* Pipeline: The pipeline is processed in three stages: Load, Compute, Store.
* The allocated memory space of NRAM is divided into two parts:
 * PING and PONG. In a single time slice, PING is used to process
 * the IO stream while PONG is used for computation; the two halves are
 * swapped and synchronized at the end of every slice.
*
* diagram of PINGPONG:
* |------|-----------------------------------------------------------------|
* | | space |
* |------|-----------------------------------------------------------------|
* | time | Ping | Pong | Ping | Pong | Ping | Pong |
* |------|-----------------------------------------------------------------|
* | 0 | L0 | | | | | |
* | 1 | C0 | L1 | | | | |
* | 2 | S0 | C1 | L2 | | | |
* | 3 | | S1 | C2 | L3 | | |
* | 4 | | | S2 | C3 | L4 | |
* | 5 | | | | S3 | C4 | L5 |
* | 6 | | | | | S4 | C5 |
* | 7 | | | | | | S5 |
* |------|-----------------------------------------------------------------|
*/
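  // Unlike the forward pass, the buffer half here is selected by an
  // explicit PING/PONG flag plus ping_pong_offset; the loop parity (i % 2)
  // decides which half is stored/loaded while the opposite half is
  // computed.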
// diagram of PINGPONG: L0
if (loop_per_core > 0) {
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PING, has_weight, ping_pong_offset, 0);
__asm__ volatile("sync;");
}
// diagram of PINGPONG: C0 and L1
if (loop_per_core > 1) {
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PONG, has_weight, ping_pong_offset, deal_num);
__asm__ volatile("sync;");
}
for (int i = 0; i < loop_per_core - 2; ++i) {
if (i % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, i * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PONG, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PING, has_weight, ping_pong_offset,
(i + 2) * deal_num);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, i * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
deal_n, total_c, PONG, has_weight, ping_pong_offset,
(i + 2) * deal_num);
}
__asm__ volatile("sync;");
}
if (loop_per_core > 1) {
if ((loop_per_core - 2) % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PONG, ping_pong_offset,
has_weight);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, (loop_per_core - 2) * deal_num);
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
}
__asm__ volatile("sync;");
}
if (loop_per_core > 0) {
if (loop_per_core == 1) {
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
compute_num, deal_n, total_c, PING, ping_pong_offset,
has_weight);
__asm__ volatile("sync;");
}
if ((loop_per_core - 1) % 2 == PING) {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PING,
has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num);
} else {
storeOutput(base_addr_output, nram_output, deal_n, total_c, PONG,
has_weight, ping_pong_offset, (loop_per_core - 1) * deal_num);
}
}
// process the remaining rows whose per-core count is less than deal_n
int32_t rem_for_all = total_num - num_per_core * taskDim;
if (rem_for_all == 0) return;
int32_t rem_n_for_all = rem_for_all / total_c;
int32_t rem_n_per_core = (rem_n_for_all + taskDim - 1) / taskDim;
int32_t rem_num_per_core = rem_n_per_core * total_c;
int32_t rem_num_per_core_align = 0;
int32_t rem_core_num = rem_for_all / rem_num_per_core;
int32_t rem_n_for_last = rem_n_for_all % rem_n_per_core;
int32_t rem_num_for_last = rem_n_for_last * total_c;
int32_t rem_num_for_last_align = 0;
if (has_weight) {
int32_t align_c = PAD_UP(total_c, compute_align_num);
rem_num_per_core_align = rem_n_per_core * align_c;
rem_num_for_last_align = rem_n_for_last * align_c;
} else {
rem_num_per_core_align = PAD_UP(rem_num_per_core, compute_align_num);
rem_num_for_last_align = PAD_UP(rem_num_for_last, compute_align_num);
}
int32_t rem_addr_base = num_per_core * taskDim;
int32_t rem_target_addr_base = loop_per_core * deal_n * taskDim;
base_addr_target = (int32_t *)target + rem_target_addr_base;
base_addr_input = (T *)input + rem_addr_base;
base_addr_output = output + rem_addr_base;
if (taskId < rem_core_num) {
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
rem_n_per_core, total_c, PING, has_weight, ping_pong_offset,
taskId * rem_num_per_core);
__asm__ volatile("sync;");
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
rem_num_per_core_align, rem_n_per_core, total_c, PING,
ping_pong_offset, has_weight);
__asm__ volatile("sync;");
storeOutput(base_addr_output, nram_output, rem_n_per_core, total_c, PING,
has_weight, ping_pong_offset, taskId * rem_num_per_core);
} else if (taskId == rem_core_num) {
if (rem_num_for_last == 0) return;
loadInput(nram_input, nram_target, base_addr_input, base_addr_target,
rem_n_for_last, total_c, PING, has_weight, ping_pong_offset,
taskId * rem_num_per_core);
__asm__ volatile("sync;");
coreCompute(nram_input, nram_weight, nram_flt_min, nram_pt, nram_alpha_t,
nram_temp, nram_target, nram_gamma, nram_output, alpha,
rem_num_for_last_align, rem_n_for_last, total_c, PING,
ping_pong_offset, has_weight);
__asm__ volatile("sync;");
storeOutput(base_addr_output, nram_output, rem_n_for_last, total_c, PING,
has_weight, ping_pong_offset, taskId * rem_num_per_core);
} else {
return;
}
}
template <typename T>
__mlu_global__ void MLUUnion1KernelFocalLossSigmoidBackward(
const void *input, const void *target, const void *weight,
const float gamma, const float alpha, const int32_t total_n,
const int32_t deal_n, const int32_t total_c, void *output) {
focalLossSigmoidBackwardBlock((T *)input, (int32_t *)target, (T *)weight,
gamma, alpha, total_n, deal_n, total_c,
(T *)output);
}
} // namespace backward
void KernelFocalLossSigmoidForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const int32_t N,
const int32_t C, const float alpha,
const float gamma, void *output) {
if (d_type == CNRT_FLOAT16) {
forward::MLUUnion1KernelFocalLossSigmoidForward<
half><<<k_dim, k_type, queue>>>(input, target, weight, N, C, alpha,
gamma, output);
} else {
forward::MLUUnion1KernelFocalLossSigmoidForward<
float><<<k_dim, k_type, queue>>>(input, target, weight, N, C, alpha,
gamma, output);
}
}
void KernelFocalLossSigmoidBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue,
const cnrtDataType_t d_type,
const void *input, const void *target,
const void *weight, const float gamma,
const float alpha, const int32_t dim_n,
const int32_t deal_n, const int32_t dim_c,
void *output) {
if (d_type == CNRT_FLOAT16) {
backward::MLUUnion1KernelFocalLossSigmoidBackward<
half><<<k_dim, k_type, queue>>>(input, target, weight, gamma, alpha,
dim_n, deal_n, dim_c, output);
} else {
backward::MLUUnion1KernelFocalLossSigmoidBackward<
float><<<k_dim, k_type, queue>>>(input, target, weight, gamma, alpha,
dim_n, deal_n, dim_c, output);
}
}
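// --- Usage sketch (not from the original file; illustrative values, with
// --- `queue` and device pointers assumed to be prepared by the caller):
//   cnrtDim3_t dim = {4, 1, 1};
//   KernelFocalLossSigmoidForward(dim, CNRT_FUNC_TYPE_UNION1, queue,
//                                 CNRT_FLOAT32, input, target,
//                                 /*weight=*/NULL, N, C, /*alpha=*/0.25f,
//                                 /*gamma=*/2.0f, output);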
mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "iou3d_utils.hpp"
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
/* NRAM buffer
* Suppose we deal with N boxes at a time.
----------------------------------------------------------------
| Basic |score (1N)+ |intersect_pts(48N)| |
| |valid_box(1N) |+ ordered_pts(48N)| temp_long(72N) |
| |+ temp_buffer(10N)| | |
|--------------------------|------------------|----------------|
| Reuse | null | null |rotated_pts(16N)|
|-------|------------------|------------------|----------------|
---------------------------------------------------------------------------
| Basic | dist_ram(24N) | valid_pts(24N) |box1(5N) |box1_buffer(5KB) |
| | |+ nums_in_ram(1N)|+ box2(5N)|+nram_save(5KB) |
|--------------------------|-----------------|----------|-----------------|
| Reuse | vec_buffer(5N) | null | null | null |
|-------|------------------|-----------------|----------|-----------------|
Total Basic Memory Size = 239N * sizeof(float) + 10KB
*/
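// Cross-check of the 239N figure from the layout above: score(1) +
// valid_box(1) + temp_buffer(10) + intersect_pts(48) + ordered_pts(48) +
// temp_long(72) + dist_ram(24) + valid_pts(24) + nums_in_ram(1) + box1(5) +
// box2(5) = 239 floats per box; the extra 10KB covers the fixed
// box1_buffer and nram_save areas.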
__nram__ char nram_buffer[MAX_NRAM_SIZE];
__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF];
template <typename T>
__mlu_func__ void iou3D_detection(int32_t &result_box_num, int32_t *output_data,
const T *boxes_data, float *scores_data,
const int core_limit, const int input_box_num,
const float iou_threshold,
mluMemcpyDirection_t scores_load_dir,
mluMemcpyDirection_t scores_store_dir,
mluMemcpyDirection_t boxes_load_dir) {
  // NRAM is divided into per-box slots, counted in bytes (see copies_of_nram below)
const int nram_save_limit_count = 256;
int box_read_limit_count = 256;
float div_thresh_iou = 1.0 / iou_threshold;
  // every box requires 239 * sizeof(float) bytes in nram;
const int32_t copies_of_nram = 239 * sizeof(float);
const int32_t limit = (MAX_NRAM_SIZE - 5 * box_read_limit_count * sizeof(T) -
nram_save_limit_count * sizeof(int32_t)) /
copies_of_nram;
// x,y,z,dx,dy,dz,angle
const T *input_x_ptr = boxes_data;
const T *input_y_ptr = input_x_ptr + input_box_num;
const T *input_dx_ptr = input_y_ptr + 2 * input_box_num;
const T *input_dy_ptr = input_dx_ptr + input_box_num;
const T *input_angle_ptr = input_dy_ptr + 2 * input_box_num;
float *input_score_ptr = scores_data;
// data split
int avg_cluster = 0;
int rem_cluster = 0;
int len_cluster = 0;
int cluster_offset = 0;
if (clusterDim > 0) {
// union
avg_cluster = input_box_num / clusterDim;
rem_cluster = input_box_num % clusterDim;
len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
} else {
// block
len_cluster = input_box_num;
cluster_offset = 0;
}
int len_core = input_box_num;
int input_offset = 0;
if (core_limit > 1) {
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
len_core = avg_core + (coreId < rem_core ? 1 : 0);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
input_offset = cluster_offset + core_offset;
}
int32_t max_seg_pad = IOU3D_DOWN(limit, IOU3D_SIZE);
int repeat_iou_compute = len_core / max_seg_pad;
int remain_iou_compute = len_core % max_seg_pad;
// basic consistent memory layout
void *score = ((char *)nram_buffer);
void *valid_box = ((char *)score) + 1 * max_seg_pad * sizeof(float);
void *temp_buffer = ((char *)valid_box) + 1 * max_seg_pad * sizeof(float);
void *intersect_pts_x =
((char *)temp_buffer) + 10 * max_seg_pad * sizeof(float);
void *intersect_pts_y =
((char *)intersect_pts_x) + 24 * max_seg_pad * sizeof(float);
void *ordered_pts_x =
((char *)intersect_pts_y) + 24 * max_seg_pad * sizeof(float);
void *ordered_pts_y =
((char *)ordered_pts_x) + 24 * max_seg_pad * sizeof(float);
void *temp_long_1 =
((char *)ordered_pts_y) + 24 * max_seg_pad * sizeof(float);
void *temp_long_2 = ((char *)temp_long_1) + 24 * max_seg_pad * sizeof(float);
void *temp_long_3 = ((char *)temp_long_2) + 24 * max_seg_pad * sizeof(float);
void *dist_ram = ((char *)temp_long_3) + 24 * max_seg_pad * sizeof(float);
void *valid_pts = ((char *)dist_ram) + 24 * max_seg_pad * sizeof(float);
void *nums_in_ram = ((char *)valid_pts) + 24 * max_seg_pad * sizeof(float);
T *box1 = (T *)(((char *)nums_in_ram) + 1 * max_seg_pad * sizeof(float));
T *box2 = (T *)(((char *)box1) + 5 * max_seg_pad * sizeof(float));
void *box1_buffer = ((char *)box2) + 5 * max_seg_pad * sizeof(float);
int32_t *nram_save =
(int32_t *)(((char *)box1_buffer) + 5 * box_read_limit_count * sizeof(T));
// nram_save ~ nram_save_limit_count * sizeof(int32_t)
int nram_save_count = 0;
// reuse memory
void *rotated_pts1_x = ((char *)dist_ram);
void *rotated_pts1_y =
((char *)rotated_pts1_x) + 4 * max_seg_pad * sizeof(float);
void *rotated_pts2_x =
((char *)rotated_pts1_y) + 4 * max_seg_pad * sizeof(float);
void *rotated_pts2_y =
((char *)rotated_pts2_x) + 4 * max_seg_pad * sizeof(float);
void *vec_buffer = ((char *)temp_long_1) + 5 * max_seg_pad * sizeof(float);
// vec_buffer ~ 16 * max_seg_pad * sizeof(float)
  // First, initialize the ram with all 0; otherwise nan/inf may cause unexpected results
__bang_write_zero((unsigned char *)nram_buffer, copies_of_nram * max_seg_pad);
  // the shift by 8 and mask 0xff rely on box_read_limit_count being initialized to 256
const int max_box_seg_id = (input_box_num - 1) >> 8;
const int last_rem_box_number = ((input_box_num - 1) & 0xff) + 1;
for (int32_t cur_box = 0; cur_box < input_box_num; ++cur_box) {
__sync_all();
int box_seg_id = cur_box >> 8, box_id = cur_box & 0xff;
box_read_limit_count = box_seg_id == max_box_seg_id ? last_rem_box_number
: box_read_limit_count;
if (box_id == 0) {
// x,y,z,dx,dy,dz,angle
int offset_num = box_seg_id << 8;
// x
__memcpy((char *)box1_buffer, input_x_ptr + offset_num,
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// y
__memcpy((char *)box1_buffer + box_read_limit_count * 1 * sizeof(T),
input_y_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// dx
__memcpy((char *)box1_buffer + box_read_limit_count * 2 * sizeof(T),
input_dx_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// dy
__memcpy((char *)box1_buffer + box_read_limit_count * 3 * sizeof(T),
input_dy_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
// angle
__memcpy((char *)box1_buffer + box_read_limit_count * 4 * sizeof(T),
input_angle_ptr + offset_num,
box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
box_read_limit_count * 1 * sizeof(T),
box_read_limit_count * 1 * sizeof(T), 0);
}
if (((float *)input_score_ptr)[cur_box] == 0) {
continue;
}
// save result
nram_save[nram_save_count] = cur_box;
result_box_num++;
nram_save_count++;
if (clusterId == 0 && coreId == 0 &&
nram_save_count == nram_save_limit_count) {
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
NRAM2GDRAM);
pvUnlock();
output_data += nram_save_count;
nram_save_count = 0;
}
// prepare box1
// x
__bang_write_value((float *)box1, max_seg_pad,
float(((T *)box1_buffer)[box_id]));
// y
__bang_write_value(
(float *)box1 + max_seg_pad, max_seg_pad,
float(((T *)box1_buffer)[box_id + 1 * box_read_limit_count]));
// dx
__bang_write_value(
(float *)box1 + max_seg_pad * 2, max_seg_pad,
float(((T *)box1_buffer)[box_id + 2 * box_read_limit_count]));
// dy
__bang_write_value(
(float *)box1 + max_seg_pad * 3, max_seg_pad,
float(((T *)box1_buffer)[box_id + 3 * box_read_limit_count]));
// angle
__bang_write_value(
(float *)box1 + max_seg_pad * 4, max_seg_pad,
float(((T *)box1_buffer)[box_id + 4 * box_read_limit_count]));
float max_area = 1.0f *
((T *)box1_buffer)[box_id + 2 * box_read_limit_count] *
((T *)box1_buffer)[box_id + 3 * box_read_limit_count];
// update score
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = max_seg_pad;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_pad;
// int half_offset = std::is_same<T, half>::value ? max_seg_pad * 5 : 0;
int half_offset = (sizeof(T) == sizeof(half)) ? max_seg_pad * 5 : 0;
// score
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(float), scores_load_dir,
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
// x
__memcpy(box2 + half_offset, input_x_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// y
__memcpy(box2 + half_offset + seg_len * 1,
input_y_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// dx
__memcpy(box2 + half_offset + seg_len * 2,
input_dx_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// dy
__memcpy(box2 + half_offset + seg_len * 3,
input_dy_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// angle
__memcpy(box2 + half_offset + seg_len * 4,
input_angle_ptr + input_offset + i * max_seg_pad,
cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
cpy_len * 1 * sizeof(T), 0);
// if (std::is_same<T, half>::value) {
if (sizeof(T) == sizeof(half)) {
__bang_half2float((float *)box2, (half *)(box2 + half_offset),
seg_len * 5);
}
// Calculate rotated vertices
void *temp1_ram = ((char *)temp_buffer);
void *temp2_ram = ((char *)temp_buffer) + seg_len * sizeof(float);
void *temp3_ram = ((char *)temp_buffer) + 2 * seg_len * sizeof(float);
void *temp4_ram = ((char *)temp_buffer) + 3 * seg_len * sizeof(float);
getRotatedVertices((float *)rotated_pts1_x, (float *)rotated_pts1_y,
(float *)box1, (float *)temp1_ram, (float *)temp2_ram,
(float *)temp3_ram, (float *)temp4_ram, seg_len);
getRotatedVertices((float *)rotated_pts2_x, (float *)rotated_pts2_y,
(float *)box2, (float *)temp1_ram, (float *)temp2_ram,
(float *)temp3_ram, (float *)temp4_ram, seg_len);
__bang_write_zero((float *)valid_pts, 24 * seg_len);
__bang_write_zero((float *)nums_in_ram, seg_len);
__bang_write_value(((float *)valid_box), seg_len, 1.0f);
void *vec1_x = ((char *)vec_buffer);
void *vec1_y = ((char *)vec1_x) + 4 * seg_len * sizeof(float);
void *vec2_x = ((char *)vec1_y) + 4 * seg_len * sizeof(float);
void *vec2_y = ((char *)vec2_x) + 4 * seg_len * sizeof(float);
void *temp5_ram = ((char *)temp_buffer) + 4 * seg_len * sizeof(float);
void *temp6_ram = ((char *)temp_buffer) + 5 * seg_len * sizeof(float);
void *temp7_ram = ((char *)temp_buffer) + 6 * seg_len * sizeof(float);
void *temp8_ram = ((char *)temp_buffer) + 7 * seg_len * sizeof(float);
void *temp9_ram = ((char *)temp_buffer) + 8 * seg_len * sizeof(float);
void *temp10_ram = ((char *)temp_buffer) + 9 * seg_len * sizeof(float);
// Get all intersection points
getIntersectPts(
(float *)rotated_pts1_x, (float *)rotated_pts1_y,
(float *)rotated_pts2_x, (float *)rotated_pts2_y, (float *)vec1_x,
(float *)vec1_y, (float *)vec2_x, (float *)vec2_y,
(float *)intersect_pts_x, (float *)intersect_pts_y,
(float *)valid_pts, (float *)nums_in_ram, (float *)temp1_ram,
(float *)temp2_ram, (float *)temp3_ram, (float *)temp4_ram,
(float *)temp5_ram, (float *)temp6_ram, (float *)temp7_ram,
(float *)temp8_ram, (float *)temp9_ram, (float *)temp10_ram, seg_len);
// Where nums_in <= 2, set valid_box to false
__bang_write_value((float *)temp9_ram, COMPUTE_COUNT_ALIGN, (float)2);
__bang_cycle_gt((float *)temp1_ram, (float *)nums_in_ram,
(float *)temp9_ram, seg_len, COMPUTE_COUNT_ALIGN);
__bang_and((float *)valid_box, (float *)valid_box, (float *)temp1_ram,
seg_len);
__bang_cycle_and((float *)valid_pts, (float *)valid_pts,
(float *)valid_box, 24 * seg_len, seg_len);
// Convex-hull-graham to order the intersection points in clockwise order
// and find the contour area
convexHullGraham(
(float *)intersect_pts_x, (float *)intersect_pts_y,
(float *)ordered_pts_x, (float *)ordered_pts_y, (float *)dist_ram,
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
(float *)temp_long_1, (float *)temp_long_2, (float *)temp_long_3,
seg_len, seg_len);
// Calculate polygon area
// set temp1 = intersection part area
polygonArea((float *)ordered_pts_x, (float *)ordered_pts_y,
(float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
(float *)temp1_ram, (float *)temp2_ram, (float *)temp3_ram,
(float *)temp4_ram, (float *)temp5_ram, (float *)temp6_ram,
(float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
seg_len);
// area
__bang_mul((float *)temp2_ram, (float *)box2 + seg_len * 2,
(float *)box2 + seg_len * 3, seg_len);
// get the area_U: area + max_area - area_I
__bang_add_scalar((float *)temp2_ram, (float *)temp2_ram, float(max_area),
seg_len);
__bang_sub((float *)temp2_ram, (float *)temp2_ram, (float *)temp1_ram,
seg_len); // area_U
if (iou_threshold > 0.0) {
__bang_mul_scalar((float *)temp1_ram, (float *)temp1_ram,
div_thresh_iou, seg_len);
} else {
__bang_mul_scalar((float *)temp2_ram, (float *)temp2_ram, iou_threshold,
seg_len);
}
__bang_ge((float *)temp1_ram, (float *)temp2_ram, (float *)temp1_ram,
seg_len);
__bang_mul((float *)score, (float *)score, (float *)temp1_ram, seg_len);
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_pad, score,
cpy_len * sizeof(float), scores_store_dir,
cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
pvUnlock();
}
}
if (clusterId == 0 && coreId == 0 && nram_save_count) {
pvLock();
__memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
NRAM2GDRAM);
pvUnlock();
}
}
__mlu_global__ void MLUBlockorUnionIKernelOU3D(
const void *input_boxes, const int input_box_num, const float iou_threshold,
const cnrtDataType_t data_type_input, void *workspace, void *result_num,
void *output) {
int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
mluMemcpyDirection_t scores_load_dir = GDRAM2NRAM;
mluMemcpyDirection_t scores_store_dir = NRAM2GDRAM;
mluMemcpyDirection_t boxes_load_dir = GDRAM2NRAM;
float *scores_data = (float *)workspace;
float *boxes_data = (float *)input_boxes;
const int cluster_score_size = input_box_num * sizeof(float);
const int cluster_boxes_size = input_box_num * 7 * input_dwidth;
char *sram_score = (char *)sram_buffer;
char *sram_boxes = (char *)sram_buffer + cluster_score_size;
if (clusterDim == 1 && SIZE_SRAM_BUF > cluster_score_size) {
scores_data = (float *)sram_score;
scores_load_dir = SRAM2NRAM;
scores_store_dir = NRAM2SRAM;
if (coreId == 0x80) {
__sramset((void *)sram_buffer, input_box_num, 1.0f);
}
} else {
if (coreId == 0) {
__gdramset(scores_data, input_box_num, 1.0f);
}
}
if (clusterDim == 1 &&
SIZE_SRAM_BUF - cluster_score_size >= cluster_boxes_size) {
boxes_load_dir = SRAM2NRAM;
boxes_data = (float *)sram_boxes;
if (coreId == 0x80) {
__memcpy((char *)boxes_data, (char *)input_boxes, cluster_boxes_size,
GDRAM2SRAM);
}
}
__sync_cluster();
int32_t result_box_num = 0;
int32_t *out_data = (int32_t *)output;
switch (data_type_input) {
default: { return; }
case CNRT_FLOAT16: {
iou3D_detection(result_box_num, out_data, (half *)boxes_data, scores_data,
taskDim, input_box_num, iou_threshold, scores_load_dir,
scores_store_dir, boxes_load_dir);
}; break;
case CNRT_FLOAT32: {
iou3D_detection(result_box_num, out_data, boxes_data, scores_data,
taskDim, input_box_num, iou_threshold, scores_load_dir,
scores_store_dir, boxes_load_dir);
}; break;
}
((int32_t *)result_num)[0] = result_box_num;
}
void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t data_type_input, const void *boxes_dram,
const int input_box_num, const float iou_threshold,
void *workspace, void *output_size, void *output) {
switch (k_type) {
default: { return; }
case CNRT_FUNC_TYPE_BLOCK:
case CNRT_FUNC_TYPE_UNION1:
case CNRT_FUNC_TYPE_UNION2:
case CNRT_FUNC_TYPE_UNION4:
case CNRT_FUNC_TYPE_UNION8:
case CNRT_FUNC_TYPE_UNION16: {
MLUBlockorUnionIKernelOU3D<<<k_dim, k_type, queue>>>(
(void *)boxes_dram, input_box_num, iou_threshold, data_type_input,
workspace, output_size, output);
}; break;
}
}
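// --- Usage sketch (not from the original file; illustrative values):
// --- `workspace` must hold at least input_box_num floats, since the kernel
// --- rescores boxes in place there; kept indices land in `output` and
// --- their count in `output_size`.
//   cnrtDim3_t dim = {4, 1, 1};
//   KernelIou3d(dim, CNRT_FUNC_TYPE_UNION1, queue, CNRT_FLOAT32, boxes_dram,
//               input_box_num, /*iou_threshold=*/0.5f, workspace,
//               output_size, output);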
mmcv/ops/csrc/common/mlu/iou3d_utils.hpp
deleted
100644 → 0
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef IOU3D_UTILS_HPP_
#define IOU3D_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define IOU3D_SIZE 64
#define IOU3D_UP(x, y) (x / y + (int)(x % y > 0)) * y
#define IOU3D_DOWN(x, y) (x / y) * y
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
#define COMPUTE_COUNT_ALIGN 64
#define INFO_NUM (5) // score, x1, y1, x2, y2
#define REDUCE_NUM \
(7) // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
#define SINGLE_BOX_DIM 5
#define MEMORY_CORE (0x80)
__mlu_func__
void
pvLock
()
{
#if __BANG_ARCH__ == 270
if
(
coreId
!=
MEMORY_CORE
)
{
__bang_lock
(
0
,
0
);
}
#endif
}
__mlu_func__
void
pvUnlock
()
{
#if __BANG_ARCH__ == 270
if
(
coreId
!=
MEMORY_CORE
)
{
__bang_unlock
(
0
,
0
);
}
#endif
}
// cross2d<T>(A, B) = A.x * B.y - A.y * B.x;
template
<
typename
T
>
inline
__mlu_func__
void
cross2d
(
T
*
result
,
const
T
*
p1_x
,
const
T
*
p1_y
,
const
T
*
p2_x
,
const
T
*
p2_y
,
const
int
&
length
,
T
*
temp_ram
)
{
__bang_mul
((
T
*
)
temp_ram
,
(
T
*
)
p1_x
,
(
T
*
)
p2_y
,
length
);
__bang_mul
((
T
*
)
result
,
(
T
*
)
p1_y
,
(
T
*
)
p2_x
,
length
);
__bang_sub
((
T
*
)
result
,
(
T
*
)
temp_ram
,
(
T
*
)
result
,
length
);
}
// dot2d<T>(A, B) = A.x * B.x + A.y * B.y
template
<
typename
T
>
inline
__mlu_func__
void
dot2d
(
T
*
result
,
const
T
*
p1_x
,
const
T
*
p1_y
,
const
T
*
p2_x
,
const
T
*
p2_y
,
const
int
&
length
,
T
*
temp_ram
)
{
__bang_mul
((
T
*
)
temp_ram
,
(
T
*
)
p1_x
,
(
T
*
)
p2_x
,
length
);
__bang_mul
((
T
*
)
result
,
(
T
*
)
p1_y
,
(
T
*
)
p2_y
,
length
);
__bang_add
((
T
*
)
result
,
(
T
*
)
temp_ram
,
(
T
*
)
result
,
length
);
}
template <typename T>
__mlu_func__ void getRotatedVertices(T *pts_x, T *pts_y, T *box, T *temp1,
                                     T *temp2, T *temp3, T *temp4,
                                     const uint32_t &actual_compute_box_num) {
  // T cosTheta2 = (T)cos(theta) * 0.5f; -- temp1
  // T sinTheta2 = (T)sin(theta) * 0.5f; -- temp2
  // theta is the box's 5th data: a, rotated radian;
#if __BANG_ARCH__ >= 300
  __bang_cos((float *)temp1, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sin((float *)temp2, ((float *)box) + 4 * actual_compute_box_num,
             actual_compute_box_num);
#else
  __bang_taylor4_cos((T *)temp1, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
  __bang_taylor4_sin((T *)temp2, ((T *)box) + 4 * actual_compute_box_num,
                     (T *)temp3, (T *)temp4, actual_compute_box_num);
#endif
  __bang_mul_scalar((T *)temp1, (T *)temp1, (T)0.5, actual_compute_box_num);
  __bang_mul_scalar((T *)temp2, (T *)temp2, (T)0.5, actual_compute_box_num);

  // Temp3 = sinTheta2 * box.h;
  // Temp4 = cosTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp2, ((T *)box) + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp1, ((T *)box) + 2 * actual_compute_box_num,
             actual_compute_box_num);

  // pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
  // pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
  __bang_sub((T *)pts_x, (T *)box, (T *)temp3, actual_compute_box_num);
  __bang_sub((T *)pts_x, (T *)pts_x, (T *)temp4, actual_compute_box_num);
  __bang_add((T *)pts_x + 1 * actual_compute_box_num, (T *)box, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 1 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);

  // Temp3 = cosTheta2 * box.h;
  // Temp4 = sinTheta2 * box.w;
  __bang_mul((T *)temp3, (T *)temp1, box + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_mul((T *)temp4, (T *)temp2, box + 2 * actual_compute_box_num,
             actual_compute_box_num);

  // pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
  // pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
  __bang_add((T *)pts_y, (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y, (T *)pts_y, (T *)temp4, actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, (T *)temp3,
             actual_compute_box_num);
  __bang_sub((T *)pts_y + 1 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, (T *)temp4,
             actual_compute_box_num);

  // pts[2].x = 2 * box.x_ctr - pts[0].x;
  // pts[3].x = 2 * box.x_ctr - pts[1].x;
  __bang_add((T *)pts_x + 2 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 2 * actual_compute_box_num,
             (T *)pts_x + 2 * actual_compute_box_num, (T *)pts_x,
             actual_compute_box_num);
  __bang_add((T *)pts_x + 3 * actual_compute_box_num, (T *)box, (T *)box,
             actual_compute_box_num);
  __bang_sub((T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 3 * actual_compute_box_num,
             (T *)pts_x + 1 * actual_compute_box_num, actual_compute_box_num);

  // pts[2].y = 2 * box.y_ctr - pts[0].y;
  // pts[3].y = 2 * box.y_ctr - pts[1].y;
  __bang_add((T *)pts_y + 2 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 2 * actual_compute_box_num,
             (T *)pts_y + 2 * actual_compute_box_num, (T *)pts_y,
             actual_compute_box_num);
  __bang_add((T *)pts_y + 3 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num,
             (T *)box + 1 * actual_compute_box_num, actual_compute_box_num);
  __bang_sub((T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 3 * actual_compute_box_num,
             (T *)pts_y + 1 * actual_compute_box_num, actual_compute_box_num);
}
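getRotatedVertices evaluates the four corners of every (x_ctr, y_ctr, w, h, a) box with vectorized BANG ops. A scalar sketch of the same corner formulas (illustrative reference only, assuming the same five-value box layout) is:

#include <cmath>

// Scalar sketch of the corner math in getRotatedVertices above.
// box = {x_ctr, y_ctr, w, h, a}; the corners come out in the same
// pts[0..3] order as the vectorized code.
void rotated_vertices_ref(const float box[5], float pts_x[4], float pts_y[4]) {
  const float cosTheta2 = std::cos(box[4]) * 0.5f;
  const float sinTheta2 = std::sin(box[4]) * 0.5f;
  pts_x[0] = box[0] - sinTheta2 * box[3] - cosTheta2 * box[2];
  pts_y[0] = box[1] + cosTheta2 * box[3] - sinTheta2 * box[2];
  pts_x[1] = box[0] + sinTheta2 * box[3] - cosTheta2 * box[2];
  pts_y[1] = box[1] - cosTheta2 * box[3] - sinTheta2 * box[2];
  // The remaining corners are point reflections through the box center.
  pts_x[2] = 2 * box[0] - pts_x[0];
  pts_y[2] = 2 * box[1] - pts_y[0];
  pts_x[3] = 2 * box[0] - pts_x[1];
  pts_y[3] = 2 * box[1] - pts_y[1];
}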
template <typename T>
__mlu_func__ void getIntersectPts(T *rotated_pts1_x, T *rotated_pts1_y,
                                  T *rotated_pts2_x, T *rotated_pts2_y,
                                  T *vec1_x, T *vec1_y, T *vec2_x, T *vec2_y,
                                  T *intersect_pts_x, T *intersect_pts_y,
                                  T *valid_pts, T *nums_in_ram, T *temp1_ram,
                                  T *temp2_ram, T *temp3_ram, T *temp4_ram,
                                  T *temp5_ram, T *temp6_ram, T *temp7_ram,
                                  T *temp8_ram, T *temp9_ram, T *temp10_ram,
                                  const uint32_t &actual_compute_box_num) {
  // Initialize const data to ram
  // temp3 = const 1e-14(@float), length = COMPUTE_COUNT_ALIGN
#if __BANG_ARCH__ >= 300
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (T)1e-14);
#else
  // NOTE: Since active_reciphp function has strict value range,
  //       [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN, (float)1e-14);
#endif
  // temp4 = const T(0), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp4_ram, COMPUTE_COUNT_ALIGN, (T)0);
  // temp5 = const T(1), length = COMPUTE_COUNT_ALIGN
  __bang_write_value((T *)temp5_ram, COMPUTE_COUNT_ALIGN, (T)1);

  // Line vector, from p1 to p2 is: p1+(p2-p1)*t, t=[0,1]
  // for i = 0~3, vec[i] = pts[(i+1)%4] - pts[i]
  __bang_sub((T *)vec1_x, (T *)rotated_pts1_x + actual_compute_box_num,
             (T *)rotated_pts1_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_x + 3 * actual_compute_box_num, (T *)rotated_pts1_x,
             (T *)rotated_pts1_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec1_y, (T *)rotated_pts1_y + actual_compute_box_num,
             (T *)rotated_pts1_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec1_y + 3 * actual_compute_box_num, (T *)rotated_pts1_y,
             (T *)rotated_pts1_y + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec2_x, (T *)rotated_pts2_x + actual_compute_box_num,
             (T *)rotated_pts2_x, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_x + 3 * actual_compute_box_num, (T *)rotated_pts2_x,
             (T *)rotated_pts2_x + 3 * actual_compute_box_num,
             actual_compute_box_num);
  __bang_sub((T *)vec2_y, (T *)rotated_pts2_y + actual_compute_box_num,
             (T *)rotated_pts2_y, 3 * actual_compute_box_num);
  __bang_sub((T *)vec2_y + 3 * actual_compute_box_num, (T *)rotated_pts2_y,
             (T *)rotated_pts2_y + 3 * actual_compute_box_num,
             actual_compute_box_num);

  // First, line test - test all line combos for intersection, 4x4 possible
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      // T det = cross2d<T>(vec2[j], vec1[i]) -- temp2
      cross2d<T>((T *)temp2_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num,
                 (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num,
                 actual_compute_box_num, (T *)temp1_ram);
      // temp8 = sign(det), since active_reciphp only receives positive values
      __bang_active_sign((T *)temp8_ram, (T *)temp2_ram,
                         actual_compute_box_num);
      // deal with parallel lines, temp2 = fabs(det), temp1 = temp2 > 1e-14
      __bang_active_abs((T *)temp2_ram, (T *)temp2_ram,
                        actual_compute_box_num);
      __bang_cycle_gt((T *)temp1_ram, (T *)temp2_ram, (T *)temp3_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      // Where temp1 = false, set recip input to 1, avoiding recip(0), cause
      // inf
      __bang_not((T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_add((T *)temp2_ram, (T *)temp2_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      // temp2 = 1/temp2, use mult (1/temp2) instead of div temp2
#if __BANG_ARCH__ >= 300
      __bang_recip((float *)temp2_ram, (float *)temp2_ram,
                   actual_compute_box_num);
#else
      // NOTE: active_reciphp function has strict value range:
      //       [2.2205e-16, 2e6]@float, [0.00391, 65504]@half
      __bang_active_reciphp((T *)temp2_ram, (T *)temp2_ram,
                            actual_compute_box_num);
#endif
      // Restore temp2 invalid box value 1 and sign-bit
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp1_ram,
                 actual_compute_box_num);
      __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)temp8_ram,
                 actual_compute_box_num);

      // auto vec12 = pts2[j] - pts1[i], (temp6, temp7) = (x, y)
      __bang_sub((T *)temp6_ram,
                 (T *)rotated_pts2_x + j * actual_compute_box_num,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 actual_compute_box_num);
      __bang_sub((T *)temp7_ram,
                 (T *)rotated_pts2_y + j * actual_compute_box_num,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 actual_compute_box_num);

      // T t1 = cross2d<T>(vec2[j], vec12) mult (1/det) -- temp8
      cross2d<T>((T *)temp8_ram, (T *)vec2_x + j * actual_compute_box_num,
                 (T *)vec2_y + j * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp9_ram);
      __bang_mul((T *)temp8_ram, (T *)temp8_ram, (T *)temp2_ram,
                 actual_compute_box_num);

      // temp1 &= (t1 >= 0.0f && t1 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp9_ram, (T *)temp8_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp9_ram, (T *)temp8_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp9_ram,
                 actual_compute_box_num);

      // T t2 = cross2d<T>(vec1[i], vec12) mult temp2 -- temp9
      // NOTE: temp8(t1) is used after, reuse temp7(p2_y) as cross2d temp ram
      cross2d<T>((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)vec1_y + i * actual_compute_box_num, (T *)temp6_ram,
                 (T *)temp7_ram, actual_compute_box_num, (T *)temp7_ram);
      __bang_mul((T *)temp9_ram, (T *)temp9_ram, (T *)temp2_ram,
                 actual_compute_box_num);

      // temp1 &= (t2 >= 0.0f && t2 <= 1.0f) -- temp9
      __bang_cycle_ge((T *)temp7_ram, (T *)temp9_ram, (T *)temp4_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);
      __bang_cycle_le((T *)temp7_ram, (T *)temp9_ram, (T *)temp5_ram,
                      actual_compute_box_num, COMPUTE_COUNT_ALIGN);
      __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp7_ram,
                 actual_compute_box_num);

      // intersections = (pts1[i] + vec1[i] * t1) * temp1
      __bang_mul((T *)temp9_ram, (T *)vec1_x + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_x + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_x + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);
      __bang_mul((T *)temp9_ram, (T *)vec1_y + i * actual_compute_box_num,
                 (T *)temp8_ram, actual_compute_box_num);
      __bang_add((T *)temp9_ram,
                 (T *)rotated_pts1_y + i * actual_compute_box_num,
                 (T *)temp9_ram, actual_compute_box_num);
      __bang_mul((T *)intersect_pts_y + (4 * i + j) * actual_compute_box_num,
                 (T *)temp9_ram, (T *)temp1_ram, actual_compute_box_num);

      // Assign `valid_pts` bit and accumulate `nums_in` of valid points of
      // each box pair
      __bang_or((T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)valid_pts + (4 * i + j) * actual_compute_box_num,
                (T *)temp1_ram, actual_compute_box_num);
      __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
                 actual_compute_box_num);
    }
  }

  // Check for vertices of rect1 inside rect2
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec2_x, (T *)vec2_y, (T *)vec2_x, (T *)vec2_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num,
           (T *)vec2_x + 3 * actual_compute_box_num,
           (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  // assume ABCD is the rectangle, and P is the point to be judged
  // P is inside ABCD iff. P's projection on AB lies within AB
  // and P's projection on AD lies within AD
  for (int i = 0; i < 4; i++) {
    // AP = pts1[i] - pts2[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram,
               (T *)rotated_pts1_x + i * actual_compute_box_num,
               (T *)rotated_pts2_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram,
               (T *)rotated_pts1_y + i * actual_compute_box_num,
               (T *)rotated_pts2_y, actual_compute_box_num);

    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec2_x,
             (T *)vec2_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec2_x + 3 * actual_compute_box_num,
             (T *)vec2_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);

    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);

    // 16 means the 4x4 possible intersection points above
    __bang_mul((T *)intersect_pts_x + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts1_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (16 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts1_y + i * actual_compute_box_num,
               actual_compute_box_num);

    // assign valid_pts bit and accumulate nums of valid points of each box
    // pair
    __bang_or((T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)valid_pts + (16 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }

  // Reverse the check - check for vertices of rect2 inside rect1
  // temp5 = ABdotAB
  dot2d<T>((T *)temp5_ram, (T *)vec1_x, (T *)vec1_y, (T *)vec1_x, (T *)vec1_y,
           actual_compute_box_num, (T *)temp9_ram);
  // temp6 = ADdotAD
  dot2d<T>((T *)temp6_ram, (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num,
           (T *)vec1_x + 3 * actual_compute_box_num,
           (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
           (T *)temp9_ram);
  for (int i = 0; i < 4; i++) {
    // AP = pts2[i] - pts1[0] = (temp7, temp8)
    __bang_sub((T *)temp7_ram,
               (T *)rotated_pts2_x + i * actual_compute_box_num,
               (T *)rotated_pts1_x, actual_compute_box_num);
    __bang_sub((T *)temp8_ram,
               (T *)rotated_pts2_y + i * actual_compute_box_num,
               (T *)rotated_pts1_y, actual_compute_box_num);

    // temp9 = APdotAB = dot2d<T>(AP, AB)
    dot2d<T>((T *)temp9_ram, (T *)temp7_ram, (T *)temp8_ram, (T *)vec1_x,
             (T *)vec1_y, actual_compute_box_num, (T *)temp2_ram);
    // temp10 = APdotAD = -dot2d<T>(AP, DA)
    dot2d<T>((T *)temp10_ram, (T *)temp7_ram, (T *)temp8_ram,
             (T *)vec1_x + 3 * actual_compute_box_num,
             (T *)vec1_y + 3 * actual_compute_box_num, actual_compute_box_num,
             (T *)temp2_ram);
    __bang_mul_scalar((T *)temp10_ram, (T *)temp10_ram, (T)-1,
                      actual_compute_box_num);

    // ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
    //  (APdotAD <= ADdotAD))
    __bang_cycle_ge((T *)temp1_ram, (T *)temp9_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_cycle_ge((T *)temp2_ram, (T *)temp10_ram, (T *)temp4_ram,
                    actual_compute_box_num, COMPUTE_COUNT_ALIGN);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp9_ram, (T *)temp5_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);
    __bang_le((T *)temp2_ram, (T *)temp10_ram, (T *)temp6_ram,
              actual_compute_box_num);
    __bang_and((T *)temp1_ram, (T *)temp1_ram, (T *)temp2_ram,
               actual_compute_box_num);

    // 20 means the (4x4+4) possible intersection points above
    __bang_mul((T *)intersect_pts_x + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts2_x + i * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)intersect_pts_y + (20 + i) * actual_compute_box_num,
               (T *)temp1_ram,
               (T *)rotated_pts2_y + i * actual_compute_box_num,
               actual_compute_box_num);

    // assign valid_pts bit and accumulate nums of valid points of each box
    // pair
    __bang_or((T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)valid_pts + (20 + i) * actual_compute_box_num,
              (T *)temp1_ram, actual_compute_box_num);
    __bang_add((T *)nums_in_ram, (T *)nums_in_ram, (T *)temp1_ram,
               actual_compute_box_num);
  }
}
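The 4x4 line test above parameterizes each pair of edges as p1 + vec1 * t1 and p2 + vec2 * t2 and keeps the crossing point when both parameters fall in [0, 1]. A scalar version of one such test (an illustrative sketch, not the kernel) looks like:

#include <cmath>

// Scalar sketch of one iteration of the 4x4 line test in getIntersectPts:
// edges are p + vec * t with t in [0, 1]; det = cross2d(vec2, vec1).
// Returns true and writes the intersection when the segments cross.
bool segment_intersect_ref(float p1x, float p1y, float v1x, float v1y,
                           float p2x, float p2y, float v2x, float v2y,
                           float *ix, float *iy) {
  const float det = v2x * v1y - v2y * v1x;
  if (std::fabs(det) <= 1e-14f) return false;  // parallel edges
  const float vec12x = p2x - p1x, vec12y = p2y - p1y;
  const float t1 = (v2x * vec12y - v2y * vec12x) / det;
  const float t2 = (v1x * vec12y - v1y * vec12x) / det;
  if (t1 < 0.f || t1 > 1.f || t2 < 0.f || t2 > 1.f) return false;
  *ix = p1x + v1x * t1;
  *iy = p1y + v1y * t1;
  return true;
}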
template <typename T>
__mlu_func__ void convexHullGraham(
    T *intersect_pts_x, T *intersect_pts_y, T *ordered_pts_x,
    T *ordered_pts_y, T *dist_ram, T *valid_box, T *valid_pts,
    T *nums_in_ram, T *temp1_ram, T *temp2_ram, T *temp3_ram, T *temp_long_1,
    T *temp_long_2, T *temp_long_3, const uint32_t &actual_box_num,
    const uint32_t &actual_compute_box_num) {
  // Step1. Find the point with minimum y; if more than one point has the
  //        same minimum y, pick the one with the minimum x.
  // set p[i].y to max_y_value if not valid_pts, to avoid invalid result
  // 24 means all possible intersection points
  __bang_max((T *)temp2_ram, (T *)intersect_pts_y,
             24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN,
                     ((T *)temp2_ram)[0]);
  __bang_not((T *)temp_long_1, (T *)valid_pts, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_1, (T *)temp_long_1, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_2, (T *)intersect_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_2, (T *)temp_long_2, (T *)temp_long_1,
             24 * actual_compute_box_num);
  // temp2 = min_y_value(temp_long_2), use min_pool, channel=box_num, h=1,
  // w=24
  __bang_minpool((T *)temp2_ram, (T *)temp_long_2, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp2_ram, (T *)temp2_ram, (T *)valid_box,
             actual_compute_box_num);

  // set p[i].x to max_x_value if not min_y point
  __bang_max((T *)temp1_ram, (T *)intersect_pts_x,
             24 * actual_compute_box_num);
  __bang_write_value((T *)temp3_ram, COMPUTE_COUNT_ALIGN,
                     ((T *)temp1_ram)[0]);
  __bang_cycle_eq((T *)temp_long_1, (T *)temp_long_2, (T *)temp2_ram,
                  24 * actual_compute_box_num, actual_compute_box_num);
  __bang_and((T *)temp_long_1, (T *)temp_long_1, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_not((T *)temp_long_3, (T *)temp_long_1, 24 * actual_compute_box_num);
  __bang_cycle_mul((T *)temp_long_3, (T *)temp_long_3, (T *)temp3_ram,
                   24 * actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_mul((T *)temp_long_1, (T *)intersect_pts_x, (T *)temp_long_1,
             24 * actual_compute_box_num);
  __bang_add((T *)temp_long_1, (T *)temp_long_1, (T *)temp_long_3,
             24 * actual_compute_box_num);
  // temp3 = min_x_value(temp_long_1), use min_pool, channel=box_num, h=1,
  // w=24
  __bang_minpool((T *)temp3_ram, (T *)temp_long_1, actual_compute_box_num, 1,
                 24, 1, 24, 1, 24);
  __bang_mul((T *)temp3_ram, (T *)temp3_ram, (T *)valid_box,
             actual_compute_box_num);

  // Step2. All points subtract starting-point (for sorting in the next step)
  __bang_cycle_sub((T *)ordered_pts_x, (T *)intersect_pts_x, (T *)temp3_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_cycle_sub((T *)ordered_pts_y, (T *)intersect_pts_y, (T *)temp2_ram,
                   24 * actual_compute_box_num, actual_compute_box_num);
  __bang_mul((T *)ordered_pts_x, (T *)ordered_pts_x, (T *)valid_pts,
             24 * actual_compute_box_num);
  __bang_mul((T *)ordered_pts_y, (T *)ordered_pts_y, (T *)valid_pts,
             24 * actual_compute_box_num);

  // Step3. Sort every intersection point according to their relative
  //        cross-product values (essentially sorting according to angles)
  //        If the angles are the same, sort according to distance to origin
  dot2d<T>((T *)dist_ram, (T *)ordered_pts_x, (T *)ordered_pts_y,
           (T *)ordered_pts_x, (T *)ordered_pts_y,
           24 * actual_compute_box_num, (T *)temp_long_3);

  T temp, temp_nums_in, temp_dist_1, temp_dist_2;
  T temp1_x, temp1_y;
  T temp2_x, temp2_y;
  for (int i = 0; i < actual_box_num; i++) {
    if (((T *)valid_box)[i]) {
      // make sure all nums_in[i] points are at the front
      for (int ii = 0; ii < 23; ii++) {
        for (int jj = ii + 1; jj < 24; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          // ii point is not valid and jj point is valid, swap jj for ii
          if ((!((T *)valid_pts)[ii_index]) && ((T *)valid_pts)[jj_index]) {
            ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[jj_index];
            ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[jj_index];
            ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[jj_index];
            ((T *)valid_pts)[ii_index] = true;
            ((T *)ordered_pts_x)[jj_index] = 0;
            ((T *)ordered_pts_y)[jj_index] = 0;
            ((T *)dist_ram)[jj_index] = 0;
            ((T *)valid_pts)[jj_index] = false;
            break;
          }
        }
      }
      temp_nums_in = ((T *)nums_in_ram)[i];
      // make original q[0] = min_x, min_y before sort
      for (int ii = 1; ii < temp_nums_in; ii++) {
        int ii_index = ii * actual_compute_box_num + i;
        if (((T *)dist_ram)[ii_index] == 0) {
          // swap q[ii_index] and q[0]
          ((T *)ordered_pts_x)[ii_index] = ((T *)ordered_pts_x)[i];
          ((T *)ordered_pts_y)[ii_index] = ((T *)ordered_pts_y)[i];
          ((T *)dist_ram)[ii_index] = ((T *)dist_ram)[i];
          ((T *)ordered_pts_x)[i] = 0;
          ((T *)ordered_pts_y)[i] = 0;
          ((T *)dist_ram)[i] = 0;
          break;
        }
      }
      for (int ii = 1; ii < temp_nums_in - 1; ii++) {
        for (int jj = ii + 1; jj < temp_nums_in; jj++) {
          int ii_index = ii * actual_compute_box_num + i;
          int jj_index = jj * actual_compute_box_num + i;
          temp1_x = ((T *)ordered_pts_x)[ii_index];
          temp1_y = ((T *)ordered_pts_y)[ii_index];
          temp2_x = ((T *)ordered_pts_x)[jj_index];
          temp2_y = ((T *)ordered_pts_y)[jj_index];
          // calculate cross product and sort q (ordered_pts)
          temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          temp_dist_1 = ((T *)dist_ram)[ii_index];
          temp_dist_2 = ((T *)dist_ram)[jj_index];
          if ((temp < (T)-1e-6) ||
              ((fabs(temp) < (T)1e-6) && (temp_dist_1 > temp_dist_2))) {
            ((T *)ordered_pts_x)[ii_index] = temp2_x;
            ((T *)ordered_pts_y)[ii_index] = temp2_y;
            ((T *)ordered_pts_x)[jj_index] = temp1_x;
            ((T *)ordered_pts_y)[jj_index] = temp1_y;
            ((T *)dist_ram)[ii_index] = temp_dist_2;
            ((T *)dist_ram)[jj_index] = temp_dist_1;
          }
        }
      }

      // Step4:
      // Make sure there are at least 2 points (that don't overlap with each
      // other) in the stack
      int k;  // index of the non-overlapped second point
      for (k = 1; k < temp_nums_in; k++) {
        if (((T *)dist_ram)[k * actual_compute_box_num + i] > (T)1e-8) {
          break;
        }
      }
      if (k == temp_nums_in) {
        // We reach the end, which means the convex hull is just one point
        // set valid_box = 0, to get ious = 0
        ((T *)valid_box)[i] = 0;
        continue;
      }
      // q[1] = q[k];
      ((T *)ordered_pts_x)[actual_compute_box_num + i] =
          ((T *)ordered_pts_x)[k * actual_compute_box_num + i];
      ((T *)ordered_pts_y)[actual_compute_box_num + i] =
          ((T *)ordered_pts_y)[k * actual_compute_box_num + i];

      // Step 5:
      // Finally we can start the scanning process.
      // When a non-convex relationship between the 3 points is found
      // (either concave shape or duplicated points),
      // we pop the previous point from the stack
      // until the 3-point relationship is convex again, or
      // until the stack only contains two points
      int m = 2;  // 2 points in the stack
      for (int j = k + 1; j < temp_nums_in; j++) {
        // while (m > 1 && cross2d<T>(q[j] - q[m - 2], q[m - 1] - q[m - 2]) >=
        //        0) {
        //   m--;
        // }
        temp1_x = ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp1_y = ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp2_x = ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
        temp2_y = ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                  ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
        temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
        while ((m > 1) && (temp >= 0)) {
          m--;
          if (m > 1) {
            temp1_x =
                ((T *)ordered_pts_x)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp1_y =
                ((T *)ordered_pts_y)[j * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp2_x =
                ((T *)ordered_pts_x)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_x)[(m - 2) * actual_compute_box_num + i];
            temp2_y =
                ((T *)ordered_pts_y)[(m - 1) * actual_compute_box_num + i] -
                ((T *)ordered_pts_y)[(m - 2) * actual_compute_box_num + i];
            temp = (temp1_x * temp2_y) - (temp1_y * temp2_x);
          }
        }
        // q[m++] = q[j];
        ((T *)ordered_pts_x)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_x)[j * actual_compute_box_num + i];
        ((T *)ordered_pts_y)[m * actual_compute_box_num + i] =
            ((T *)ordered_pts_y)[j * actual_compute_box_num + i];
        m++;
      }
      // set last(24-m) valid_pts to false, to erase invalid q in polygon area
      for (int j = m; j < temp_nums_in; j++) {
        ((T *)valid_pts)[j * actual_compute_box_num + i] = 0;
      }
      ((T *)nums_in_ram)[i] = m;
    }
  }
}
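convexHullGraham sorts the candidate points around the bottom-most start point by cross product (falling back to distance on near-collinear ties) and then pops non-convex turns off the stack. A compact scalar Graham scan over the same ideas (an illustrative sketch using std::sort in place of the in-place bubble sort; it assumes the points were already translated so the start point sits at the origin) could read:

#include <algorithm>
#include <cmath>
#include <vector>

struct Pt { float x, y; };

// Scalar sketch of the scan in convexHullGraham above. q[0] is the start
// point at the origin. Returns the number m of hull points kept at the
// front of q.
int graham_scan_ref(std::vector<Pt> &q) {
  auto cross = [](const Pt &a, const Pt &b) { return a.x * b.y - a.y * b.x; };
  // Sort by angle via cross product; on near-collinear ties, nearer first.
  std::sort(q.begin() + 1, q.end(), [&](const Pt &a, const Pt &b) {
    float c = cross(a, b);
    if (std::fabs(c) > 1e-6f) return c > 0.f;
    return a.x * a.x + a.y * a.y < b.x * b.x + b.y * b.y;
  });
  int m = 2;  // q[0] and q[1] always stay on the stack
  for (int j = 2; j < (int)q.size(); ++j) {
    // Pop while the turn q[m-2] -> q[m-1] -> q[j] is not convex.
    while (m > 1 &&
           cross({q[j].x - q[m - 2].x, q[j].y - q[m - 2].y},
                 {q[m - 1].x - q[m - 2].x, q[m - 1].y - q[m - 2].y}) >= 0.f) {
      m--;
    }
    q[m++] = q[j];
  }
  return m;
}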
template <typename T>
__mlu_func__ void polygonArea(T *ordered_pts_x, T *ordered_pts_y,
                              T *valid_box, T *valid_pts, T *nums_in_ram,
                              T *temp1_ram, T *temp2_ram, T *temp3_ram,
                              T *temp4_ram, T *temp5_ram, T *temp6_ram,
                              T *temp7_ram, T *temp8_ram, T *temp9_ram,
                              const uint32_t &actual_compute_box_num) {
  // Set where nums_in <= 2, valid_box = false
  __bang_write_value((T *)temp9_ram, COMPUTE_COUNT_ALIGN, (T)2);
  __bang_cycle_gt((T *)temp1_ram, (T *)nums_in_ram, (T *)temp9_ram,
                  actual_compute_box_num, COMPUTE_COUNT_ALIGN);
  __bang_and((T *)valid_box, (T *)valid_box, (T *)temp1_ram,
             actual_compute_box_num);

  // temp1 = area, initialize with all 0
  __bang_write_zero((T *)temp1_ram, actual_compute_box_num);
  __bang_max((T *)temp7_ram, (T *)nums_in_ram, actual_compute_box_num);
  // temp_nums_in = max(nums_in)
  T temp_nums_in = ((T *)temp7_ram)[0];
  for (int i = 1; i < temp_nums_in - 1; i++) {
    // q[i] - q[0]: (temp6, temp7)
    __bang_sub((T *)temp6_ram,
               (T *)ordered_pts_x + i * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp7_ram,
               (T *)ordered_pts_y + i * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp6_ram, (T *)temp6_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp7_ram, (T *)temp7_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // q[i + 1] - q[0]: (temp8, temp9)
    __bang_sub((T *)temp8_ram,
               (T *)ordered_pts_x + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_x, actual_compute_box_num);
    __bang_sub((T *)temp9_ram,
               (T *)ordered_pts_y + (i + 1) * actual_compute_box_num,
               (T *)ordered_pts_y, actual_compute_box_num);
    __bang_mul((T *)temp8_ram, (T *)temp8_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    __bang_mul((T *)temp9_ram, (T *)temp9_ram,
               (T *)valid_pts + (i + 1) * actual_compute_box_num,
               actual_compute_box_num);
    // area += fabs(cross2d<T>(q[i] - q[0], q[i + 1] - q[0]));
    __bang_mul((T *)temp4_ram, (T *)temp6_ram, (T *)temp9_ram,
               actual_compute_box_num);
    __bang_mul((T *)temp5_ram, (T *)temp7_ram, (T *)temp8_ram,
               actual_compute_box_num);
    __bang_sub((T *)temp3_ram, (T *)temp4_ram, (T *)temp5_ram,
               actual_compute_box_num);
    __bang_active_abs((T *)temp3_ram, (T *)temp3_ram,
                      actual_compute_box_num);
    __bang_add((T *)temp1_ram, (T *)temp1_ram, (T *)temp3_ram,
               actual_compute_box_num);
  }
  // Set where valid_box = false, intersection = 0
  __bang_mul((T *)temp1_ram, (T *)temp1_ram, (T *)valid_box,
             actual_compute_box_num);
  // area = area / 2.0
  __bang_mul_scalar((T *)temp1_ram, (T *)temp1_ram, (T)0.5,
                    actual_compute_box_num);
}
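polygonArea accumulates |cross(q[i] - q[0], q[i+1] - q[0])| over the hull fan and halves the sum. The scalar equivalent (illustrative only) is the familiar fan-triangulation form of the shoelace formula:

#include <cmath>

// Scalar sketch of the fan triangulation in polygonArea above: with q[0] as
// the fan apex, area = 0.5 * sum_i |cross(q[i] - q[0], q[i+1] - q[0])|.
float polygon_area_ref(const float *qx, const float *qy, int n) {
  float area = 0.f;
  for (int i = 1; i + 1 < n; ++i) {
    const float ax = qx[i] - qx[0], ay = qy[i] - qy[0];
    const float bx = qx[i + 1] - qx[0], by = qy[i + 1] - qy[0];
    area += std::fabs(ax * by - ay * bx);
  }
  return area * 0.5f;
}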
#endif // IOU3D_UTILS_HPP_
mmcv/ops/csrc/common/mlu/masked_conv2d_mlu_kernel.mlu
deleted
100755 → 0
View file @
6f674c7e
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void MLUUnion1MaskedIm2colForward(
const T *feature, const int height, const int width, const int channels,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const int32_t *mask_h_idx, const int32_t *mask_w_idx, const int mask_cnt,
T *data_col) {
for (int index = taskId; index < mask_cnt; index += taskDim) {
const int h_col = mask_h_idx[index];
const int w_col = mask_w_idx[index];
const int h_offset = h_col - pad_h;
const int w_offset = w_col - pad_w;
int h_start = h_offset;
int h_end = h_offset + kernel_h - 1;
int w_start = w_offset;
int w_end = w_start + kernel_w - 1;
if (h_start >= height || w_start >= width || h_end < 0 || w_end < 0) {
continue;
} else {
int h_start_valid = max(0, h_start);
int h_end_valid = min(height - 1, h_end);
int w_start_valid = max(0, w_start);
int w_end_valid = min(width - 1, w_end);
__memcpy(
data_col + index * kernel_h * kernel_w * channels +
((h_start_valid - h_start) * kernel_w +
(w_start_valid - w_start)) *
channels,
feature + h_start_valid * width * channels + w_start_valid * channels,
(w_end_valid - w_start_valid + 1) * channels * sizeof(T), GDRAM2GDRAM,
kernel_w * channels * sizeof(T), width * channels * sizeof(T),
h_end_valid - h_start_valid);
}
}
}
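The forward kernel above strided-copies, for each masked output position, the valid rows of a kernel_h x kernel_w x channels window out of the HWC feature map. A plain CPU reference of the same gather (an illustrative sketch; the _ref name is hypothetical, layout assumed channels-last as in the kernel) is:

#include <algorithm>

// CPU reference for MLUUnion1MaskedIm2colForward above (illustrative).
// feature: HWC (height x width x channels); data_col holds one
// kernel_h*kernel_w*channels column per masked position; out-of-image taps
// stay zero (data_col is assumed zero-initialized).
void masked_im2col_ref(const float *feature, int height, int width,
                       int channels, int kernel_h, int kernel_w, int pad_h,
                       int pad_w, const int *mask_h_idx, const int *mask_w_idx,
                       int mask_cnt, float *data_col) {
  for (int index = 0; index < mask_cnt; ++index) {
    const int h_offset = mask_h_idx[index] - pad_h;
    const int w_offset = mask_w_idx[index] - pad_w;
    float *col = data_col + index * kernel_h * kernel_w * channels;
    for (int kh = 0; kh < kernel_h; ++kh) {
      for (int kw = 0; kw < kernel_w; ++kw) {
        const int h = h_offset + kh, w = w_offset + kw;
        if (h < 0 || h >= height || w < 0 || w >= width) continue;
        const float *src = feature + (h * width + w) * channels;
        std::copy(src, src + channels, col + (kh * kernel_w + kw) * channels);
      }
    }
  }
}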
template <typename T>
__mlu_func__ void MLUUnion1MaskedCol2imForward(const T *col, const int height,
const int width,
const int channels,
const int32_t *mask_h_idx,
const int32_t *mask_w_idx,
const int mask_cnt, T *im) {
const int channels_max_num_nram = MAX_NRAM_SIZE / sizeof(T);
if (channels <= channels_max_num_nram) {
const int deal_num = channels_max_num_nram / channels;
int mask_per_core = mask_cnt / taskDim;
const int mask_remain = mask_cnt % taskDim;
mask_per_core += taskId < mask_remain ? 1 : 0;
int index_start = taskId < mask_remain
? taskId * mask_per_core
: taskId * mask_per_core + mask_remain;
int loop = mask_per_core / deal_num;
int remain_num = mask_per_core % deal_num;
T *nram_col = (T *)nram_buffer;
for (int index = 0; index < loop; ++index) {
int cur_index = index_start + index * deal_num;
__memcpy(nram_col, col + cur_index * channels,
deal_num * channels * sizeof(T), GDRAM2NRAM);
for (int i = 0; i < deal_num; ++i) {
int mask_index = cur_index + i;
const int h_im = mask_h_idx[mask_index];
const int w_im = mask_w_idx[mask_index];
// if(h_im>=height || w_im>=width) continue;
__memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels,
channels * sizeof(T), NRAM2GDRAM);
}
}
if (remain_num > 0) {
int cur_index = index_start + loop * deal_num;
__memcpy(nram_col, col + cur_index * channels,
remain_num * channels * sizeof(T), GDRAM2NRAM);
for (int i = 0; i < remain_num; ++i) {
int mask_index = cur_index + i;
const int h_im = mask_h_idx[mask_index];
const int w_im = mask_w_idx[mask_index];
// if(h_im>=height || w_im>=width) continue;
__memcpy(im + (h_im * width + w_im) * channels, nram_col + i * channels,
channels * sizeof(T), NRAM2GDRAM);
}
}
} else {
for (int index = taskId; index < mask_cnt; index += taskDim) {
const int m_index = index % mask_cnt;
const int h_im = mask_h_idx[m_index];
const int w_im = mask_w_idx[m_index];
// if(h_im>=height || w_im>=width) continue;
__memcpy(im + (h_im * width + w_im) * channels, col + index * channels,
channels * sizeof(T), GDRAM2GDRAM);
}
}
}
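The col2im path splits mask_cnt across the taskDim cores so that the first mask_cnt % taskDim cores each take one extra element. The same split, isolated into a small helper (illustrative restatement of the index arithmetic above):

// Balanced per-core split as used above: the first (total % num_cores)
// cores each take one extra element, and start offsets stay contiguous.
void split_work(int total, int num_cores, int core_id, int *start,
                int *count) {
  const int per_core = total / num_cores;
  const int remain = total % num_cores;
  *count = per_core + (core_id < remain ? 1 : 0);
  *start = core_id < remain ? core_id * (per_core + 1)
                            : core_id * per_core + remain;
}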
__mlu_global__ void MLUKernelMaskedIm2colForward(
const void *feature, const int height, const int width, const int channels,
const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,
void *data_col, const cnrtDataType_t data_dtype) {
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
MLUUnion1MaskedIm2colForward((half *)feature, height, width, channels,
kernel_h, kernel_w, pad_h, pad_w,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (half *)data_col);
}; break;
case CNRT_FLOAT32: {
MLUUnion1MaskedIm2colForward((float *)feature, height, width, channels,
kernel_h, kernel_w, pad_h, pad_w,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (float *)data_col);
}; break;
default: {
break;
}
}
}
__mlu_global__ void MLUKernelMaskedCol2imForward(
const void *col, const int height, const int width, const int channels,
const void *mask_h_idx, const void *mask_w_idx, const int mask_cnt,
void *im, const cnrtDataType_t data_dtype) {
if (coreId == 0x80) {
return;
}
switch (data_dtype) {
case CNRT_FLOAT16: {
MLUUnion1MaskedCol2imForward((half *)col, height, width, channels,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (half *)im);
}; break;
case CNRT_FLOAT32: {
MLUUnion1MaskedCol2imForward((float *)col, height, width, channels,
(int32_t *)mask_h_idx, (int32_t *)mask_w_idx,
mask_cnt, (float *)im);
}; break;
default: {
break;
}
}
}
void KernelMaskedIm2colForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
cnrtDataType_t k_dtype, const void *im_ptr, const int height,
const int width, const int channels, const int kernel_h, const int kernel_w,
const int pad_h, const int pad_w, const void *mask_h_idx_ptr,
const void *mask_w_idx_ptr, const int mask_cnt, void *col_ptr) {
MLUKernelMaskedIm2colForward<<<k_dim, k_type, queue>>>(
im_ptr, height, width, channels, kernel_h, kernel_w, pad_h, pad_w,
mask_h_idx_ptr, mask_w_idx_ptr, mask_cnt, col_ptr, k_dtype);
}
void KernelMaskedCol2imForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
cnrtQueue_t queue, cnrtDataType_t k_dtype,
const void *col_ptr, const int height,
const int width, const int channels,
const void *mask_h_idx_ptr,
const void *mask_w_idx_ptr, const int mask_cnt,
void *im_ptr) {
MLUKernelMaskedCol2imForward<<<k_dim, k_type, queue>>>(
col_ptr, height, width, channels, mask_h_idx_ptr, mask_w_idx_ptr,
mask_cnt, im_ptr, k_dtype);
}
mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu
deleted
100644 → 0
View file @
6f674c7e
/*************************************************************************
* Copyright (C) 2022 by Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include <math.h>
/****************************************************************************************
*
* NRAM partition forward:
* | spatial_shapes | data_value_p1_ping | data_value_p2_ping |
* | data_value_p3_ping | data_value_p4_ping | data_col_ping |
* | data_value_p1_pong | data_value_p2_pong | data_value_p3_pong |
* | data_value_p4_pong | data_col_pong | auxiliary_a |
* | auxiliary_b |
* | 128bytes | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size |
*
****************************************************************************************/
/****************************************************************************************
*
* NRAM partition backward:
* | grad_output_nram | grad_output_nram_temp | grad_weight |
* | grad_h_weight | grad_w_weight | top_grad |
* | top_grad_temp | spatial_shapes_nram | sampling_loc_nram |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | deal_size |
* | deal_size | deal_size | 64bytes |
*
****************************************************************************************/
#define TWELVE_SPLIT 12
#define ALIGN_NUM 64
#define ALIGN_NUM_FOR_REDUCE 32
__nram__ char nram_buffer[MAX_NRAM_SIZE];
template <typename T>
__mlu_func__ void loadNeighborPointsData(
const T *data_value_gdram, T *data_value_p1_nram, T *data_value_p2_nram,
T *data_value_p3_nram, T *data_value_p4_nram, const size_t deal_num,
const int32_t &width, const int32_t &height, const int32_t &num_heads,
const int32_t &channels, const T &x, const T &y, const int32_t &head_idx) {
const int32_t w_low = floorf(x);
const int32_t h_low = floorf(y);
const int32_t w_high = w_low + 1;
const int32_t h_high = h_low + 1;
const int32_t w_stride = num_heads * channels;
const int32_t h_stride = width * w_stride;
const int32_t h_low_ptr_offset = h_low * h_stride;
const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int32_t w_low_ptr_offset = w_low * w_stride;
const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int32_t base_ptr_offset = head_idx * channels;
// top-left point
if (h_low >= 0 && w_low >= 0) {
const int32_t v1_offset =
h_low_ptr_offset + w_low_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p1_nram, data_value_gdram + v1_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// top-right point
if (h_low >= 0 && w_high <= width - 1) {
const int32_t v2_offset =
h_low_ptr_offset + w_high_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p2_nram, data_value_gdram + v2_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// bottom-left point
if (h_high <= height - 1 && w_low >= 0) {
const int32_t v3_offset =
h_high_ptr_offset + w_low_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p3_nram, data_value_gdram + v3_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
// bottom-right point
if (h_high <= height - 1 && w_high <= width - 1) {
const int32_t v4_offset =
h_high_ptr_offset + w_high_ptr_offset + base_ptr_offset;
__memcpy_async(data_value_p4_nram, data_value_gdram + v4_offset,
deal_num * sizeof(T), GDRAM2NRAM);
}
}
template <typename T>
__mlu_func__ void bilinearInterpolation(
T *data_value_p1_nram, T *data_value_p2_nram, T *data_value_p3_nram,
T *data_value_p4_nram, T *sample_point_value, T *auxiliary_b,
const size_t deal_num, const int32_t &width, const int32_t &height,
const T &x, const T &y) {
const int32_t w_low = floorf(x);
const int32_t h_low = floorf(y);
const int32_t w_high = w_low + 1;
const int32_t h_high = h_low + 1;
const T lw = x - w_low;
const T lh = y - h_low;
const T hw = 1 - lw;
const T hh = 1 - lh;
const T w1 = hh * hw;
const T w2 = hh * lw;
const T w3 = lh * hw;
const T w4 = lh * lw;
__bang_write_value((T *)sample_point_value, deal_num, (T)0);
// top-left point
if (h_low >= 0 && w_low >= 0) {
// sample_point_value += v1 * w1
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p1_nram, (T)w1,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// top-right point
if (h_low >= 0 && w_high <= width - 1) {
// sample_point_value += v2 * w2
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p2_nram, (T)w2,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// bottom-left point
if (h_high <= height - 1 && w_low >= 0) {
// sample_point_value += v3 * w3
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p3_nram, (T)w3,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
// bottom-right point
if (h_high <= height - 1 && w_high <= width - 1) {
// sample_point_value += v4 * w4
__bang_mul_scalar((T *)auxiliary_b, (T *)data_value_p4_nram, (T)w4,
deal_num);
__bang_add((T *)sample_point_value, (T *)sample_point_value,
(T *)auxiliary_b, deal_num);
}
}
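bilinearInterpolation above applies the standard four-tap weights channel-wise across the deal_num lanes. Per sample point, the scalar weights reduce to (a minimal sketch of the same w1..w4 math):

#include <cmath>

// Scalar sketch of the four bilinear weights used channel-wise above:
// (x, y) sits between integer corners (w_low, h_low) .. (w_high, h_high).
void bilinear_weights_ref(float x, float y, float w[4]) {
  const float lw = x - std::floor(x);  // x - w_low
  const float lh = y - std::floor(y);  // y - h_low
  const float hw = 1.f - lw, hh = 1.f - lh;
  w[0] = hh * hw;  // top-left     (h_low,  w_low)
  w[1] = hh * lw;  // top-right    (h_low,  w_high)
  w[2] = lh * hw;  // bottom-left  (h_high, w_low)
  w[3] = lh * lw;  // bottom-right (h_high, w_high)
}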
template <typename T>
__mlu_global__ void MLUKernelMsDeformAttnForward(
const char *data_value_gdram, const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram) {
if (coreId == 0x80) {
return;
}
const size_t spatial_size = PAD_UP(2 * sizeof(int32_t), NFU_ALIGN_SIZE);
const size_t span_num_deal =
PAD_DOWN((MAX_NRAM_SIZE - spatial_size) / TWELVE_SPLIT / sizeof(T),
NFU_ALIGN_SIZE);
const size_t align_num = NFU_ALIGN_SIZE;
const int32_t channels_seg_num = channels / span_num_deal;
const size_t channels_rem = channels % span_num_deal;
const size_t channels_align_rem = CEIL_ALIGN(channels_rem, align_num);
char *data_spatial_shapes_nram = nram_buffer;
char *ping_data_value_p1_nram = data_spatial_shapes_nram + spatial_size;
char *ping_data_value_p2_nram =
ping_data_value_p1_nram + span_num_deal * sizeof(T);
char *ping_data_value_p3_nram =
ping_data_value_p2_nram + span_num_deal * sizeof(T);
char *ping_data_value_p4_nram =
ping_data_value_p3_nram + span_num_deal * sizeof(T);
char *ping_data_col_nram =
ping_data_value_p4_nram + span_num_deal * sizeof(T);
char *pong_data_value_p1_nram =
ping_data_col_nram + span_num_deal * sizeof(T);
char *pong_data_value_p2_nram =
pong_data_value_p1_nram + span_num_deal * sizeof(T);
char *pong_data_value_p3_nram =
pong_data_value_p2_nram + span_num_deal * sizeof(T);
char *pong_data_value_p4_nram =
pong_data_value_p3_nram + span_num_deal * sizeof(T);
char *pong_data_col_nram =
pong_data_value_p4_nram + span_num_deal * sizeof(T);
char *auxiliary_a = pong_data_col_nram + span_num_deal * sizeof(T);
char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
const size_t ping_pong_gap = 5 * span_num_deal * sizeof(T);
size_t data_col_ping_pong_idx = 0;
int32_t block_num_per_core = (batch_size * num_queries * num_heads) / taskDim;
const int32_t block_num_rem =
(batch_size * num_queries * num_heads) % taskDim;
const int32_t idx_start = taskId < (block_num_rem + 1)
? taskId * (block_num_per_core + 1)
: taskId * block_num_per_core + block_num_rem;
block_num_per_core =
taskId < block_num_rem
? (batch_size * num_queries * num_heads) / taskDim + 1
: (batch_size * num_queries * num_heads) / taskDim;
for (int32_t cur_idx = idx_start; cur_idx < idx_start + block_num_per_core;
++cur_idx) {
// cur_idx = batch_idx * num_queries * num_heads + query_idx * num_heads +
// head_idx
const int32_t head_idx = cur_idx % num_heads;
const int32_t batch_idx = (cur_idx / num_heads) / num_queries;
const char *data_value_gdram_start =
data_value_gdram +
batch_idx * num_keys * num_heads * channels * sizeof(T);
const char *data_sampling_loc_gdram_start =
data_sampling_loc_gdram +
cur_idx * num_levels * num_points * 2 * sizeof(T);
const char *data_attn_weight_gdram_start =
data_attn_weight_gdram + cur_idx * num_levels * num_points * sizeof(T);
char *data_col_gdram_start =
data_col_gdram + cur_idx * channels * sizeof(T);
for (int32_t c_seg_idx = 0; c_seg_idx < channels_seg_num; ++c_seg_idx) {
__bang_write_value(
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap),
span_num_deal, (T)0);
// load data
// level_idx = 0, point_idx = 0
__memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram,
2 * sizeof(int32_t), GDRAM2NRAM);
int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0];
int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1];
const char *data_value_ptr =
data_value_gdram_start + c_seg_idx * span_num_deal * sizeof(T);
T loc_w = ((T *)data_sampling_loc_gdram_start)[0];
T loc_h = ((T *)data_sampling_loc_gdram_start)[1];
T weight = ((T *)data_attn_weight_gdram_start)[0];
T x = loc_w * spatial_w - 0.5;
T y = loc_h * spatial_h - 0.5;
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr, (T *)ping_data_value_p1_nram,
(T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram,
(T *)ping_data_value_p4_nram, span_num_deal, spatial_w, spatial_h,
num_heads, channels, x, y, head_idx);
}
T spatial_h_next_point = 0;
T spatial_w_next_point = 0;
T weight_next_point = 0;
T x_next_point = 0;
T y_next_point = 0;
__asm__ volatile("sync;");
for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) {
for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) {
// load data
if (point_idx == num_points - 1 && level_idx == num_levels - 1) {
// last point: nothing left to load, go straight to compute
} else if (point_idx == num_points - 1) {
const int32_t level_start_id =
((int32_t *)data_level_start_index_gdram)[level_idx + 1];
const int32_t spatial_h_ptr = (level_idx + 1) << 1;
__memcpy(
data_spatial_shapes_nram,
data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t),
2 * sizeof(int32_t), GDRAM2NRAM);
spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0];
spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1];
data_value_ptr = data_value_gdram_start +
(level_start_id * num_heads * channels +
c_seg_idx * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h_next_point &&
x_next_point < spatial_w_next_point) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
span_num_deal, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_h_next_point = spatial_h;
spatial_w_next_point = spatial_w;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
span_num_deal, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
}
// compute
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
bilinearInterpolation(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)auxiliary_a, (T *)auxiliary_b, span_num_deal, spatial_w,
spatial_h, x, y);
__bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight,
span_num_deal);
__bang_add((T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)auxiliary_a, span_num_deal);
}
spatial_w = spatial_w_next_point;
spatial_h = spatial_h_next_point;
weight = weight_next_point;
x = x_next_point;
y = y_next_point;
__asm__ volatile("sync;");
}
}
// store
__memcpy_async(
data_col_gdram_start + c_seg_idx * span_num_deal * sizeof(T),
ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap,
span_num_deal * sizeof(T), NRAM2GDRAM);
data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2;
}
if (channels_rem > 0) {
__bang_write_value(
(T *)(ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap),
channels_align_rem, (T)0);
// load data
// level_idx = 0, point_idx = 0
__memcpy(data_spatial_shapes_nram, data_spatial_shapes_gdram,
2 * sizeof(int32_t), GDRAM2NRAM);
int32_t spatial_h = ((int32_t *)data_spatial_shapes_nram)[0];
int32_t spatial_w = ((int32_t *)data_spatial_shapes_nram)[1];
const char *data_value_ptr =
data_value_gdram_start + channels_seg_num * span_num_deal * sizeof(T);
T loc_w = ((T *)data_sampling_loc_gdram_start)[0];
T loc_h = ((T *)data_sampling_loc_gdram_start)[1];
T weight = ((T *)data_attn_weight_gdram_start)[0];
T x = loc_w * spatial_w - 0.5;
T y = loc_h * spatial_h - 0.5;
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr, (T *)ping_data_value_p1_nram,
(T *)ping_data_value_p2_nram, (T *)ping_data_value_p3_nram,
(T *)ping_data_value_p4_nram, channels_rem, spatial_w, spatial_h,
num_heads, channels, x, y, head_idx);
}
T spatial_h_next_point = 0;
T spatial_w_next_point = 0;
T weight_next_point = 0;
T x_next_point = 0;
T y_next_point = 0;
__asm__ volatile("sync;");
for (int32_t level_idx = 0; level_idx < num_levels; ++level_idx) {
for (int32_t point_idx = 0; point_idx < num_points; ++point_idx) {
// load data
if (point_idx == num_points - 1 && level_idx == num_levels - 1) {
// last point: nothing left to load, go straight to compute
} else if (point_idx == num_points - 1) {
const int32_t level_start_id =
((int32_t *)data_level_start_index_gdram)[level_idx + 1];
const int32_t spatial_h_ptr = (level_idx + 1) << 1;
__memcpy(
data_spatial_shapes_nram,
data_spatial_shapes_gdram + spatial_h_ptr * sizeof(int32_t),
2 * sizeof(int32_t), GDRAM2NRAM);
spatial_h_next_point = ((int32_t *)data_spatial_shapes_nram)[0];
spatial_w_next_point = ((int32_t *)data_spatial_shapes_nram)[1];
data_value_ptr = data_value_gdram_start +
(level_start_id * num_heads * channels +
channels_seg_num * span_num_deal) *
sizeof(T);
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w_next_point - 0.5;
y_next_point = loc_h * spatial_h_next_point - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h_next_point &&
x_next_point < spatial_w_next_point) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
channels_rem, spatial_w_next_point, spatial_h_next_point,
num_heads, channels, x_next_point, y_next_point, head_idx);
}
} else {
spatial_w_next_point = spatial_w;
spatial_h_next_point = spatial_h;
loc_w = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2];
loc_h = ((T *)data_sampling_loc_gdram_start)
[(level_idx * num_points + point_idx + 1) * 2 + 1];
weight_next_point =
((T *)data_attn_weight_gdram_start)[level_idx * num_points +
point_idx + 1];
x_next_point = loc_w * spatial_w - 0.5;
y_next_point = loc_h * spatial_h - 0.5;
if (y_next_point > -1 && x_next_point > -1 &&
y_next_point < spatial_h && x_next_point < spatial_w) {
loadNeighborPointsData(
(T *)data_value_ptr,
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx + 1) % 2) *
ping_pong_gap),
channels_rem, spatial_w, spatial_h, num_heads, channels,
x_next_point, y_next_point, head_idx);
}
}
// compute
if (y > -1 && x > -1 && y < spatial_h && x < spatial_w) {
bilinearInterpolation(
(T *)(ping_data_value_p1_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p2_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p3_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)(ping_data_value_p4_nram +
((level_idx * num_points + point_idx) % 2) *
ping_pong_gap),
(T *)auxiliary_a, (T *)auxiliary_b, channels_align_rem,
spatial_w, spatial_h, x, y);
__bang_mul_scalar((T *)auxiliary_a, (T *)auxiliary_a, (T)weight,
channels_align_rem);
__bang_add((T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)(ping_data_col_nram +
data_col_ping_pong_idx * ping_pong_gap),
(T *)auxiliary_a, channels_align_rem);
}
spatial_w = spatial_w_next_point;
spatial_h = spatial_h_next_point;
weight = weight_next_point;
x = x_next_point;
y = y_next_point;
__asm__ volatile("sync;");
}
}
// store
__memcpy_async(
data_col_gdram_start + channels_seg_num * span_num_deal * sizeof(T),
ping_data_col_nram + data_col_ping_pong_idx * ping_pong_gap,
channels_rem * sizeof(T), NRAM2GDRAM);
data_col_ping_pong_idx = (data_col_ping_pong_idx + 1) % 2;
}
}
__asm__ volatile("sync;");
return;
}
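The forward kernel overlaps the async GDRAM-to-NRAM loads of the next sample point with computation on the current one by alternating two buffer sets, which is what the "% 2" ping-pong indexing above implements. Schematically (generic C++ stand-in, not BANG; the array writes stand in for the async copies):

#include <cstdio>

// Schematic of the ping-pong pipeline in the forward kernel above: while
// slot `cur` is being consumed, the next point is staged into slot
// `1 - cur`, so loads and compute of adjacent points overlap.
int main() {
  int buffer[2] = {0, 0};
  const int num_points = 4;
  int cur = 0;
  buffer[cur] = 0;  // stage point 0 (stands in for the first async copy)
  for (int p = 0; p < num_points; ++p) {
    if (p + 1 < num_points) buffer[1 - cur] = p + 1;  // prefetch next point
    printf("compute on point %d from slot %d\n", buffer[cur], cur);
    cur = 1 - cur;  // swap ping and pong (matches the "% 2" indexing above)
  }
  return 0;
}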
template __mlu_global__ void MLUKernelMsDeformAttnForward<float>(
const char *data_value_gdram, const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram);
void KernelMsDeformAttnForward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const char *data_value_gdram,
const char *data_spatial_shapes_gdram,
const char *data_level_start_index_gdram,
const char *data_sampling_loc_gdram, const char *data_attn_weight_gdram,
const int32_t batch_size, const int32_t num_keys, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_queries,
const int32_t num_points, char *data_col_gdram) {
MLUKernelMsDeformAttnForward<float><<<k_dim, k_type, queue>>>(
data_value_gdram, data_spatial_shapes_gdram, data_level_start_index_gdram,
data_sampling_loc_gdram, data_attn_weight_gdram, batch_size, num_keys,
num_heads, channels, num_levels, num_queries, num_points, data_col_gdram);
}
template <typename T>
void __mlu_func__ msDeformAttnCol2imBilinear(
T *top_grad_temp, const int32_t &height, const int32_t &width, const T &w1,
const T &w2, const T &w3, const T &w4, const int32_t &h_low,
const int32_t &w_low, const int32_t &h_high, const int32_t &w_high,
const int32_t &base_ptr, const int32_t &h_low_ptr_offset,
const int32_t &w_low_ptr_offset, const int32_t &h_high_ptr_offset,
const int32_t &w_high_ptr_offset, const T &hh, const T &hw, const T &lh,
const T &lw, T *top_grad, const T &data_attn_weight, T *grad_h_weight,
T *grad_w_weight, T *grad_value, T *grad_output_nram, T *grad_weight,
T *grad_sampling_loc, T *grad_attn_weight, T *grad_output_nram_temp,
const int32_t &deal_num, const int32_t &deal_num_real,
const T *data_value_ptr) {
if (h_low >= 0 && w_low >= 0) {
int32_t offset1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
__memcpy(grad_output_nram, data_value_ptr + offset1,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram, hw, deal_num);
__bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram, hh, deal_num);
__bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w1, deal_num);
// for calc grad_attn_weight
__bang_mul_scalar(grad_output_nram, grad_output_nram, w1, deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset1),
(T *)top_grad_temp, deal_num_real);
}
if (h_low >= 0 && w_high <= width - 1) {
int32_t offset2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset2,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num);
__bang_sub(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, hh, deal_num);
__bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w2, deal_num);
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w2,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset2),
(T *)top_grad_temp, deal_num_real);
}
if (h_high <= height - 1 && w_low >= 0) {
int32_t offset3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset3,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, hw, deal_num);
__bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num);
__bang_sub(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w3, deal_num);
    // accumulate this corner's contribution to grad_attn_weight
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w3,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset3),
(T *)top_grad_temp, deal_num_real);
}
if (h_high <= height - 1 && w_high <= width - 1) {
int32_t offset4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
__memcpy(grad_output_nram_temp, data_value_ptr + offset4,
deal_num_real * sizeof(T), GDRAM2NRAM);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lw, deal_num);
__bang_add(grad_h_weight, grad_h_weight, grad_weight, deal_num);
__bang_mul_scalar(grad_weight, grad_output_nram_temp, lh, deal_num);
__bang_add(grad_w_weight, grad_w_weight, grad_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad_temp, w4, deal_num);
    // accumulate this corner's contribution to grad_attn_weight
__bang_mul_scalar(grad_output_nram_temp, grad_output_nram_temp, w4,
deal_num);
__bang_add(grad_output_nram, grad_output_nram, grad_output_nram_temp,
deal_num);
__bang_atomic_add((T *)top_grad_temp, (T *)(grad_value + offset4),
(T *)top_grad_temp, deal_num_real);
}
__bang_mul(grad_output_nram, grad_output_nram, top_grad, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_output_nram, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
const int32_t align_num_on_200 = NFU_ALIGN_SIZE / sizeof(float);
recursiveSumPool(grad_output_nram, align_num_on_200,
deal_num / align_num_on_200, ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_output_nram, grad_output_nram,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_output_nram, (T *)grad_attn_weight,
(T *)grad_output_nram, 1);
__bang_mul_scalar(grad_w_weight, grad_w_weight, width, deal_num);
__bang_mul_scalar(top_grad_temp, top_grad, data_attn_weight, deal_num);
__bang_mul(grad_w_weight, grad_w_weight, top_grad_temp, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_w_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
recursiveSumPool(grad_w_weight, align_num_on_200, deal_num / align_num_on_200,
ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_w_weight, grad_w_weight,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_w_weight, (T *)(grad_sampling_loc),
(T *)grad_w_weight, 1);
__bang_mul_scalar(grad_h_weight, grad_h_weight, height, deal_num);
__bang_mul(grad_h_weight, grad_h_weight, top_grad_temp, deal_num);
#if __BANG_ARCH__ >= 322
recursiveSumPool(grad_h_weight, 1, deal_num_real, ALIGN_NUM_FOR_REDUCE);
#else
recursiveSumPool(grad_h_weight, align_num_on_200, deal_num / align_num_on_200,
ALIGN_NUM_FOR_REDUCE);
__bang_reduce_sum(grad_h_weight, grad_h_weight,
NFU_ALIGN_SIZE / sizeof(float));
#endif
__bang_atomic_add((T *)grad_h_weight, (T *)(grad_sampling_loc + 1),
(T *)grad_h_weight, 1);
}
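// ---------------------------------------------------------------------------
// Illustrative scalar reference (a hypothetical helper for exposition only;
// nothing in this file calls it). It spells out, for a single channel, the
// same bilinear backward that msDeformAttnCol2imBilinear performs vectorized
// over `deal_num` channels with NRAM buffers and atomic adds:
//   sampled value  S = w1*v1 + w2*v2 + w3*v3 + w4*v4
//   dL/dv_i        = w_i * attn_weight * top_grad
//   dL/dattn       = S * top_grad
//   dL/dlh, dL/dlw follow by differentiating the w_i in lh/lw; the kernel
//   later rescales them by spatial_h/spatial_w since h_im = loc_h * spatial_h
//   - 0.5 (and likewise for w_im).
static inline void scalarCol2imBilinearRef(
    float v1, float v2, float v3, float v4,  // values at the four corners
    float lh, float lw,                      // fractional offsets in [0, 1)
    float top_grad, float attn_weight,
    float grad_v[4], float *grad_lh, float *grad_lw, float *grad_attn) {
  const float hh = 1.0f - lh, hw = 1.0f - lw;
  const float w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
  const float g = top_grad * attn_weight;
  grad_v[0] = w1 * g;  // corner (h_low,  w_low)
  grad_v[1] = w2 * g;  // corner (h_low,  w_high)
  grad_v[2] = w3 * g;  // corner (h_high, w_low)
  grad_v[3] = w4 * g;  // corner (h_high, w_high)
  *grad_attn += (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4) * top_grad;
  *grad_lh += (-hw * v1 - lw * v2 + hw * v3 + lw * v4) * g;
  *grad_lw += (-hh * v1 + hh * v2 - lh * v3 + lh * v4) * g;
}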
__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward(
const float *data_value, const int32_t *spatial_shapes,
const int32_t *data_level_start_index, const float *data_sampling_loc,
const float *data_attn_weight, const float *grad_output,
const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_query,
const int32_t num_points, float *grad_value, float *grad_sampling_loc,
float *grad_attn_weight) {
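  // The MPU core (coreId 0x80) takes no part in the computation; only the
  // IPU cores proceed past this point.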
if (coreId == 0x80) {
return;
}
const int32_t split_num = 8;
const int32_t spatial_shapes_size = 64;
int32_t deal_num = PAD_DOWN(
(MAX_NRAM_SIZE - spatial_shapes_size) / split_num / sizeof(float),
ALIGN_NUM);
float *grad_output_nram = (float *)nram_buffer;
float *grad_output_nram_temp = (float *)nram_buffer + deal_num;
float *grad_weight = (float *)nram_buffer + 2 * deal_num;
float *grad_h_weight = (float *)nram_buffer + 3 * deal_num;
float *grad_w_weight = (float *)nram_buffer + 4 * deal_num;
float *top_grad = (float *)nram_buffer + 5 * deal_num;
float *top_grad_temp = (float *)nram_buffer + 6 * deal_num;
int32_t *spatial_shapes_nram =
(int32_t *)((float *)nram_buffer + 7 * deal_num);
float *sampling_loc_nram =
(float *)nram_buffer + 7 * deal_num + 2 * sizeof(int32_t);
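  // NRAM layout: eight deal_num-sized float slices. Slices 0-6 are the
  // scratch buffers named above; spatial_shapes_nram and sampling_loc_nram
  // live at the start of slice 7. (The sampling_loc_nram offset advances
  // 2*sizeof(int32_t) float *slots*, i.e. 32 bytes rather than 8, which
  // still lands well inside the slice, so the over-stride is harmless.)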
const int32_t total_num = batch * num_query * num_heads * num_levels;
int32_t num_per_core = total_num / taskDim;
int32_t num_rem = total_num % taskDim;
num_per_core = num_per_core + int32_t(taskId < num_rem);
int32_t start_per_core =
num_rem > taskId
? (taskId * num_per_core)
: ((num_per_core + 1) * num_rem + (taskId - num_rem) * num_per_core);
int32_t end_per_core = start_per_core + num_per_core;
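  // Work is block-partitioned over the taskDim cores, with the first num_rem
  // cores taking one extra item. E.g. total_num = 10, taskDim = 4 gives
  // per-core counts 3, 3, 2, 2 starting at 0, 3, 6, 8.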
const int32_t C_repeat = channels / deal_num;
const int32_t C_tail = channels % deal_num;
const int32_t qid_stride = num_heads * channels;
int32_t base_ptr = 0;
for (int32_t num_loop = start_per_core; num_loop < end_per_core; ++num_loop) {
const int32_t l_col = num_loop % num_levels;
const int32_t m_col = num_loop / num_levels % num_heads;
const int32_t q_col = num_loop / num_levels / num_heads % num_query;
const int32_t b_col = num_loop / num_query / num_heads / num_levels;
int32_t data_weight_ptr = num_loop * num_points;
int32_t data_loc_w_ptr = data_weight_ptr << 1;
const int32_t value_offset = b_col * spatial_size * num_heads * channels;
const int32_t level_start_id = data_level_start_index[l_col];
int32_t spatial_h_ptr = l_col << 1;
int32_t grad_output_offset = b_col * num_query * num_heads * channels +
q_col * num_heads * channels +
m_col * channels;
__memcpy(spatial_shapes_nram, spatial_shapes + spatial_h_ptr,
2 * sizeof(int32_t), GDRAM2NRAM);
const int32_t spatial_h = spatial_shapes_nram[0];
const int32_t spatial_w = spatial_shapes_nram[1];
const int32_t value_ptr_offset = value_offset + level_start_id * qid_stride;
const float *data_value_ptr = data_value + value_ptr_offset;
float *grad_value_ptr = grad_value + value_ptr_offset;
const int32_t grad_attn_weight_out = num_loop * num_points;
const int32_t grad_sampling_loc_out = num_loop * num_points * 2;
for (int32_t p_col = 0; p_col < num_points; ++p_col) {
__memcpy(sampling_loc_nram, data_sampling_loc + data_loc_w_ptr,
2 * sizeof(float), GDRAM2NRAM);
const float loc_w = sampling_loc_nram[0];
const float loc_h = sampling_loc_nram[1];
const float weight = data_attn_weight[data_weight_ptr];
const float h_im = loc_h * spatial_h - 0.5;
const float w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
const int32_t h_low = floorf(h_im);
const int32_t w_low = floorf(w_im);
const int32_t h_high = h_low + 1;
const int32_t w_high = w_low + 1;
const float lh = h_im - h_low;
const float lw = w_im - w_low;
const float hh = 1.0 - lh;
const float hw = 1.0 - lw;
const int32_t w_stride = num_heads * channels;
const int32_t h_stride = spatial_w * w_stride;
const int32_t h_low_ptr_offset = h_low * h_stride;
const int32_t h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int32_t w_low_ptr_offset = w_low * w_stride;
const int32_t w_high_ptr_offset = w_low_ptr_offset + w_stride;
float w1 = hh * hw;
float w2 = hh * lw;
float w3 = lh * hw;
float w4 = lh * lw;
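        // Bilinear corner weights: w1/w2/w3/w4 pair with (h_low, w_low),
        // (h_low, w_high), (h_high, w_low), (h_high, w_high); see the scalar
        // reference sketch above.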
for (int32_t C_loop = 0; C_loop < C_repeat; ++C_loop) {
base_ptr = m_col * channels + C_loop * deal_num;
__bang_write_zero(grad_weight, 3 * deal_num);
__bang_write_zero(grad_output_nram, deal_num);
__memcpy(top_grad,
grad_output + grad_output_offset + C_loop * deal_num,
deal_num * sizeof(float), GDRAM2NRAM);
msDeformAttnCol2imBilinear(
top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
weight, grad_h_weight, grad_w_weight, grad_value_ptr,
grad_output_nram, grad_weight,
grad_sampling_loc + grad_sampling_loc_out + p_col * 2,
grad_attn_weight + grad_attn_weight_out + p_col,
grad_output_nram_temp, deal_num, deal_num, data_value_ptr);
}
if (C_tail != 0) {
base_ptr = m_col * channels + C_repeat * deal_num;
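          // Clearing all eight NRAM slices here also wipes the spatial-shape
          // and sampling-loc scratch, which is safe: both were already read
          // into locals and are re-copied at the top of each point loop.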
__bang_write_zero(grad_output_nram, 8 * deal_num);
__memcpy(top_grad,
grad_output + grad_output_offset + C_repeat * deal_num,
C_tail * sizeof(float), GDRAM2NRAM);
msDeformAttnCol2imBilinear(
top_grad_temp, spatial_h, spatial_w, w1, w2, w3, w4, h_low, w_low,
h_high, w_high, base_ptr, h_low_ptr_offset, w_low_ptr_offset,
h_high_ptr_offset, w_high_ptr_offset, hh, hw, lh, lw, top_grad,
weight, grad_h_weight, grad_w_weight, grad_value_ptr,
grad_output_nram, grad_weight,
grad_sampling_loc + grad_sampling_loc_out + p_col * 2,
grad_attn_weight + grad_attn_weight_out + p_col,
grad_output_nram_temp, deal_num, C_tail, data_value_ptr);
}
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
}
__mlu_global__ void MLUUnion1KernelMsDeformAttnBackward(
const float *data_value, const int32_t *spatial_shapes,
const int32_t *data_level_start_index, const float *data_sampling_loc,
const float *data_attn_weight, const float *grad_output,
const int32_t batch, const int32_t spatial_size, const int32_t num_heads,
const int32_t channels, const int32_t num_levels, const int32_t num_query,
const int32_t num_points, float *grad_value, float *grad_sampling_loc,
float *grad_attn_weight);
void KernelMsDeformAttnBackward(
cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
const cnrtDataType_t d_type, const float *data_value,
const int32_t *spatial_shapes, const int32_t *data_level_start_index,
const float *data_sampling_loc, const float *data_attn_weight,
const float *grad_output, const int32_t batch, const int32_t spatial_size,
const int32_t num_heads, const int32_t channels, const int32_t num_levels,
const int32_t num_query, const int32_t num_points, float *grad_value,
float *grad_sampling_loc, float *grad_attn_weight) {
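  // Host-side wrapper for the backward pass: launches the Union1 kernel on
  // `queue`. The taskDim implied by k_dim/k_type is what the per-core
  // partitioning at the top of the kernel divides work across; d_type is
  // unused here since only the float path is implemented.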
MLUUnion1KernelMsDeformAttnBackward<<<k_dim, k_type, queue>>>(
data_value, spatial_shapes, data_level_start_index, data_sampling_loc,
data_attn_weight, grad_output, batch, spatial_size, num_heads, channels,
num_levels, num_query, num_points, grad_value, grad_sampling_loc,
grad_attn_weight);
}