Commit 6f3c5f1c authored by limm's avatar limm
Browse files

support v1.4.0

parent 6f674c7e
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "nms_utils.hpp"
// Number of coordinates per box (x1, y1, x2, y2).
#define COORD_DIM (4)
// Per-core NRAM working budget.
// NOTE(review): 62KB is reserved headroom (presumably stack/temporaries) --
// confirm against REM_FOR_STACK's definition in the helper header.
#define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
// Per-core on-chip scratch (NRAM) and per-cluster shared scratch (SRAM).
__nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
__mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];
// Where the working score/box data resides during the NMS loop.
enum Addr { SRAM, GDRAM };
// Core NMS loop for the Block / Union1 launch configurations.
// Repeatedly (up to max_output_size times): find the box with the highest
// score, record it (index or full box info depending on output_mode), then
// zero the scores of all boxes whose IoU with it exceeds thresh_iou.
// Scores in input_data_score are destroyed (zeroed) as boxes are selected.
//
// output_box_num   [out] number of results written to output_dram.
// output_mode      0: store indices; otherwise store score,x1,y1,x2,y2.
// input_data_box   boxes in struct-of-arrays order: x1[], y1[], x2[], y2[].
// input_ram        SRAM or GDRAM: where score/box data currently lives.
// sram             cluster-shared scratch; also carries the exit flag.
// core_limit       1 for Block, 4 for Union1.
// offset           extra width/height added when algo == 1 (pixel offset).
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection(
uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
IN_DT *sram, const int core_limit, const int input_num_boxes,
const int max_output_size, const float thresh_iou, const float thresh_score,
const float offset, const int algo) {
// global value: exit_flag lives in SRAM so all cores of the cluster see it.
// NOTE(review): the 28-element offset presumably skips 4 cores * REDUCE_NUM
// (7) reduce slots used by findClusterMaxBox -- confirm.
int32_t *exit_flag = (int32_t *)(sram + 28);
exit_flag[0] = 0;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr: boxes are stored as four consecutive planes of length
// input_num_boxes (x1, y1, x2, y2).
const IN_DT *input_x1_ptr = input_data_box;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int input_offset = 0; // offset of input_data for current core
int nram_save_count = 0;
// Work out how many boxes fit in one NRAM pass: the budget is split into
// nms_buffer_count1 equal segments plus max_box and the staging buffer.
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
// 5 means: score, x1, y1, x2, y2
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * 5 * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
input_offset, max_seg_pad, repeat, remain,
remain_pad, max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the data ptr: carve NRAM into nine max_seg_pad segments, then
// max_box (one NFU line) and the nram_save staging buffer.
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
#if __BANG_ARCH__ >= 300
// On >= 300 the winner's corners are kept in order-corrected fp32 scalars.
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= thresh_score
if (core_limit != 1) {
__sync_cluster(); // sync before current loop
}
/******FIND MAX START******/
int max_index = 0; // the max score index
int global_max_index = 0; // for U1
float max_area = 0; // the max score area
max_box[0] = 0; // init 0
// Each core scans its slice of the scores for the local maximum.
findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
if (core_limit == 1) {
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
// Consume the selected box so it is never picked again.
input_data_score[max_index] = 0;
global_max_index = max_index;
} else if (core_limit == 4) {
// Union1: reduce the four per-core maxima through SRAM.
__sync_cluster();
findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
input_data_score[global_max_index] = 0;
}
// by now, we get: max_score|max_index|max_box|max_area
/******FIND MAX END******/
// Stage the result in NRAM; flushed to output_dram in batches.
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
// if the max score <= thresh_score, end
if (core_limit == 1) {
if (float(max_box[0]) <= thresh_score) {
break;
}
} else {
// Multi-core: publish the exit decision through SRAM so that every core
// leaves the loop on the same iteration.
if (float(max_box[0]) <= thresh_score) {
if (coreId == 0) {
exit_flag[0] = 1;
}
}
__sync_cluster();
if (exit_flag[0] == 1) {
break;
}
}
/******NMS STORE END******/
// Suppress: zero the score of every box whose IoU with the winner exceeds
// thresh_iou (done in scoreUpdate).
#if __BANG_ARCH__ >= 300
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
max_box_y1, max_box_x2, max_box_y2, nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#else
scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
max_box[2], max_box[3], max_box[4], nram_save,
repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
input_offset, offset, max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
// Entry kernel for Block / Union1 NMS.
// Copies the scores into `workspace` (they are zeroed destructively during
// selection), dispatches to the fp32 or fp16 instantiation of
// nms_detection, and writes the number of kept boxes to result_num.
// Data types other than fp16/fp32 are silently treated as fp16 (no copy is
// performed), matching the original behavior.
__mlu_global__ void MLUUnion1KernelNMS(
    const void *input_boxes, const void *input_confidence,
    const int input_num_boxes, const int max_output_size,
    const float iou_threshold, const float confidence_threshold,
    const int output_mode, void *workspace, void *result_num, void *output,
    const cnrtDataType_t data_type_input, const float offset, const int algo) {
  const bool is_fp32 = (data_type_input == CNRT_FLOAT32);
  const bool is_fp16 = (data_type_input == CNRT_FLOAT16);
  // Stage a mutable copy of the confidences in the GDRAM workspace.
  if (is_fp16 || is_fp32) {
    const int dwidth = is_fp32 ? sizeof(float) : sizeof(half);
    __memcpy(workspace, input_confidence, input_num_boxes * dwidth,
             GDRAM2GDRAM);
  }
  uint32_t kept_box_count = 0;
  float *score_data = (float *)workspace;
  float *boxes_data = (float *)input_boxes;
  float *sram = (float *)sram_buffer;
  if (output_mode == 0) {
    // Index-only output.
    if (is_fp32) {
      nms_detection(kept_box_count, output_mode, (uint32_t *)output,
                    score_data, boxes_data, GDRAM, sram, taskDim,
                    input_num_boxes, max_output_size, iou_threshold,
                    confidence_threshold, offset, algo);
    } else {
      nms_detection(kept_box_count, output_mode, (uint32_t *)output,
                    (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
                    taskDim, input_num_boxes, max_output_size, iou_threshold,
                    confidence_threshold, offset, algo);
    }
  } else {
    // Full box-info output, in the input precision.
    if (is_fp32) {
      nms_detection(kept_box_count, output_mode, (float *)output, score_data,
                    boxes_data, GDRAM, sram, taskDim, input_num_boxes,
                    max_output_size, iou_threshold, confidence_threshold,
                    offset, algo);
    } else {
      nms_detection(kept_box_count, output_mode, (half *)output,
                    (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
                    taskDim, input_num_boxes, max_output_size, iou_threshold,
                    confidence_threshold, offset, algo);
    }
  }
  ((uint32_t *)result_num)[0] = kept_box_count;
}
// Core NMS loop for UnionX (multi-cluster) launches. Same algorithm as
// nms_detection, but the per-iteration max reduction spans clusters:
// core 0 of every cluster finds its cluster's local max, the per-cluster
// maxima are reduced either through GDRAM (arch >= 590, via cdma_gdram) or
// through cluster 0's SRAM (findGlobalMaxBox), and the winner is broadcast
// back to every core.
//
// exit_flag lives in GDRAM so every cluster observes the stop condition.
// score_data is zeroed in place as boxes are selected.
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void nms_detection_ux(
int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram,
IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
const int input_num_boxes, const int max_output_size,
const float thresh_iou, const float thresh_score, const float offset,
const int output_mode, const int algo, char *cdma_gdram) {
exit_flag[0] = 0;
IN_DT *sram = (IN_DT *)sram_buffer;
// score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
int nms_buffer_count1 = 9;
// temp nram buffer to store selected target.
int nram_save_limit_count = 256;
float div_thresh_iou = 1.0 / thresh_iou;
// input data ptr: boxes stored as four planes (x1, y1, x2, y2).
const IN_DT *input_x1_ptr = boxes_data;
const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;
int limit = 0; // find limit when GDRAM or SRAM
int max_seg_pad = 0; // the max length every repeat
int repeat = 0;
int remain = 0;
int remain_pad = 0;
int nram_save_count = 0;
// NRAM budget per segment; same layout as in nms_detection.
if (output_mode == 0) {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
} else {
limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
nram_save_limit_count * INFO_NUM * sizeof(OUT_DT)) /
(nms_buffer_count1 * sizeof(IN_DT));
}
int input_offset = 0;
int max_seg_iou_compute = 0;
int repeat_iou_compute = 0;
int remain_iou_compute = 0;
int remain_pad_iou_compute = 0;
getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
max_seg_pad, repeat, remain, remain_pad,
max_seg_iou_compute, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute);
// init the nram ptr: nine max_seg_pad segments + max_box + save buffer.
IN_DT *score = (IN_DT *)nram_buffer;
IN_DT *x1 = score + max_seg_pad;
IN_DT *y1 = x1 + max_seg_pad;
IN_DT *x2 = y1 + max_seg_pad;
IN_DT *y2 = x2 + max_seg_pad;
IN_DT *inter_x1 = y2 + max_seg_pad;
IN_DT *inter_y1 = inter_x1 + max_seg_pad;
IN_DT *inter_x2 = inter_y1 + max_seg_pad;
IN_DT *inter_y2 = inter_x2 + max_seg_pad;
IN_DT *max_box = inter_y2 + max_seg_pad; // the max score, x1, y1, x2, y2
OUT_DT *nram_save =
(OUT_DT *)((char *)max_box +
NFU_ALIGN_SIZE); // offset two line from max_box
#if __BANG_ARCH__ >= 300
// Order-corrected fp32 corners of the current winner (see calMaxArea).
float max_box_x1 = 0;
float max_box_y1 = 0;
float max_box_x2 = 0;
float max_box_y2 = 0;
#endif
mluMemcpyDirection_t load_dir = SRAM2NRAM;
mluMemcpyDirection_t store_dir = NRAM2SRAM;
load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
for (int keep = 0; keep < max_output_size;
keep++) { // loop until the max_score <= thresh_score
__sync_all();
int max_index = 0;
int global_max_index = 0; // for Ux
float max_area = 0; // the max score area
max_box[0] = 0; // init 0
// Core 0 of each cluster scans the cluster's slice for its local max and
// publishes the REDUCE_NUM-element candidate to the cluster's SRAM.
if (coreId == 0) {
findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
input_offset, repeat, remain, remain_pad, max_seg_pad,
max_index);
// copy max box info to sram
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_all();
#if __BANG_ARCH__ >= 590
// Cross-cluster reduction through GDRAM: gather every cluster's candidate
// into cdma_gdram, let cluster 0 pick the winner, leave it at offset 0.
__memcpy((char *)cdma_gdram + REDUCE_NUM * clusterId * sizeof(IN_DT), sram,
REDUCE_NUM * sizeof(IN_DT), SRAM2GDRAM);
__sync_all();
if (clusterId == 0 && coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
// Strided gather: one score per cluster into consecutive NRAM slots.
__memcpy((char *)inter_x1, (char *)cdma_gdram, sizeof(IN_DT), GDRAM2NRAM,
sizeof(IN_DT), REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
// __bang_max leaves the max value at [0] and its index at [1].
__bang_max(max_box, inter_x1, NMS_SIZE);
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy((char *)cdma_gdram,
(char *)cdma_gdram + max_cluster * REDUCE_NUM * sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), GDRAM2GDRAM);
}
__sync_all();
__memcpy(max_box, cdma_gdram, REDUCE_NUM * sizeof(IN_DT), GDRAM2NRAM);
#else
findGlobalMaxBox(max_box, sram, inter_x1);
#endif
#if __BANG_ARCH__ >= 300
calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
max_box_x2, max_box_y2);
#else
calMaxArea(max_box, algo, offset, max_area);
#endif
global_max_index = ((uint32_t *)(max_box + 5))[0];
// Every compute core zeroes the winner's score so it cannot win again.
if (coreId != MEMORY_CORE) {
score_data[global_max_index] = 0;
}
storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
max_output_size, thresh_score, output_mode, nram_save_count,
output_box_num);
// Publish the stop condition through GDRAM so all clusters exit together.
if (float(max_box[0]) <= thresh_score) {
if (clusterId == 0 && coreId == 0) {
exit_flag[0] = 1; // dram
}
}
__sync_all();
if (exit_flag[0] == 1) {
break;
}
/******NMS STORE END******/
// Suppress boxes overlapping the winner beyond thresh_iou.
#if __BANG_ARCH__ >= 300
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#else
scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
max_box[3], max_box[4], nram_save, repeat_iou_compute,
remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
max_area, input_num_boxes, algo);
#endif
} // for max_output_size
}
// Entry kernel for UnionX NMS.
// Lays out the GDRAM workspace as [mutable scores | exit_flag | cdma reduce
// area]. When both the scores and the boxes fit in the SRAM left over after
// the reduce slots, they are staged there (input_ram = SRAM); otherwise only
// the scores are copied into the workspace and boxes stay in GDRAM. Then
// dispatches to the fp32 or fp16 instantiation of nms_detection_ux and
// reports the number of kept boxes via result_num.
__mlu_global__ void MLUUionXKernelNMS(
    const void *input_boxes, const void *input_confidence,
    const int input_num_boxes, const int max_output_size,
    const float iou_threshold, const float confidence_threshold,
    const float offset, const cnrtDataType_t data_type_input,
    const int output_mode, const int algo, void *workspace, void *result_num,
    void *output) {
  const int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
  // exit_flag and the cross-cluster reduce area live right after the score
  // copy inside the workspace.
  int32_t *exit_flag =
      (int32_t *)((char *)workspace + INFO_NUM * input_num_boxes * input_dwidth);
  char *cdma_addr = (char *)exit_flag + sizeof(int32_t);
  const int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
  const int available_sram_size = SIZE_SRAM_BUF - reduce_sram_size;
  const int cluster_score_size = input_num_boxes * input_dwidth;
  const int cluster_boxes_size = input_num_boxes * COORD_DIM * input_dwidth;
  char *sram_score = (char *)sram_buffer + reduce_sram_size;
  char *sram_boxes = sram_score + cluster_score_size;
  Addr input_ram = GDRAM;
  if (cluster_score_size + cluster_boxes_size < available_sram_size) {
    // Everything fits on-chip: stage scores and boxes in SRAM.
    input_ram = SRAM;
    __memcpy(sram_score, input_confidence, cluster_score_size, GDRAM2SRAM);
    __memcpy(sram_boxes, input_boxes, cluster_boxes_size, GDRAM2SRAM);
  } else {
    // Too large: keep data in GDRAM, but scores still need a mutable copy.
    __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
  }
  __sync_cluster();
  uint32_t kept_box_count = 0;
  const bool on_sram = (input_ram == SRAM);
  float *score_data = on_sram ? (float *)sram_score : (float *)workspace;
  float *boxes_data = on_sram ? (float *)sram_boxes : (float *)input_boxes;
  if (output_mode == 0) {
    if (data_type_input == CNRT_FLOAT32) {
      nms_detection_ux(exit_flag, kept_box_count, (uint32_t *)output,
                       score_data, boxes_data, input_ram, input_num_boxes,
                       max_output_size, iou_threshold, confidence_threshold,
                       offset, output_mode, algo, cdma_addr);
    } else {
      nms_detection_ux(exit_flag, kept_box_count, (uint32_t *)output,
                       (half *)score_data, (half *)boxes_data, input_ram,
                       input_num_boxes, max_output_size, iou_threshold,
                       confidence_threshold, offset, output_mode, algo,
                       cdma_addr);
    }
  } else {
    if (data_type_input == CNRT_FLOAT32) {
      nms_detection_ux(exit_flag, kept_box_count, (float *)output, score_data,
                       boxes_data, input_ram, input_num_boxes, max_output_size,
                       iou_threshold, confidence_threshold, offset, output_mode,
                       algo, cdma_addr);
    } else {
      nms_detection_ux(exit_flag, kept_box_count, (half *)output,
                       (half *)score_data, (half *)boxes_data, input_ram,
                       input_num_boxes, max_output_size, iou_threshold,
                       confidence_threshold, offset, output_mode, algo,
                       cdma_addr);
    }
  }
  ((uint32_t *)result_num)[0] = kept_box_count;
}
// Host-side launcher: picks the NMS kernel variant matching the requested
// task type and enqueues it on `queue`. All launches use
// confidence_threshold = 0.0, output_mode = 0 (index output) and algo = 1.
// Unsupported task types launch nothing.
void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
               const cnrtDataType_t data_type_input, const void *boxes_ptr,
               const void *scores_ptr, const int input_num_boxes,
               const int max_output_boxes, const float iou_threshold,
               const float offset, void *workspace_ptr, void *output_size_ptr,
               void *output_ptr) {
  switch (k_type) {
    case CNRT_FUNC_TYPE_BLOCK:
    case CNRT_FUNC_TYPE_UNION1:
      // Single-cluster path.
      MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
          (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
          max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
          /*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
          data_type_input, offset, /*algo=*/1);
      break;
    case CNRT_FUNC_TYPE_UNION2:
    case CNRT_FUNC_TYPE_UNION4:
    case CNRT_FUNC_TYPE_UNION8:
    case CNRT_FUNC_TYPE_UNION16:
      // Multi-cluster path.
      MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
          (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
          max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
          data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
          output_size_ptr, output_ptr);
      break;
    default:
      break;
  }
}
/*************************************************************************
* Copyright (C) [2019-2022] by Cambricon, Inc.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef NMS_UTILS_HPP_
#define NMS_UTILS_HPP_
#include "common_mlu_helper.hpp"
#define NMS_SIZE (64)
// Round x up / down to the nearest multiple of y (y > 0). Arguments and the
// full expansion are parenthesized so expressions such as NMS_UP(a + b, c)
// expand correctly (the original forms mis-expanded compound arguments).
#define NMS_UP(x, y) ((((x) / (y)) + (int)((x) % (y) > 0)) * (y))
#define NMS_DOWN(x, y) (((x) / (y)) * (y))
#define INFO_NUM (5)  // 5 means x1, x2, y1, y2 and score
#define MEMORY_CORE (0x80)
#define REDUCE_NUM \
  (7)  // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
// Acquire the inter-core hardware lock (MLU270 only; no-op elsewhere).
// The memory core never participates in the lock.
__mlu_func__ void pvLock() {
#if __BANG_ARCH__ == 270
  if (coreId == MEMORY_CORE) {
    return;
  }
  __bang_lock(0, 0);
#endif
}
// Release the inter-core hardware lock (MLU270 only; no-op elsewhere).
// Mirrors pvLock: the memory core never participates.
__mlu_func__ void pvUnlock() {
#if __BANG_ARCH__ == 270
  if (coreId == MEMORY_CORE) {
    return;
  }
  __bang_unlock(0, 0);
#endif
}
// Clamped ReLU ("ReluN"): nram_dst[i] = min(max(nram_src[i], 0), threshold)
// elementwise over deal_num elements. threshold == 0 degenerates to plain
// ReLU; a negative threshold makes the call a no-op.
// nram_tmp: NRAM scratch, used only on the pre-300 fallback path; the layout
// there requires deal_num + 2 * (NFU_ALIGN_SIZE / sizeof(T)) elements.
template <typename T>
static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp,
const int deal_num,
const T threshold = 0) {
if (threshold < 0) {
return;
}
if (threshold) {
#if __BANG_ARCH__ >= 300
__bang_relun(nram_dst, nram_src, deal_num, threshold);
#else
// Emulate relun with 0/1 masks:
//   aux_a = (src < threshold)           per-element mask
//   dst   = src * aux_a                 values >= threshold zeroed
//   aux_a = (aux_a == 0) * threshold    threshold where src >= threshold
//   dst  += aux_a                       clamp those slots to threshold
//   dst  *= (dst > 0)                   final ReLU
int align_num = NFU_ALIGN_SIZE / sizeof(T);
T *nram_aux_a = (T *)nram_tmp;
T *nram_aux_b = nram_aux_a + deal_num;
T *nram_zero = nram_aux_b + align_num;
__bang_write_value(nram_aux_b, align_num, threshold);
__bang_write_zero(nram_zero, align_num);
__bang_cycle_lt((T *)nram_aux_a, nram_src, (T *)nram_aux_b, deal_num,
align_num);
__bang_mul(nram_dst, nram_src, (T *)nram_aux_a, deal_num);
__bang_cycle_eq((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_zero, deal_num,
align_num);
__bang_cycle_mul((T *)nram_aux_a, (T *)nram_aux_a, (T *)nram_aux_b,
deal_num, align_num);
__bang_add(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
__bang_cycle_gt((T *)nram_aux_a, nram_dst, (T *)nram_zero, deal_num,
align_num);
__bang_mul(nram_dst, nram_dst, (T *)nram_aux_a, deal_num);
#endif
} else {
#if __BANG_ARCH__ >= 300
__bang_relu(nram_dst, nram_src, deal_num);
#else
__bang_active_relu(nram_dst, nram_src, deal_num);
#endif
}
}
// Compute per-core tiling parameters for Block / Union1 launches.
// Boxes are divided evenly across core_limit cores; the first
// (input_box_num % core_limit) cores each take one extra box.
// input_offset              [out] first box index owned by this core.
// max_seg_pad               [out] NRAM segment length, multiple of NMS_SIZE.
// repeat/remain/remain_pad  [out] tiling of this core's slice for the
//                           max-score scan.
// *_iou_compute             [out] tiling for the IoU stage, where fp16 is
//                           widened to fp32 (segment shrinks by
//                           4 / input_dwidth).
__mlu_func__ void getComputeParamsBlockOrU1(
    const int input_dwidth, const int input_box_num, const int limit,
    const int core_limit, int &input_offset, int &max_seg_pad, int &repeat,
    int &remain, int &remain_pad, int &max_seg_iou_compute,
    int &repeat_iou_compute, int &remain_iou_compute,
    int &remain_pad_iou_compute) {
  const int base = input_box_num / core_limit;
  const int extra = input_box_num % core_limit;
  // This core's share: one extra box for the first `extra` cores.
  int boxes_this_core = base;
  if (coreId < extra) {
    boxes_this_core += 1;
  }
  input_offset = base * coreId + (coreId <= extra ? coreId : extra);
  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
  repeat = boxes_this_core / max_seg_pad;
  remain = boxes_this_core % max_seg_pad;
  remain_pad = NMS_UP(remain, NMS_SIZE);
  // if datatype is fp16, we should cvt to fp32 when computing iou
  max_seg_iou_compute = NMS_DOWN(max_seg_pad / (4 / input_dwidth), NMS_SIZE);
  repeat_iou_compute = boxes_this_core / max_seg_iou_compute;
  remain_iou_compute = boxes_this_core % max_seg_iou_compute;
  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
// Compute per-core tiling parameters for UnionX launches.
// Boxes are split first across clusters, then across the coreDim compute
// cores of each cluster; the first `rem` units take one extra box.
// input_offset              [out] first box index owned by this core.
// max_seg_pad               [out] NRAM segment length, multiple of NMS_SIZE.
// repeat/remain/remain_pad  [out] tiling of the whole *cluster* slice --
//                           used by core 0 when scanning for the max score.
// *_iou_compute             [out] tiling of this *core's* slice for the IoU
//                           stage, where fp16 is widened to fp32 (segment
//                           shrinks by sizeof(float) / input_dwidth).
__mlu_func__ void getComputeParamsUx(
const int input_dwidth, const int input_num_boxes, const int limit,
int &input_offset, int &max_seg_pad, int &repeat, int &remain,
int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute,
int &remain_iou_compute, int &remain_pad_iou_compute) {
// data split
int avg_cluster = input_num_boxes / clusterDim;
int rem_cluster = input_num_boxes % clusterDim;
int len_cluster = avg_cluster + (clusterId < rem_cluster);
int cluster_offset = avg_cluster * clusterId +
(clusterId <= rem_cluster ? clusterId : rem_cluster);
int avg_core = len_cluster / coreDim;
int rem_core = len_cluster % coreDim;
int len_core = avg_core + (coreId < rem_core);
int core_offset =
avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
input_offset = cluster_offset + core_offset;
max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
// core 0 of each cluster calculates the max score index over the whole
// cluster slice, hence the cluster-sized tiling here.
int max_index_len_core = avg_cluster + (clusterId < rem_cluster);
repeat = max_index_len_core / max_seg_pad;
remain = max_index_len_core % max_seg_pad;
remain_pad = NMS_UP(remain, NMS_SIZE);
// if datatype is fp16, we should cvt to fp32 when compute iou
max_seg_iou_compute =
NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
repeat_iou_compute = len_core / max_seg_iou_compute;
remain_iou_compute = len_core % max_seg_iou_compute;
remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
}
// Reduce the per-cluster max boxes (REDUCE_NUM values each, staged in every
// cluster's SRAM slot 0) to the single global max box, and broadcast it so
// that every core returns with the winner in max_box.
// inter_x1: NMS_SIZE elements of NRAM scratch for the score reduction.
template <typename IN_DT>
__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram,
IN_DT *inter_x1) {
// copy all partial max to the sram of cluster 0
// (the trailing __memcpy argument is the destination cluster id)
if (clusterId != 0) {
__memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
SRAM2SRAM, 0);
}
__sync_all();
// reduce between clusters to get the global max box
if (clusterId == 0) {
if (coreId == 0) {
__bang_write_zero(inter_x1, NMS_SIZE);
// strided gather: one score per cluster (stride REDUCE_NUM elements)
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
// __bang_max leaves the max value at [0] and its index at [1]
__bang_max(max_box, inter_x1, NMS_SIZE);
// the index slot is 16-bit for half data, 32-bit for float data
int max_cluster = (sizeof(IN_DT) == sizeof(half))
? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
__memcpy(max_box, sram + max_cluster * REDUCE_NUM,
REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
__memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
}
__sync_cluster();
// 0x80 is MEMORY_CORE: the SRAM-owning core performs SRAM-to-SRAM copies
if (coreId == 0x80 && clusterDim > 1) {
// broadcast global max box to each cluster's sram
for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
__memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
cluster_idx);
}
}
__sync_cluster();
}
__sync_all();
// copy the global max box to max_box
__memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
}
// Scan this core's slice of the scores for the maximum value.
// On return (compute cores only) max_box holds
// [score, x1, y1, x2, y2, (uint32)max_index], where max_index is the
// winner's absolute index into the input arrays.
// score / inter_x1 are NRAM scratch (max_seg_pad elements each); the slice
// is processed in `repeat` full segments plus an optional `remain`-element
// tail padded to remain_pad.
template <typename IN_DT>
__mlu_func__ void findCoreMaxBox(
IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box,
const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr,
const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr,
const mluMemcpyDirection_t load_dir, const int input_offset,
const int repeat, const int remain, const int remain_pad,
const int max_seg_pad, int &max_index) {
// 0x80 is MEMORY_CORE; only compute cores take part
if (coreId != 0x80) {
for (int i = 0; i <= repeat; i++) {
if (i == repeat && remain == 0) {
break;
}
int seg_len = 0; // the length every nms compute
int cpy_len = 0; // the length every nms memcpy
i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
/******NMS LOAD START******/
// zero-fill first so the padded tail cannot win the max reduction
__bang_write_zero(score, seg_len);
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
/******NMS LOAD END******/
// __bang_max leaves the max value at [0] and its index at [1]
__bang_max(inter_x1, score, seg_len);
if (inter_x1[0] > max_box[0]) {
max_box[0] = inter_x1[0];
// the index slot is 16-bit for half data, 32-bit for float data
if (sizeof(IN_DT) == sizeof(half)) {
max_index = ((uint16_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
} else if (sizeof(IN_DT) == sizeof(float)) {
max_index = ((uint32_t *)inter_x1)[1] + input_offset +
i * max_seg_pad; // offset start from head of input_data
}
}
} // for repeat
// the max box's x1, y1, x2, y2 on every core
// NOTE(review): if this core owns no boxes, max_index stays 0 and box 0's
// coordinates are reported alongside score 0 -- harmless in the later
// score-based reduction, but worth confirming.
max_box[1] = input_x1_ptr[max_index];
max_box[2] = input_y1_ptr[max_index];
max_box[3] = input_x2_ptr[max_index];
max_box[4] = input_y2_ptr[max_index];
((uint32_t *)(max_box + 5))[0] = max_index;
}
}
// Reduce the per-core max boxes of one cluster (Union1) to the cluster max.
// Each core publishes its REDUCE_NUM-element candidate to its SRAM slot,
// then every core re-reads all slots and ends with the same winning box in
// max_box. inter_x1 is 64 elements of NRAM scratch.
// NOTE(review): input_data_score and core_limit are unused in this body.
template <typename IN_DT>
__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box,
IN_DT *inter_x1, IN_DT *input_data_score,
const int core_limit) {
// find the max with sram
// copy every core's box info to sram, form: score---x1---y1---x2---y2---
__memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT),
NRAM2SRAM); // int32_t datatype
__sync_cluster();
// copy score from sram to nram and find the max
__bang_write_zero(inter_x1, 64);
// strided gather: one score per core (stride REDUCE_NUM elements)
__memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
REDUCE_NUM * sizeof(IN_DT), coreDim - 1);
// __bang_max leaves the max value at [0] and its index at [1]
__bang_max(max_box, inter_x1, 64);
// the index slot is 16-bit for half data, 32-bit for float data
int max_core = sizeof(IN_DT) == sizeof(half) ? ((uint16_t *)max_box)[1]
: ((uint32_t *)max_box)[1];
// copy the max box to max_box
__memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT),
SRAM2NRAM);
}
/*****************************************************************************/
/*******************************CALCULATE MAX AREA****************************/
/*****************************************************************************/
// Area of the selected box. max_box layout: [score, x1, y1, x2, y2].
// When algo == 1 and a non-zero offset is given, the width and height are
// each padded by `offset` (pixel-offset convention) before multiplying.
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area) {
  const float width = (float)max_box[3] - (float)max_box[1];
  const float height = (float)max_box[4] - (float)max_box[2];
  if (algo != 0 && offset != 0.0) {
    max_area = (width + offset) * (height + offset);
  } else {
    max_area = width * height;
  }
}
// Area of the selected box, additionally returning its corners as fp32 with
// each coordinate pair sorted. Random inf inputs can violate x1 <= x2 or
// y1 <= y2; ordering each pair keeps the computed area non-negative.
// When algo == 1 and a non-zero offset is given, width and height are each
// padded by `offset` before multiplying.
template <typename IN_DT>
__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
                             float &max_area, float &max_box_x1,
                             float &max_box_y1, float &max_box_x2,
                             float &max_box_y2) {
  // Sort the x pair.
  if (max_box[1] > max_box[3]) {
    max_box_x1 = float(max_box[3]);
    max_box_x2 = float(max_box[1]);
  } else {
    max_box_x1 = float(max_box[1]);
    max_box_x2 = float(max_box[3]);
  }
  // Sort the y pair.
  if (max_box[2] > max_box[4]) {
    max_box_y1 = float(max_box[4]);
    max_box_y2 = float(max_box[2]);
  } else {
    max_box_y1 = float(max_box[2]);
    max_box_y2 = float(max_box[4]);
  }
  float width = max_box_x2 - max_box_x1;
  float height = max_box_y2 - max_box_y1;
  if (algo != 0 && offset != 0.0) {
    width += offset;
    height += offset;
  }
  max_area = width * height;
}
/***********************************************************************/
/*******************************STORE RESULT****************************/
/***********************************************************************/
// Record one selected box and flush the NRAM staging buffer when needed.
// Called once per NMS iteration on every core; only cluster 0 / core 0
// actually writes, but the counters advance everywhere so cores agree.
//
// If max_box[0] (the winning score) exceeds thresh_score, the result is
// appended to nram_save in a layout chosen by output_mode:
//   0: the winner's uint32 index
//   1: score,x1,y1,x2,y2 interleaved
//   2: score/x1/y1/x2/y2 in separate planes, stride nram_save_limit_count
// The staging buffer is flushed to output_dram (advancing the pointer) when
// it is full, when the score fell to/below the threshold (terminating
// iteration), or when keep reaches max_output_size - 1.
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
OUT_DT *&output_dram, const int keep,
const int nram_save_limit_count,
const int max_output_size,
const float thresh_score, const int output_mode,
int &nram_save_count, uint32_t &output_box_num) {
/******NMS STORE START******/
// store to nram
if (float(max_box[0]) > thresh_score) {
OUT_DT *save_ptr;
int save_offset = 0;
int save_str_num = 0;
save_ptr = nram_save;
save_offset = nram_save_count;
save_str_num = nram_save_limit_count;
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
} else if (output_mode == 1) { // score, x1, y1, x2, y2
__memcpy(save_ptr + save_offset * INFO_NUM, max_box,
INFO_NUM * sizeof(IN_DT), NRAM2NRAM, INFO_NUM * sizeof(IN_DT),
INFO_NUM * sizeof(IN_DT), 0);
} else if (output_mode == 2) { // score---, x1---, y1---, x2---, y2---
// planar append: one element into each of the 5 planes
__memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT), NRAM2NRAM,
save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT), 4);
}
}
nram_save_count++;
output_box_num++;
}
// store to sram/gdram
if (output_box_num != 0) {
// flush when full, on the terminating iteration, or on the last keep
if ((nram_save_count == nram_save_limit_count) ||
(float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
if (nram_save_count != 0) {
if (clusterId == 0 && coreId == 0) {
if (output_mode == 0) { // index1, index2, ...
pvLock();
__memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t),
NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count;
} else if (output_mode == 1) { // score, x1, y1, x2, y2
pvLock();
__memcpy(output_dram, nram_save,
nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
pvUnlock();
output_dram += nram_save_count * INFO_NUM;
} else if (output_mode ==
2) { // score---, x1---, y1---, x2---, y2---
pvLock();
// planar copy: output planes strided by max_output_size,
// staging planes by nram_save_limit_count
__memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
NRAM2GDRAM, max_output_size * sizeof(IN_DT),
nram_save_limit_count * sizeof(IN_DT), 4);
pvUnlock();
output_dram += nram_save_count;
}
// NOTE(review): the reset sits inside the cluster-0/core-0 branch, so
// other cores' counters keep growing; they never write, but confirm
// the flush-condition divergence is intended.
nram_save_count = 0;
}
}
} // if move data nram->sram/gdram
} // if dst
}
template <typename IN_DT, typename OUT_DT>
__mlu_func__ void scoreUpdate(
IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir,
const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr,
const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr,
const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2,
IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2,
IN_DT *inter_y2, IN_DT *max_box, const float max_box_x1,
const float max_box_y1, const float max_box_x2, const float max_box_y2,
OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute,
int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad,
const float thresh_iou, const float div_thresh_iou, const int input_offset,
const float offset, const float max_area, const int input_num_boxes,
const int algo) {
for (int i = 0; i <= repeat_iou_compute; i++) {
if (i == repeat_iou_compute && remain_iou_compute == 0) {
break;
}
int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
: max_seg_iou_compute;
int cpy_len =
(i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
/******NMS LOAD START******/
int dt_offset = 0;
if (sizeof(IN_DT) == sizeof(float)) {
__memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
dt_offset = 0;
} else if (sizeof(IN_DT) == sizeof(half)) {
__memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
__bang_half2float((float *)score, (half *)x1, seg_len);
dt_offset = max_seg_iou_compute;
}
#if __BANG_ARCH__ >= 300
__memcpy(inter_x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
input_num_boxes * sizeof(IN_DT), 3);
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)inter_x1,
(half *)inter_x1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)inter_y1,
(half *)inter_y1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)inter_x2,
(half *)inter_x2 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)inter_y2,
(half *)inter_y2 + max_seg_iou_compute, seg_len);
}
// box transfer
__bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, seg_len);
__bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len);
__bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len);
__bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len);
// 1、 compute IOU
// get the area_I
__bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1,
seg_len); // inter_x1
__bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
seg_len); // inter_w
__bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1),
seg_len); // inter_y1
__bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2),
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
if (algo == 1 && offset != 0.0) {
__bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1,
offset, seg_len, seg_len);
__bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1,
offset, seg_len, seg_len);
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
} else {
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1,
(float *)inter_y1, seg_len, seg_len);
}
// get the area_U: area + max_area - area_I
__bang_fusion(FUSION_FAS, (float *)inter_x2, (float *)inter_x2, max_area,
(float *)inter_x1, seg_len, seg_len);
// 2、 select the box
// if IOU greater than thres, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
// process for nan
__bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
__bang_not((float *)inter_x1, (float *)inter_x1, seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
#else
__memcpy(x1 + dt_offset,
input_x1_ptr + input_offset + i * max_seg_iou_compute,
cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
input_num_boxes * sizeof(IN_DT), 3);
if (sizeof(IN_DT) == sizeof(half)) {
__bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len);
__bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len);
}
// 1、 compute IOU
// get the area_I
__bang_write_value((float *)inter_y1, seg_len,
float(max_box[1])); // max_x1
__bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
seg_len); // inter_x1
__bang_write_value((float *)inter_y2, seg_len,
float(max_box[3])); // max_x2
__bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
seg_len); // inter_x2
__bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
}
computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
seg_len); // inter_w
__bang_write_value((float *)inter_x2, seg_len,
float(max_box[2])); // max_y1
__bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
seg_len); // inter_y1
__bang_write_value((float *)inter_x2, seg_len,
float(max_box[4])); // max_y2
__bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
seg_len); // inter_y2
__bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
}
computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
seg_len); // inter_h
__bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
seg_len); // area_I
// get the area of input_box: area = (x2 - x1) * (y2 - y1);
__bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
__bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
if (algo == 1 && offset != 0.0) {
__bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
__bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len);
}
__bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
seg_len); // area
// get the area_U: area + max_area - area_I
__bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area),
seg_len);
__bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
seg_len); // area_U
// 2、 select the box
// if IOU greater than thresh, set the score to zero, abort it: area_U >
// area_I * (1 / thresh)?
if (thresh_iou > 0.0) {
__bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
seg_len);
} else {
__bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
seg_len);
}
__bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
__bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
/******NMS COMPUTE END******/
#endif
// update the score
if (sizeof(IN_DT) == sizeof(half)) {
convertFloat2half((half *)score, (float *)score, seg_len);
}
pvLock();
__memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
cpy_len * sizeof(IN_DT), store_dir, cpy_len * sizeof(IN_DT),
cpy_len * sizeof(IN_DT), 0);
pvUnlock();
}
}
#endif // NMS_UTILS_HPP_
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "psamask_utils.hpp"
#define COMPUTE_COUNT_ALIGN 64
__nram__ char buf[MAX_NRAM_SIZE];
// Exchanges the values of two references of the same type.
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
  const T saved = a;
  a = b;
  b = saved;
}
// Copies an n/h/w segment from NRAM back to its position inside the full
// NHWC tensor in DRAM. One contiguous h_seg * w_seg * c slab is written per
// batch; consecutive batches in DRAM are one full n-stride apart.
// NOTE(review): the slab write assumes the h/w segment is laid out
// contiguously at the destination — confirm callers only split along n/h.
template <typename T>
__mlu_func__ void storeDataFromNramToDram(T *dst, const T *src,
                                          const PositionInCore &position,
                                          const Shape &shape_full) {
  // Element strides of one batch / row / column in the full DRAM tensor.
  const int batch_stride = shape_full.h * shape_full.w * shape_full.c;
  const int row_stride = shape_full.w * shape_full.c;
  const int col_stride = shape_full.c;
  // Extents of the segment owned by this core (half-open ranges).
  const int batch_count = position.n_end - position.n_start;
  const int row_count = position.h_end - position.h_start;
  const int col_count = position.w_end - position.w_start;
  const int slab_elems = row_count * col_count * shape_full.c;
  T *dst_base = dst + position.n_start * batch_stride +
                position.h_start * row_stride + position.w_start * col_stride;
  __memcpy(dst_base, src, slab_elems * sizeof(T), NRAM2GDRAM,
           batch_stride * sizeof(T), slab_elems * sizeof(T), batch_count - 1);
}
// Loads an n/h/w segment of the full NHWC tensor from DRAM into NRAM.
// Mirror of storeDataFromNramToDram: one contiguous h_seg * w_seg * c slab
// is read per batch, with a full n-stride between source batches.
// NOTE(review): assumes the h/w segment is contiguous at the source —
// confirm callers only split along n/h.
template <typename T>
__mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
                                         const PositionInCore &position,
                                         const Shape &shape_full) {
  // Element strides of one batch / row / column in the full DRAM tensor.
  const int batch_stride = shape_full.h * shape_full.w * shape_full.c;
  const int row_stride = shape_full.w * shape_full.c;
  const int col_stride = shape_full.c;
  // Extents of the segment owned by this core (half-open ranges).
  const int batch_count = position.n_end - position.n_start;
  const int row_count = position.h_end - position.h_start;
  const int col_count = position.w_end - position.w_start;
  const int slab_elems = row_count * col_count * shape_full.c;
  const T *src_base = src + position.n_start * batch_stride +
                      position.h_start * row_stride +
                      position.w_start * col_stride;
  __memcpy(dst, src_base, slab_elems * sizeof(T), GDRAM2NRAM,
           slab_elems * sizeof(T), batch_stride * sizeof(T), batch_count - 1);
}
// Transposes each batch in place-to-place: the (B*C) x (D*E) plane becomes
// (D*E) x (B*C), i.e. data goes from A*B*C*(D*E) to A*D*E*(B*C). Both plane
// dimensions are padded up to the compute alignment before transposing.
template <typename T>
__mlu_func__ void transposeData(T *dst, T *src, const Shape &shape_seg) {
  const int align_c = CEIL_ALIGN(shape_seg.c, COMPUTE_COUNT_ALIGN / sizeof(T));
  const int align_hw =
      CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
  // Each batch occupies one aligned plane; batches are independent.
  const int plane = align_hw * align_c;
  for (int batch = 0; batch < shape_seg.n; ++batch) {
    __bang_transpose(dst + batch * plane, src + batch * plane, align_hw,
                     align_c);
  }
}
// PSAMask COLLECT forward for one segment. For every spatial output position
// (h_abs, w_abs), the h_mask * w_mask attention window of x (channels of x
// index the mask window, centred via half_h_mask/half_w_mask) is scattered
// into the channel dimension of y at feature-map-indexed offsets. Window
// entries that fall outside the feature map are left as zero.
template <typename T>
__mlu_func__ void psamaskCollectForward(
    const T *x_dram, T *y_dram, const PositionInCore &position,
    const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask) {
  // Partition the NRAM buffer: input segment first, output segment after it.
  T *x_nram = (T *)buf;
  T *y_nram =
      x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
                          COMPUTE_COUNT_ALIGN / sizeof(T));
  loadDataFromDramToNram(x_nram, x_dram, position, x_full);
  // fill zeros to output
  int elem_count =
      CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
                 NFU_ALIGN_SIZE / sizeof(T));
  __bang_write_value(y_nram, elem_count, (T)0);
  // Element strides within the NRAM segments (y uses segment channel count,
  // x keeps the full channel count).
  int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
  int y_h_offset = shape_seg.w * shape_seg.c;
  int y_w_offset = shape_seg.c;
  int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
  int y_c_offset = 1;
  int x_h_offset = shape_seg.w * x_full.c;
  int x_w_offset = x_full.c;
  int x_c_offset = 1;
  int x_start = 0;
  int y_start = 0;
  for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
    for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
      for (int widx = 0; widx < shape_seg.w; ++widx) {
        // Absolute feature-map coordinates of this segment position.
        int h_abs = hidx + position.h_start;
        int w_abs = widx + position.w_start;
        int y_offset = y_start;
        int x_offset = x_start;
        y_offset += hidx * y_h_offset + widx * y_w_offset;
        x_offset += hidx * x_h_offset + widx * x_w_offset;
        // Clip the mask window so it stays inside the feature map.
        const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
        const int hend = x_full.h + half_h_mask - h_abs < h_mask
                             ? x_full.h + half_h_mask - h_abs
                             : h_mask;
        const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
        const int wend = x_full.w + half_w_mask - w_abs < w_mask
                             ? x_full.w + half_w_mask - w_abs
                             : w_mask;
        // (h, w ) with mask-indexed
        // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed
        y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
                     w_abs - half_w_mask) *
                    y_c_offset;
        x_offset += (hstart * w_mask + wstart) * x_c_offset;
        // Copy one clipped mask row per iteration: wend - wstart elements,
        // hend - hstart rows, with the matching row strides on each side.
        int count = wend - wstart;
        __memcpy(y_nram + y_offset, x_nram + x_offset, count * sizeof(T),
                 NRAM2NRAM, y_c_offset * x_full.w * sizeof(T),
                 x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
      }
    }
    y_start += y_n_offset;
    x_start += x_n_offset;
  }
  storeDataFromNramToDram(y_dram, y_nram, position, y_full);
}
// PSAMask DISTRIBUTE forward for one segment. Gathers the clipped mask window
// for every spatial position into a temporary (hw, c)-major layout, then
// transposes each batch so the feature-map index becomes the leading plane
// dimension, and finally stores the result row by row into the full y tensor.
template <typename T>
__mlu_func__ void psamaskDistributeForward(
    const T *x_dram, T *y_dram, const PositionInCore &position,
    const Shape &x_full, const Shape &y_full, const Shape &shape_seg,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask) {
  // Partition the NRAM buffer: input segment first, un-transposed output next.
  T *x_nram = (T *)buf;
  T *y_nram_temp =
      x_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * x_full.c,
                          COMPUTE_COUNT_ALIGN / sizeof(T));
  loadDataFromDramToNram(x_nram, x_dram, position, x_full);
  // fill zeros to output
  int align_c = CEIL_ALIGN(y_full.c, COMPUTE_COUNT_ALIGN / sizeof(T));
  int align_hw =
      CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
  int elem_count =
      CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
  __bang_write_value(y_nram_temp, elem_count, (T)0);
  // Element strides: the temporary y plane is (hw, aligned c)-major.
  int y_n_offset = align_hw * align_c;
  int y_h_offset = shape_seg.w * align_c;
  int y_w_offset = align_c;
  int y_c_offset = 1;
  int x_n_offset = shape_seg.h * shape_seg.w * x_full.c;
  int x_h_offset = shape_seg.w * x_full.c;
  int x_w_offset = x_full.c;
  int x_c_offset = 1;
  int h_feature = y_full.h;
  int w_feature = y_full.w;
  int y_start = 0;
  int x_start = 0;
  for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
    for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
      for (int widx = 0; widx < shape_seg.w; ++widx) {
        // Absolute feature-map coordinates of this segment position.
        int h_abs = hidx + position.h_start;
        int w_abs = widx + position.w_start;
        int y_offset = y_start;
        int x_offset = x_start;
        y_offset += hidx * y_h_offset + widx * y_w_offset;
        x_offset += hidx * x_h_offset + widx * x_w_offset;
        // Clip the mask window so it stays inside the feature map.
        const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
        const int hend = h_feature + half_h_mask - h_abs < h_mask
                             ? h_feature + half_h_mask - h_abs
                             : h_mask;
        const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
        const int wend = w_feature + half_w_mask - w_abs < w_mask
                             ? w_feature + half_w_mask - w_abs
                             : w_mask;
        // (h, w ) with mask-indexed
        // (h + hidx - half_h_mask, w + widx - half_w_mask) with feature-indexed
        y_offset += ((hstart + h_abs - half_h_mask) * x_full.w + wstart +
                     w_abs - half_w_mask) *
                    y_c_offset;
        x_offset += (hstart * w_mask + wstart) * x_c_offset;
        // Copy one clipped mask row per iteration.
        int count = wend - wstart;
        __memcpy(y_nram_temp + y_offset, x_nram + x_offset, count * sizeof(T),
                 NRAM2NRAM, y_c_offset * w_feature * sizeof(T),
                 x_c_offset * w_mask * sizeof(T), hend - hstart - 1);
      }
    }
    y_start += y_n_offset;
    x_start += x_n_offset;
  }
  // transpose y
  T *y_nram = y_nram_temp + shape_seg.n * align_hw * align_c;
  Shape y_seg{shape_seg.n, shape_seg.h, shape_seg.w, y_full.c};
  transposeData(y_nram, y_nram_temp, y_seg);
  // After the transpose the two aligned plane dims change roles.
  swap(align_c, align_hw);
  // store y from nram to dram
  int y_n_offset_full = y_full.h * y_full.w * y_full.c;
  int y_w_offset_full = y_full.c;
  int y_c_offset_full = 1;
  int y_dram_start =
      position.n_start * y_n_offset_full +
      (position.h_start * y_full.w + position.w_start) * y_c_offset_full;
  int y_nram_start = 0;
  // Store row by row: each of the h_feature * w_feature rows contributes
  // shape_seg.h * shape_seg.w contiguous elements.
  for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
    int y_dram_offset = y_dram_start + nidx * y_n_offset_full;
    int y_nram_offset = y_nram_start + nidx * align_hw * align_c;
    __memcpy(y_dram + y_dram_offset, y_nram + y_nram_offset,
             shape_seg.h * shape_seg.w * sizeof(T), NRAM2GDRAM,
             y_w_offset_full * sizeof(T), align_c * sizeof(T),
             h_feature * w_feature - 1);
  }
}
// PSAMask COLLECT backward for one segment. Inverse data movement of the
// collect forward: for each spatial position, the feature-indexed channels of
// dy are gathered back into the mask-indexed channels of dx. Mask entries
// whose window fell outside the feature map receive zero gradient.
template <typename T>
__mlu_func__ void psamaskCollectBackward(
    const T *dy_dram, T *dx_dram, const PositionInCore &position,
    const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask) {
  // Partition the NRAM buffer: incoming gradient first, outgoing after it.
  T *dy_nram = (T *)buf;
  T *dx_nram =
      dy_nram + CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * dy_full.c,
                           COMPUTE_COUNT_ALIGN / sizeof(T));
  loadDataFromDramToNram(dy_nram, dy_dram, position, dy_full);
  // fill zeros to output
  int elem_count =
      CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
                 NFU_ALIGN_SIZE / sizeof(T));
  __bang_write_value(dx_nram, elem_count, (T)0);
  // Element strides within the NRAM segments.
  int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
  int dy_h_offset = shape_seg.w * dy_full.c;
  int dy_w_offset = dy_full.c;
  int dy_c_offset = 1;
  int dx_n_offset = shape_seg.h * shape_seg.w * dx_full.c;
  int dx_h_offset = shape_seg.w * dx_full.c;
  int dx_w_offset = dx_full.c;
  int dx_c_offset = 1;
  int h_feature = dy_full.h;
  int w_feature = dy_full.w;
  int dy_start = 0;
  int dx_start = 0;
  for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
    for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
      for (int widx = 0; widx < shape_seg.w; ++widx) {
        // Absolute feature-map coordinates of this segment position.
        int h_abs = hidx + position.h_start;
        int w_abs = widx + position.w_start;
        int dy_offset = dy_start;
        int dx_offset = dx_start;
        dy_offset += hidx * dy_h_offset + widx * dy_w_offset;
        dx_offset += hidx * dx_h_offset + widx * dx_w_offset;
        // Clip the mask window so it stays inside the feature map.
        const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
        const int hend = h_feature + half_h_mask - h_abs < h_mask
                             ? h_feature + half_h_mask - h_abs
                             : h_mask;
        const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
        const int wend = w_feature + half_w_mask - w_abs < w_mask
                             ? w_feature + half_w_mask - w_abs
                             : w_mask;
        // (h, w ) with mask-indexed
        // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with
        // feature-indexed
        dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
                      w_abs - half_w_mask) *
                     dy_c_offset;
        dx_offset += (hstart * w_mask + wstart) * dx_c_offset;
        // Copy one clipped mask row per iteration (strides swapped relative
        // to the forward pass).
        int count = wend - wstart;
        __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
                 NRAM2NRAM, dx_c_offset * w_mask * sizeof(T),
                 dy_c_offset * w_feature * sizeof(T), hend - hstart - 1);
      }
    }
    dy_start += dy_n_offset;
    dx_start += dx_n_offset;
  }
  storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
// PSAMask DISTRIBUTE backward for one segment. Loads dy row by row from the
// full tensor, transposes each batch so segment positions lead the plane,
// then scatters the feature-indexed gradient back into the mask-indexed
// channels of dx. Mask entries outside the feature map get zero gradient.
template <typename T>
__mlu_func__ void psamaskDistributeBackward(
    const T *dy_dram, T *dx_dram, const PositionInCore &position,
    const Shape &dy_full, const Shape &dx_full, const Shape &shape_seg,
    const int h_mask, const int w_mask, const int half_h_mask,
    const int half_w_mask) {
  // load dy from dram to nram
  T *dy_nram_temp = (T *)buf;
  int dy_n_offset_full = dy_full.h * dy_full.w * dy_full.c;
  int dy_c_offset_full = 1;
  int h_feature = dy_full.h;
  int w_feature = dy_full.w;
  // Aligned plane dims before the transpose: rows are the segment's h*w,
  // columns the feature map's h*w.
  int align_c =
      CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
  int align_hw =
      CEIL_ALIGN(h_feature * w_feature, COMPUTE_COUNT_ALIGN / sizeof(T));
  int dy_dram_start =
      position.n_start * dy_n_offset_full +
      (position.h_start * w_feature + position.w_start) * dy_c_offset_full;
  int dy_nram_start = 0;
  // Gather the segment's slice of each feature-map row into NRAM.
  for (int i = 0; i < shape_seg.n; ++i) {
    int dy_nram_offset = dy_nram_start + i * (align_hw * align_c);
    int dy_dram_offset = dy_dram_start + i * dy_n_offset_full;
    __memcpy(dy_nram_temp + dy_nram_offset, dy_dram + dy_dram_offset,
             shape_seg.h * shape_seg.w * sizeof(T), GDRAM2NRAM,
             align_c * sizeof(T), dy_full.c * sizeof(T),
             h_feature * w_feature - 1);
  }
  T *dy_nram = dy_nram_temp + shape_seg.n * align_hw * align_c;
  Shape dy_seg{shape_seg.n, h_feature, w_feature, shape_seg.h * shape_seg.w};
  transposeData(dy_nram, dy_nram_temp, dy_seg);
  // After the transpose the two aligned plane dims change roles.
  swap(align_c, align_hw);
  // fill zeros to dx
  T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
  int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
  __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
                     (T)0);
  // Element strides within the NRAM segments.
  int dy_n_offset_seg = align_hw * align_c;
  int dy_h_offset_seg = shape_seg.w * align_c;
  int dy_w_offset_seg = align_c;
  int dy_c_offset_seg = 1;
  int dx_n_offset_seg = shape_seg.h * shape_seg.w * shape_seg.c;
  int dx_h_offset_seg = shape_seg.w * shape_seg.c;
  int dx_w_offset_seg = shape_seg.c;
  int dx_c_offset_seg = 1;
  int dy_start = 0;
  int dx_start = 0;
  for (int nidx = 0; nidx < shape_seg.n; ++nidx) {
    for (int hidx = 0; hidx < shape_seg.h; ++hidx) {
      for (int widx = 0; widx < shape_seg.w; ++widx) {
        // Absolute feature-map coordinates of this segment position.
        int h_abs = hidx + position.h_start;
        int w_abs = widx + position.w_start;
        int dy_offset = dy_start;
        int dx_offset = dx_start;
        dy_offset += hidx * dy_h_offset_seg + widx * dy_w_offset_seg;
        dx_offset += hidx * dx_h_offset_seg + widx * dx_w_offset_seg;
        // Clip the mask window so it stays inside the feature map.
        const int hstart = half_h_mask - h_abs > 0 ? half_h_mask - h_abs : 0;
        const int hend = h_feature + half_h_mask - h_abs < h_mask
                             ? h_feature + half_h_mask - h_abs
                             : h_mask;
        const int wstart = half_w_mask - w_abs > 0 ? half_w_mask - w_abs : 0;
        const int wend = w_feature + half_w_mask - w_abs < w_mask
                             ? w_feature + half_w_mask - w_abs
                             : w_mask;
        // (h, w ) with mask-indexed
        // (h + h_abs - half_h_mask, w + w_abs - half_w_mask) with
        // feature-indexed
        dy_offset += ((hstart + h_abs - half_h_mask) * w_feature + wstart +
                      w_abs - half_w_mask) *
                     dy_c_offset_seg;
        dx_offset += (hstart * w_mask + wstart) * dx_c_offset_seg;
        // Copy one clipped mask row per iteration.
        int count = wend - wstart;
        __memcpy(dx_nram + dx_offset, dy_nram + dy_offset, count * sizeof(T),
                 NRAM2NRAM, w_mask * dx_c_offset_seg * sizeof(T),
                 w_feature * dy_c_offset_seg * sizeof(T), hend - hstart - 1);
      }
    }
    dy_start += dy_n_offset_seg;
    dx_start += dx_n_offset_seg;
  }
  storeDataFromNramToDram(dx_dram, dx_nram, position, dx_full);
}
// Common driver for all four psamask variants. First computes the slice of
// the full tensor owned by the current core (cluster split via taskIdY, core
// split via taskIdX, each along N or H as configured), then walks that slice
// in NRAM-sized n/h/w segments given by `limit`, dispatching each segment to
// the matching collect/distribute forward/backward routine.
template <typename T>
__mlu_func__ void psamaskBase(const T *input_dram, T *output_dram,
                              const Shape &input_full, const Shape &output_full,
                              LimitParam &limit, const PsamaskType psa_type,
                              const DimPartitionType core_partition,
                              const DimPartitionType cluster_partition,
                              const bool is_forward, const int h_mask,
                              const int w_mask, const int half_h_mask,
                              const int half_w_mask, const int n_per_core,
                              const int h_per_core, const int n_per_cluster,
                              const int h_per_cluster) {
  PositionInCore position_full;
  PositionInCore position_seg;
  // W is never partitioned across cores; every core sees the full width.
  position_full.w_start = 0;
  position_full.w_end = output_full.w;
  int n_num_in_cluster = n_per_cluster;
  int h_num_in_cluster = h_per_cluster;
  // Step 1: slice among clusters (taskIdY). The last cluster takes the
  // remainder; clusters beyond what is needed exit early.
  switch (cluster_partition) {
    case PARTITION_N: {
      position_full.h_start = 0;
      position_full.h_end = input_full.h;
      position_full.n_start = taskIdY * n_per_cluster;
      int cluster_need = (input_full.n + n_per_cluster - 1) / n_per_cluster;
      if (taskIdY >= cluster_need) return;
      int n_remainder = input_full.n - (cluster_need - 1) * n_per_cluster;
      n_num_in_cluster =
          (taskIdY == cluster_need - 1) ? n_remainder : n_per_cluster;
      position_full.n_end = position_full.n_start + n_num_in_cluster;
    }; break;
    case PARTITION_H: {
      position_full.n_start = 0;
      position_full.n_end = input_full.n;
      position_full.h_start = taskIdY * h_per_cluster;
      int cluster_need = (input_full.h + h_per_cluster - 1) / h_per_cluster;
      if (taskIdY >= cluster_need) return;
      int h_remainder = input_full.h - (cluster_need - 1) * h_per_cluster;
      h_num_in_cluster =
          (taskIdY == cluster_need - 1) ? h_remainder : h_per_cluster;
      position_full.h_end = position_full.h_start + h_num_in_cluster;
    }; break;
  }
  // Step 2: slice the cluster's share among its cores (taskIdX), same
  // remainder handling.
  switch (core_partition) {
    case PARTITION_N: {
      position_full.n_start += taskIdX * n_per_core;
      int core_need = (n_num_in_cluster + n_per_core - 1) / n_per_core;
      if (taskIdX >= core_need) return;
      int n_remainder = n_num_in_cluster - (core_need - 1) * n_per_core;
      position_full.n_end =
          position_full.n_start +
          ((taskIdX == core_need - 1) ? n_remainder : n_per_core);
    }; break;
    case PARTITION_H: {
      position_full.h_start += taskIdX * h_per_core;
      int core_need = (h_num_in_cluster + h_per_core - 1) / h_per_core;
      if (taskIdX >= core_need) return;
      int h_remainder = h_num_in_cluster - (core_need - 1) * h_per_core;
      position_full.h_end =
          position_full.h_start +
          ((taskIdX == core_need - 1) ? h_remainder : h_per_core);
    }; break;
  }
  // the count of n ,h and w need to be processed in the current core
  int shape_core_n = position_full.n_end - position_full.n_start;
  int shape_core_h = position_full.h_end - position_full.h_start;
  int shape_core_w = input_full.w;
  // Clamp the per-pass segment limits to the core's actual share.
  limit.n = limit.n < shape_core_n ? limit.n : shape_core_n;
  limit.h = limit.h < shape_core_h ? limit.h : shape_core_h;
  limit.w = limit.w < shape_core_w ? limit.w : shape_core_w;
  // load the data to nram according to the limit
  for (int nidx = position_full.n_start; nidx < position_full.n_end;
       nidx += limit.n) {
    position_seg.n_start = nidx;
    position_seg.n_end =
        position_seg.n_start + (position_full.n_end - nidx < limit.n
                                    ? position_full.n_end - nidx
                                    : limit.n);
    for (int hidx = position_full.h_start; hidx < position_full.h_end;
         hidx += limit.h) {
      position_seg.h_start = hidx;
      position_seg.h_end =
          position_seg.h_start + (position_full.h_end - hidx < limit.h
                                      ? position_full.h_end - hidx
                                      : limit.h);
      for (int widx = position_full.w_start; widx < position_full.w_end;
           widx += limit.w) {
        position_seg.w_start = widx;
        position_seg.w_end =
            position_seg.w_start + (position_full.w_end - widx < limit.w
                                        ? position_full.w_end - widx
                                        : limit.w);
        // record the segment of output except the size of channel
        // channel segments of output and input are the same
        Shape shape_seg;
        shape_seg.n = position_seg.n_end - position_seg.n_start;
        shape_seg.h = position_seg.h_end - position_seg.h_start;
        shape_seg.w = position_seg.w_end - position_seg.w_start;
        shape_seg.c = output_full.c;
        // Dispatch the segment to the matching variant.
        switch (psa_type) {
          case COLLECT: {
            if (is_forward) {
              psamaskCollectForward(input_dram, output_dram, position_seg,
                                    input_full, output_full, shape_seg, h_mask,
                                    w_mask, half_h_mask, half_w_mask);
            } else {
              psamaskCollectBackward(input_dram, output_dram, position_seg,
                                     input_full, output_full, shape_seg, h_mask,
                                     w_mask, half_h_mask, half_w_mask);
            }
          } break;
          case DISTRIBUTE: {
            if (is_forward) {
              psamaskDistributeForward(input_dram, output_dram, position_seg,
                                       input_full, output_full, shape_seg,
                                       h_mask, w_mask, half_h_mask,
                                       half_w_mask);
            } else {
              psamaskDistributeBackward(input_dram, output_dram, position_seg,
                                        input_full, output_full, shape_seg,
                                        h_mask, w_mask, half_h_mask,
                                        half_w_mask);
            }
          } break;
        }
      }
    }
  }
}
// Device entry point for the psamask forward pass: packs the scalar kernel
// arguments into Shape/LimitParam structs and hands off to psamaskBase.
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskForward(
    const T *x, T *y, const PsamaskType psa_type,
    const DimPartitionType core_partition,
    const DimPartitionType cluster_partition, const int batch,
    const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int x_c, const int y_c, const int half_h_mask,
    const int half_w_mask, const int n_per_core, const int h_per_core,
    const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
    const int limit_h_seg, const int limit_w_seg) {
  // The memory core (id 0x80) takes no part in the computation.
  if (coreId == 0x80) {
    return;
  }
  Shape x_full{batch, h_feature, w_feature, x_c};
  Shape y_full{batch, h_feature, w_feature, y_c};
  LimitParam limit{limit_n_seg, limit_h_seg, limit_w_seg};
  psamaskBase(x, y, x_full, y_full, limit, psa_type, core_partition,
              cluster_partition, /*is_forward=*/true, h_mask, w_mask,
              half_h_mask, half_w_mask, n_per_core, h_per_core, n_per_cluster,
              h_per_cluster);
}
// Device entry point for the psamask backward pass: packs the scalar kernel
// arguments into Shape/LimitParam structs and hands off to psamaskBase.
template <typename T>
__mlu_global__ void MLUUnion1KernelPsamaskBackward(
    const T *dy, T *dx, const PsamaskType psa_type,
    const DimPartitionType core_partition,
    const DimPartitionType cluster_partition, const int batch,
    const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
    const int half_w_mask, const int n_per_core, const int h_per_core,
    const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
    const int limit_h_seg, const int limit_w_seg) {
  // The memory core (id 0x80) takes no part in the computation.
  if (coreId == 0x80) {
    return;
  }
  Shape dy_full{batch, h_feature, w_feature, dy_c};
  Shape dx_full{batch, h_feature, w_feature, dx_c};
  LimitParam limit{limit_n_seg, limit_h_seg, limit_w_seg};
  psamaskBase(dy, dx, dy_full, dx_full, limit, psa_type, core_partition,
              cluster_partition, /*is_forward=*/false, h_mask, w_mask,
              half_h_mask, half_w_mask, n_per_core, h_per_core, n_per_cluster,
              h_per_cluster);
}
// Host-side launcher for the psamask forward kernel. Data is launched as
// float (the only type this wrapper instantiates).
void KernelPsamaskForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const void *x, void *y, const PsamaskType psa_type,
    const DimPartitionType core_partition,
    const DimPartitionType cluster_partition, const int batch,
    const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int x_c, const int y_c, const int half_h_mask,
    const int half_w_mask, const int n_per_core, const int h_per_core,
    const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
    const int limit_h_seg, const int limit_w_seg) {
  const float *x_ptr = static_cast<const float *>(x);
  float *y_ptr = static_cast<float *>(y);
  MLUUnion1KernelPsamaskForward<<<k_dim, k_type, queue>>>(
      x_ptr, y_ptr, psa_type, core_partition, cluster_partition, batch,
      h_feature, w_feature, h_mask, w_mask, x_c, y_c, half_h_mask, half_w_mask,
      n_per_core, h_per_core, n_per_cluster, h_per_cluster, limit_n_seg,
      limit_h_seg, limit_w_seg);
}
// Host-side launcher for the psamask backward kernel. Data is launched as
// float (the only type this wrapper instantiates).
void KernelPsamaskBackward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const void *dy, void *dx, const PsamaskType psa_type,
    const DimPartitionType core_partition,
    const DimPartitionType cluster_partition, const int batch,
    const int h_feature, const int w_feature, const int h_mask,
    const int w_mask, const int dx_c, const int dy_c, const int half_h_mask,
    const int half_w_mask, const int n_per_core, const int h_per_core,
    const int n_per_cluster, const int h_per_cluster, const int limit_n_seg,
    const int limit_h_seg, const int limit_w_seg) {
  const float *dy_ptr = static_cast<const float *>(dy);
  float *dx_ptr = static_cast<float *>(dx);
  MLUUnion1KernelPsamaskBackward<<<k_dim, k_type, queue>>>(
      dy_ptr, dx_ptr, psa_type, core_partition, cluster_partition, batch,
      h_feature, w_feature, h_mask, w_mask, dx_c, dy_c, half_h_mask,
      half_w_mask, n_per_core, h_per_core, n_per_cluster, h_per_cluster,
      limit_n_seg, limit_h_seg, limit_w_seg);
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef PSAMASK_UTILS_HPP_
#define PSAMASK_UTILS_HPP_
// Variant of the psamask operator.
typedef enum {
  COLLECT = 0,     // gather mask window into output channels
  DISTRIBUTE = 1,  // scatter channels across spatial positions
} PsamaskType;
// Dimension along which work is split across clusters or cores.
typedef enum {
  PARTITION_N = 0,  // split along the batch dimension
  PARTITION_H = 1,  // split along the height dimension
} DimPartitionType;
// How the workload is divided between clusters and the cores inside them.
struct PartitionSeg {
  int h_per_cluster;  // rows handled by one cluster
  int n_per_cluster;  // batches handled by one cluster
  int h_per_core;     // rows handled by one core
  int n_per_core;     // batches handled by one core
  DimPartitionType cluster_partition;  // split dimension across clusters
  DimPartitionType core_partition;     // split dimension across cores
};
// NHWC tensor dimensions.
struct Shape {
  int n;
  int h;
  int w;
  int c;
};
// Maximum n/h/w extents that fit in NRAM for one processing pass.
struct LimitParam {
  int n;
  int h;
  int w;
};
// Half-open [start, end) n/h/w ranges of the slice a core is processing.
struct PositionInCore {
  int n_start;
  int n_end;
  int h_start;
  int h_end;
  int w_start;
  int w_end;
};
#endif // PSAMASK_UTILS_HPP_
/*************************************************************************
* Copyright (C) 2021 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 5
__nram__ char buffer[MAX_NRAM_SIZE];
namespace forward {
// Computes the four bilinear interpolation weights (w1..w4) and the two
// neighbouring integer coordinates on each axis (x_low/x_high, y_low/y_high)
// for sampling point (y, x) on an input_height x input_width feature map.
// Sets *empty when the point lies more than one pixel outside the map;
// *empty is left untouched otherwise (caller initialises it).
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
                                      const int input_width, T y, T x, T *w1,
                                      T *w2, T *w3, T *w4, int *x_low,
                                      int *x_high, int *y_low, int *y_high,
                                      bool *empty) {
  // Points beyond one pixel outside the map contribute nothing.
  if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
    *empty = true;
    return;
  }
  // Clamp slightly-negative coordinates onto the map.
  if (y <= 0) {
    y = 0;
  }
  if (x <= 0) {
    x = 0;
  }
  int y0 = int(y);
  int x0 = int(x);
  // Clamp the upper neighbour at the border; a degenerate cell collapses
  // both neighbours onto the last row/column.
  if (y0 >= input_height - 1) {
    y0 = input_height - 1;
    *y_high = y0;
    y = (T)y0;
  } else {
    *y_high = y0 + 1;
  }
  if (x0 >= input_width - 1) {
    x0 = input_width - 1;
    *x_high = x0;
    x = T(x0);
  } else {
    *x_high = x0 + 1;
  }
  *y_low = y0;
  *x_low = x0;
  // Bilinear weights from the fractional offsets inside the cell.
  T ly = y - y0;
  T lx = x - x0;
  T hy = 1.0 - ly;
  T hx = 1.0 - lx;
  *w1 = hy * hx;
  *w2 = hy * lx;
  *w3 = ly * hx;
  *w4 = ly * lx;
}
// Averages the bilinear samples of one pooled bin over the whole channel
// dimension and writes the result to GDRAM.  Channels are processed in
// chunks of at most `max_elements` elements so all scratch fits in NRAM.
// The feature map is addressed as (H, W, C): every offset below has the
// form (y * input_width + x) * channels.
// `count` is the number of sampling points averaged per bin; the final
// output is sum / count.
template <typename T>
__mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
                                 T *nram_out, const int roi_bin_grid_h,
                                 const int roi_bin_grid_w, const T roi_start_h,
                                 const T roi_start_w, const int ph,
                                 const int pw, const T bin_size_h,
                                 const T bin_size_w, const float count,
                                 const int input_height, const int input_width,
                                 const int channels, const int cyc_num,
                                 const int max_elements) {
  int cyc_channel = max_elements;
  for (int i = 0; i < cyc_num; i++) {
    // The last chunk may be shorter; pad the working length up to the NFU
    // alignment so the vector intrinsics operate on whole aligned vectors.
    // Elements beyond real_channel are scratch and are never stored back.
    int real_channel =
        (i == cyc_num - 1) ? channels - i * cyc_channel : cyc_channel;
    int align_channel = PAD_UP(real_channel, NFU_ALIGN_SIZE / sizeof(T));
    __bang_write_zero(nram_out, align_channel);
    uint32_t real_size = real_channel * sizeof(T);
    int iy, ix;
    for (iy = 0; iy < roi_bin_grid_h; iy++) {
      // 1. compute the coordinates of the y axis in the current roi_bin_grid_h
      T y = roi_start_h + ph * bin_size_h +
            (T)(iy + 0.5) * bin_size_h / (T)(roi_bin_grid_h);
      for (ix = 0; ix < roi_bin_grid_w; ix++) {
        // 2. compute the coordinates of the x axis in the current
        //    roi_bin_grid_w
        T x = roi_start_w + pw * bin_size_w +
              (T)(ix + 0.5) * bin_size_w / (T)(roi_bin_grid_w);
        // 3. compute the four weights (w1, w2, w3 and w4), the height (y_low
        //    and y_high) and weight (x_low and x_high) of input feature map in
        //    the current roi bin grid, and the flag (empty) which shows if x, y
        //    are out of input feature map ranges
        T w1, w2, w3, w4;
        int x_low, x_high, y_low, y_high;
        bool empty = false;
        bilinearInterpolate(input_height, input_width, y, x, &w1, &w2, &w3, &w4,
                            &x_low, &x_high, &y_low, &y_high, &empty);
        // 4. compute interpolation of the current roi bin grid
        //    tmp_cyc1, temp_cyc2, tmp_cyc3 and tmp_cyc4 store the input values
        //    to compute the interpolation; they live directly after nram_in in
        //    the 6-way NRAM partition set up by the caller.
        T *tmp_cyc1 = nram_in + cyc_channel;
        T *tmp_cyc2 = nram_in + cyc_channel * 2;
        T *tmp_cyc3 = nram_in + cyc_channel * 3;
        T *tmp_cyc4 = nram_in + cyc_channel * 4;
        if (empty) {  // sample lies outside the feature map: contribute zeros
          __bang_write_zero(nram_in, align_channel);
        } else {
          __bang_write_zero(nram_in, align_channel);
          uint32_t offset1 = (y_low * input_width + x_low) * channels;
          uint32_t offset2 = (y_low * input_width + x_high) * channels;
          uint32_t offset3 = (y_high * input_width + x_low) * channels;
          uint32_t offset4 = (y_high * input_width + x_high) * channels;
          T *input1 = (T *)input_core + offset1 + i * cyc_channel;
          T *input2 = (T *)input_core + offset2 + i * cyc_channel;
          T *input3 = (T *)input_core + offset3 + i * cyc_channel;
          T *input4 = (T *)input_core + offset4 + i * cyc_channel;
          // load the four pixels (p1, p2, p3 and p4) of input feature map to
          // compute interpolation
          __memcpy(tmp_cyc1, input1, real_size, GDRAM2NRAM);
          __memcpy(tmp_cyc2, input2, real_size, GDRAM2NRAM);
          __memcpy(tmp_cyc3, input3, real_size, GDRAM2NRAM);
          __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);
          // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
          __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
          __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
          __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
          __bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);
          __bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc3, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc4, nram_in, align_channel);
        }
        // 5. accumulate this sample's interpolated values into the bin sum.
        __bang_add(nram_out, nram_in, nram_out, align_channel);
      }  // loop_roi_grid_w
    }    // loop_roi_grid_h
    // Average: scale the accumulated sum by 1/count and store this chunk.
    T count_value = (T)(1.0 / count);
    __bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
    __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
  }  // loop_cyc_num
}
// Forward RoIAlign with average pooling.  Each core processes the pooled
// bins taskId, taskId + taskDim, ...; for every bin it scales the roi into
// feature-map coordinates, derives the sampling grid, and delegates the
// per-channel averaging to computeChannel.
template <typename T>
__mlu_func__ void roialignForwardAvg(
    T *input, T *rois, T *output, const bool aligned, const int channels,
    const int pooled_height, const int pooled_width, const int input_height,
    const int input_width, const int sampling_ratio, const T spatial_scale,
    const int num_rois) {
  // NRAM is split into 6 equal slices: one accumulator slice (nram_out) and
  // five slices starting at nram_in that hold the interpolation scratch
  // (see computeChannel's tmp_cyc1..tmp_cyc4 layout).
  // max_elements : 300 : float datatype : 27296, half datatype : 54592
  // max_elements : 200 : float datatype : 16384, half datatype : 32768
  const int slice_elems =
      (PAD_DOWN(MAX_NRAM_SIZE / 6, NFU_ALIGN_SIZE)) / sizeof(T);
  const int slice_loops =
      channels / slice_elems + (int)(channels % slice_elems != 0);
  const T align_offset = aligned ? (T)0.5 : (T)0.0;
  const int total_bins = num_rois * pooled_height * pooled_width;
  T *nram_out = (T *)buffer;
  T *nram_in = nram_out + slice_elems;
  // Cores beyond the number of bins have nothing to do.
  if (total_bins < taskDim && taskId >= total_bins) {
    return;
  }
  for (int bin_idx = taskId; bin_idx < total_bins; bin_idx += taskDim) {
    // Decompose the flat bin index into (roi, pooled row, pooled column).
    const int pw = bin_idx % pooled_width;
    const int ph = (bin_idx / pooled_width) % pooled_height;
    const int n = bin_idx / pooled_width / pooled_height;
    T *roi_ptr = rois + n * ROI_OFFSET;
    // 1. scale the roi corners into feature-map coordinates.
    const int batch_idx = (int)roi_ptr[0];
    T roi_start_w = roi_ptr[1] * spatial_scale - align_offset;
    T roi_start_h = roi_ptr[2] * spatial_scale - align_offset;
    T roi_end_w = roi_ptr[3] * spatial_scale - align_offset;
    T roi_end_h = roi_ptr[4] * spatial_scale - align_offset;
    T roi_width = roi_end_w - roi_start_w;
    T roi_height = roi_end_h - roi_start_h;
    if (!aligned) {
      // Legacy behaviour: rois span at least one pixel.
      roi_width = roi_width > (T)(1.0) ? roi_width : (T)(1.0);
      roi_height = roi_height > (T)(1.0) ? roi_height : (T)(1.0);
    }
    // 2. per-bin extents in feature-map units.
    const T bin_size_w = (T)roi_width / (T)pooled_width;
    const T bin_size_h = (T)roi_height / (T)pooled_height;
    // 3. sampling grid: fixed when sampling_ratio > 0, adaptive otherwise.
    int roi_bin_grid_h, roi_bin_grid_w;
    roi_bin_grid_h = (sampling_ratio > 0)
                         ? sampling_ratio
                         : int(ceilf(roi_height / pooled_height));
    roi_bin_grid_w = (sampling_ratio > 0)
                         ? sampling_ratio
                         : int(ceilf(roi_width / pooled_width));
    const float count = (float)((roi_bin_grid_h * roi_bin_grid_w) > 1
                                    ? roi_bin_grid_h * roi_bin_grid_w
                                    : 1.0);
    T *input_core = input + batch_idx * channels * input_width * input_height;
    T *output_core = output + bin_idx * channels;
    // 4. average over the sampling grid, chunked along the channel axis.
    computeChannel(input_core, nram_in, output_core, nram_out, roi_bin_grid_h,
                   roi_bin_grid_w, roi_start_h, roi_start_w, ph, pw, bin_size_h,
                   bin_size_w, count, input_height, input_width, channels,
                   slice_loops, slice_elems);
  }
}
// Device entry point for RoIAlign forward: dispatches to the half or float
// instantiation of roialignForwardAvg according to `data_type`.
__mlu_global__ void MLUUnion1KernelRoiAlignAvg(
    const void *input, const void *rois, const int channels, const bool aligned,
    const int pooled_height, const int pooled_width, const int input_height,
    const int input_width, const int sampling_ratio, const float spatial_scale,
    const int num_rois, const cnrtDataType_t data_type, void *output) {
  // The memory core takes no part in the computation.
  if (coreId == 0x80) {
    return;
  }
  if (data_type == CNRT_FLOAT16) {
    roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
                       channels, pooled_height, pooled_width, input_height,
                       input_width, sampling_ratio, (half)spatial_scale,
                       num_rois);
  } else if (data_type == CNRT_FLOAT32) {
    roialignForwardAvg((float *)input, (float *)rois, (float *)output, aligned,
                       channels, pooled_height, pooled_width, input_height,
                       input_width, sampling_ratio, (float)spatial_scale,
                       num_rois);
  }
  // Any other dtype is silently ignored, matching the original dispatch.
}
} // namespace forward
namespace backward {
// Computes the four bilinear-interpolation weights and integer tap
// coordinates of point (x, y) for the backward pass.  Points more than one
// pixel outside the map produce all-zero weights and -1 coordinates.
__mlu_func__ void bilinearInterpolateGradient(int height, int width, float y,
                                              float x, float *w1, float *w2,
                                              float *w3, float *w4, int *x_low,
                                              int *x_high, int *y_low,
                                              int *y_high) {
  if (y < -1.0 || y > height || x < -1.0 || x > width) {
    *w1 = *w2 = *w3 = *w4 = 0.0;
    *x_low = *x_high = *y_low = *y_high = -1;
    return;
  }
  // Clamp to the top-left edge.
  y = y <= 0 ? 0 : y;
  x = x <= 0 ? 0 : x;
  *y_low = (int)y;
  *x_low = (int)x;
  // Clamp to the bottom-right edge; a point on the last row/column collapses
  // both taps onto it.
  if (*y_low < height - 1) {
    *y_high = *y_low + 1;
  } else {
    *y_high = *y_low = height - 1;
    y = (float)(*y_low);
  }
  if (*x_low < width - 1) {
    *x_high = *x_low + 1;
  } else {
    *x_high = *x_low = width - 1;
    x = (float)(*x_low);
  }
  // Weights are products of the per-axis fractional distances.
  const float ly = y - *y_low;
  const float lx = x - *x_low;
  const float hy = 1.0 - ly;
  const float hx = 1.0 - lx;
  *w1 = hy * hx;
  *w2 = hy * lx;
  *w3 = ly * hx;
  *w4 = ly * lx;
}
// Backward pass of RoIAlign: for every pooled bin, scatters the incoming
// gradient onto the four bilinear taps of each sampling point with atomic
// adds into the image gradient.
// grads:       top gradients, laid out (box, hi, wi, c).
// boxes:       rois as {batch_index, x1, y1, x2, y2} per box.
// grads_image: bottom gradients, laid out (image, ho, wo, c).
// Bins are distributed round-robin over the taskDim cores.  When a whole
// channel row fits in half of NRAM the fast path is taken; otherwise the
// channel axis is processed in chunks of `deal_once` elements.
template <typename T>
__mlu_func__ void unionRoiAlignBp(
    T *grads, T *boxes, T *grads_image, const int boxes_num, const int hi,
    const int wi, const int c, const int no, const int ho, const int wo,
    const float spatial_scale, const int sampling_ratio, const bool aligned) {
  int c_align = PAD_UP(c, NFU_ALIGN_SIZE / sizeof(T));
  int deal_all = boxes_num * hi * wi;
  int deal_this_core = deal_all / taskDim + (int)(taskId < deal_all % taskDim);
  for (int i = 0; i < deal_this_core; ++i) {
    // Flat bin index handled by this core in this round.
    int bhw_id = i * taskDim + taskId;
    int box_id = bhw_id / (hi * wi);
    int ih = (bhw_id / wi) % hi;
    int iw = bhw_id % wi;
    T *box = boxes + box_id * 5;
    int image_id = (int)box[0];
    T *image_offset = grads_image + image_id * ho * wo * c;
    T *grads_ = grads + box_id * hi * wi * c + ih * wi * c + iw * c;
    // Scale the roi corners into feature-map coordinates.
    float offset = aligned ? 0.5 : 0.0;
    float x1 = box[1] * spatial_scale - offset;
    float y1 = box[2] * spatial_scale - offset;
    float x2 = box[3] * spatial_scale - offset;
    float y2 = box[4] * spatial_scale - offset;
    float roi_width = x2 - x1;
    float roi_height = y2 - y1;
    if (!aligned) {
      // Legacy behaviour: rois span at least one pixel.
      roi_width = (roi_width > 1.0) ? roi_width : 1.0;
      roi_height = (roi_height > 1.0) ? roi_height : 1.0;
    }
    float bin_size_h = roi_height / hi;
    float bin_size_w = roi_width / wi;
    // Sampling grid: fixed when sampling_ratio > 0, adaptive otherwise.
    int roi_grid_h =
        (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_height / hi);
    int roi_grid_w =
        (sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_width / wi);
    // Each sample contributes grad / count to its four taps.
    const T count = roi_grid_h * roi_grid_w;
    if (c_align * sizeof(T) * 2 <= MAX_NRAM_SIZE) {
      // Fast path: one aligned channel row plus scratch fits in NRAM.
      for (int iy = 0; iy < roi_grid_h; ++iy) {
        const float y =
            y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
        for (int ix = 0; ix < roi_grid_w; ++ix) {
          const float x =
              x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
          float w1, w2, w3, w4;
          int x_low, x_high, y_low, y_high;
          bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
                                      &x_high, &y_low, &y_high);
          if (x_low >= 0 && y_low >= 0) {
            // buffer[0..c) holds the incoming gradient; buffer[c_align..)
            // holds the weighted copy that is atomically added to each tap.
            __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
                              c_align);
            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
          }  // x_low && y_low
        }    // ix
      }      // iy
    } else {
      // Slow path: channel axis does not fit; process it in NRAM-sized
      // chunks of deal_once elements.
      for (int iy = 0; iy < roi_grid_h; ++iy) {
        const float y =
            y1 + ih * bin_size_h + (iy + 0.5) * bin_size_h / roi_grid_h;
        for (int ix = 0; ix < roi_grid_w; ++ix) {
          const float x =
              x1 + iw * bin_size_w + (ix + 0.5) * bin_size_w / roi_grid_w;
          float w1, w2, w3, w4;
          int x_low, x_high, y_low, y_high;
          bilinearInterpolateGradient(ho, wo, y, x, &w1, &w2, &w3, &w4, &x_low,
                                      &x_high, &y_low, &y_high);
          if (x_low >= 0 && y_low >= 0) {
            int deal_once =
                PAD_DOWN(MAX_NRAM_SIZE / 2, NFU_ALIGN_SIZE) / sizeof(T);
            int c_repeat = c / deal_once + (int)(c % deal_once != 0);
            // NOTE(review): this inner loop reuses the name `i`, shadowing
            // the per-core bin loop variable declared above.
            for (int i = 0; i < c_repeat; ++i) {
              int deal_c = deal_once;
              int align_c = deal_once;
              if (i == c_repeat - 1) {
                deal_c = c - i * deal_once;
                align_c = c_align - i * deal_once;
              }
              __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
                       GDRAM2NRAM);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_high * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_high * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
                                align_c);
              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_high * wo * c + x_high * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
            }  // for c_repeat
          }    // x_low >= 0 && y_low >= 0
        }      // ix
      }        // iy
    }          // if c
  }            // i
}
// Device entry point for RoIAlign backward: dispatches to the half or float
// instantiation of unionRoiAlignBp according to `dtype`.
__mlu_global__ void MLUUnion1KernelRoiAlignBackward(
    const void *grads, const void *boxes, void *grads_image,
    const cnrtDataType_t dtype, const int boxes_num, const int hi, const int wi,
    const int c, const int no, const int ho, const int wo,
    const float spatial_scale, const int sampling_ratio, const bool aligned) {
  // The memory core takes no part in the computation.
  if (coreId == 0x80) {
    return;
  }
  if (dtype == CNRT_FLOAT16) {
    unionRoiAlignBp((half *)grads, (half *)boxes, (half *)grads_image,
                    boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
                    sampling_ratio, aligned);
  } else if (dtype == CNRT_FLOAT32) {
    unionRoiAlignBp((float *)grads, (float *)boxes, (float *)grads_image,
                    boxes_num, hi, wi, c, no, ho, wo, spatial_scale,
                    sampling_ratio, aligned);
  }
  // Any other dtype is a no-op, matching the original dispatch.
}
} // namespace backward
// Host-side launcher for the RoIAlign forward kernel.
// k_dim/k_type/queue configure the MLU launch; d_type selects the half or
// float instantiation inside the kernel.  input/rois/output are device
// (GDRAM) pointers.
void KernelRoiAlign(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                    cnrtQueue_t queue, const cnrtDataType_t d_type,
                    const void *input, const void *rois, const int channels,
                    const bool aligned, const int pooled_height,
                    const int pooled_width, const int input_height,
                    const int input_width, const int sampling_ratio,
                    const float spatial_scale, const int num_rois,
                    void *output) {
  forward::MLUUnion1KernelRoiAlignAvg<<<k_dim, k_type, queue>>>(
      input, rois, channels, aligned, pooled_height, pooled_width, input_height,
      input_width, sampling_ratio, spatial_scale, num_rois, d_type, output);
}
// Host-side launcher for the RoIAlign backward kernel.
// k_dim/k_type/queue configure the MLU launch; dtype selects the half or
// float instantiation inside the kernel.  grads/boxes/grads_image are
// device (GDRAM) pointers.
void KernelRoiAlignBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                            cnrtQueue_t queue, const cnrtDataType_t dtype,
                            const void *grads, const void *boxes,
                            void *grads_image, const int boxes_num,
                            const int hi, const int wi, const int c,
                            const int no, const int ho, const int wo,
                            const float spatial_scale, const int sampling_ratio,
                            const bool aligned) {
  backward::MLUUnion1KernelRoiAlignBackward<<<k_dim, k_type, queue>>>(
      grads, boxes, grads_image, dtype, boxes_num, hi, wi, c, no, ho, wo,
      spatial_scale, sampling_ratio, aligned);
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include "roi_align_rotated_utils.hpp"
#define ROI_OFFSET 6
#define SAMPLING_NUM 4
__nram__ char nram_buffer[MAX_NRAM_SIZE];
// Exchanges the two referenced values through a temporary copy.
template <typename T>
__mlu_func__ void swap(T &a, T &b) {
  const T held = a;
  a = b;
  b = held;
}
// Computes the four bilinear-interpolation weights and integer tap
// coordinates of point (x, y).  Sets *empty (and leaves the other outputs
// untouched) when the point is more than one pixel outside the map; the
// caller is expected to initialize *empty to false.
template <typename T>
__mlu_func__ void bilinearInterpolate(const int input_height,
                                      const int input_width, T x, T y, T *w1,
                                      T *w2, T *w3, T *w4, int *x_low,
                                      int *x_high, int *y_low, int *y_high,
                                      bool *empty) {
  if (y < -1.0 || y > input_height || x < -1.0 || x > input_width) {
    *empty = true;
    return;
  }
  // Clamp to the top-left corner.
  y = y <= 0 ? (T)0 : y;
  x = x <= 0 ? (T)0 : x;
  *y_low = int(y);
  *x_low = int(x);
  // Clamp to the bottom-right corner; a point on the last row/column
  // collapses both taps onto it.
  if (*y_low < input_height - 1) {
    *y_high = *y_low + 1;
  } else {
    *y_high = *y_low = input_height - 1;
    y = (T)(*y_low);
  }
  if (*x_low < input_width - 1) {
    *x_high = *x_low + 1;
  } else {
    *x_high = *x_low = input_width - 1;
    x = T(*x_low);
  }
  // Weights are products of the per-axis fractional distances.
  T ly = y - *y_low;
  T lx = x - *x_low;
  T hy = 1.0 - ly;
  T hx = 1.0 - lx;
  *w1 = hy * hx;
  *w2 = hy * lx;
  *w3 = ly * hx;
  *w4 = ly * lx;
}
// Unpacks the rotated-roi record that owns pooled bin `bin_i` and scales it
// into feature-map coordinates.  Each roi record on DRAM is
// {batch, center_x, center_y, width, height, theta}.
template <typename T>
__mlu_func__ void getRoiBinInfo(const T *rois_dram, const int bin_i,
                                const RoiAlignRotatedParams &params,
                                int *batch_idx, int *roi_n, int *pw, int *ph,
                                T *roi_center_x, T *roi_center_y, T *roi_width,
                                T *roi_height, T *theta) {
  // Decompose the flat bin index into (roi, pooled row, pooled column).
  *pw = bin_i % params.pooled_width;
  *ph = (bin_i / params.pooled_width) % params.pooled_height;
  *roi_n = bin_i / params.pooled_width / params.pooled_height;
  const T *roi_info = rois_dram + (*roi_n) * ROI_OFFSET;
  const T offset = params.aligned ? (T)0.5 : (T)0.0;
  *batch_idx = (int)roi_info[0];
  *roi_center_x = roi_info[1] * (T)params.spatial_scale - offset;
  *roi_center_y = roi_info[2] * (T)params.spatial_scale - offset;
  *roi_width = roi_info[3] * (T)params.spatial_scale;
  *roi_height = roi_info[4] * (T)params.spatial_scale;
  // A clockwise layout flips the rotation direction.
  *theta = params.clockwise ? -roi_info[5] : roi_info[5];
  if (!params.aligned) {
    // Legacy behaviour: rois span at least one pixel.
    *roi_width = *roi_width > (T)1.0 ? *roi_width : (T)1.0;
    *roi_height = *roi_height > (T)1.0 ? *roi_height : (T)1.0;
  }
}
// Forward pass of rotated RoIAlign with average pooling.  Each core handles
// bins taskId, taskId + taskDim, ...  Sample coordinates are rotated by
// theta around the roi centre, and the four bilinear taps of each sample are
// staged in NRAM with a ping/pong scheme so the next sample's loads can be
// issued while the current sample is reduced.  The feature map is addressed
// as (batch, height, width, channel).
template <typename T>
__mlu_func__ void roiAlignRotatedForward(const T *input_dram,
                                         const T *rois_dram, const int batch,
                                         const int height, const int width,
                                         const int channel, const int rois_num,
                                         const RoiAlignRotatedParams &params,
                                         T *output_dram) {
  int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
  // NRAM holds 1 output slice + 2 * SAMPLING_NUM tap slices (ping + pong),
  // which caps how many channels can be processed per pass.
  int channel_max_cap = MAX_NRAM_SIZE / sizeof(T) / (2 * SAMPLING_NUM + 1);
  channel_max_cap = channel_max_cap / align_base_128 * align_base_128;
  int channel_align = channel < channel_max_cap ? channel : channel_max_cap;
  channel_align = CEIL_ALIGN(channel_align, align_base_128);
  T *nram_out = (T *)nram_buffer;
  T *nram_ping = nram_out + channel_align;
  T *nram_pong = nram_ping + channel_align * SAMPLING_NUM;
  int bin_first = taskId;
  int bin_end = rois_num * params.pooled_height * params.pooled_width;
  for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
    T roi_center_x, roi_center_y, roi_width, roi_height, theta;
    int batch_idx, roi_n, pw, ph;
    getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
                  &roi_center_x, &roi_center_y, &roi_width, &roi_height,
                  &theta);
    T bin_size_h = roi_height / params.pooled_height;
    T bin_size_w = roi_width / params.pooled_width;
    // Sampling grid: fixed when sample_ratio > 0, adaptive otherwise.
    int roi_bin_grid_h =
        (params.sample_ratio > 0)
            ? params.sample_ratio
            : __float2int_up((float)roi_height / params.pooled_height);
    int roi_bin_grid_w =
        (params.sample_ratio > 0)
            ? params.sample_ratio
            : __float2int_up((float)roi_width / params.pooled_width);
    // Bin coordinates are relative to the roi centre before rotation.
    T roi_start_y = -roi_height / 2;
    T roi_start_x = -roi_width / 2;
    const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
                            ? roi_bin_grid_h * roi_bin_grid_w
                            : 1;
    T cos_theta = std::cos(theta);
    T sin_theta = std::sin(theta);
    // Averaging factor applied after all samples are summed.
    T zero_sign = 1.0f / bin_dim;
    bool is_first_sample = true;
    int src_offset = 0;
    int dst_offset = 0;
    int c_rem, c_slice, c_slice_align, pongc_slice, pongc_slice_align;
    for (int c_offset = 0; c_offset < channel; c_offset += channel_align) {
      __bang_write_value(nram_out, channel_align, (T)0);
      c_rem = channel - c_offset;
      c_slice = channel_align > c_rem ? c_rem : channel_align;
      c_slice_align = CEIL_ALIGN(c_slice, align_base_128);
      is_first_sample = true;
      for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
        const T yy = roi_start_y + ph * bin_size_h +
                     T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
        for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
          const T xx = roi_start_x + pw * bin_size_w +
                       T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
          int sample_i = iy * roi_bin_grid_w + ix;
          // Rotate the sample point around the roi centre.
          T y = yy * cos_theta - xx * sin_theta + roi_center_y;
          T x = yy * sin_theta + xx * cos_theta + roi_center_x;
          T w1, w2, w3, w4;
          bool empty = false;
          int x_low, x_high, y_low, y_high;
          bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low,
                              &x_high, &y_low, &y_high, &empty);
          /*******************************************************
            | ping  | pong |
            |------|-----|-----|-----|-----|-----|-----|-----|-----|
            |output| p1  | p2  | p3  | p4  | p1  | p2  | p3  | p4  |
            |------|-----|-----|-----|-----|-----|-----|-----|-----|
          ********************************************************/
          if (is_first_sample && !empty) {
            // Only the very first sample loads into ping directly; every
            // later sample's data was preloaded into pong below.
            __bang_write_value(nram_ping, SAMPLING_NUM * c_slice_align, (T)0);
            src_offset =
                (batch_idx * height * width + y_low * width + x_low) * channel +
                c_offset;
            dst_offset = 0;
            __memcpy(nram_ping + dst_offset, input_dram + src_offset,
                     c_slice * sizeof(T), GDRAM2NRAM);
            src_offset = (batch_idx * height * width + y_low * width + x_high) *
                             channel +
                         c_offset;
            dst_offset = c_slice_align;
            __memcpy(nram_ping + dst_offset, input_dram + src_offset,
                     c_slice * sizeof(T), GDRAM2NRAM);
            src_offset = (batch_idx * height * width + y_high * width + x_low) *
                             channel +
                         c_offset;
            dst_offset = c_slice_align * 2;
            __memcpy(nram_ping + dst_offset, input_dram + src_offset,
                     c_slice * sizeof(T), GDRAM2NRAM);
            src_offset =
                (batch_idx * height * width + y_high * width + x_high) *
                    channel +
                c_offset;
            dst_offset = c_slice_align * 3;
            __memcpy(nram_ping + dst_offset, input_dram + src_offset,
                     c_slice * sizeof(T), GDRAM2NRAM);
          }
          // Preload the next sample's four taps into the pong buffer.
          if (sample_i + 1 < bin_dim) {
            int p_iy = (sample_i + 1) / roi_bin_grid_w;
            int p_ix = (sample_i + 1) % roi_bin_grid_w;
            const T p_yy = roi_start_y + ph * bin_size_h +
                           T(p_iy + 0.5) * bin_size_h / roi_bin_grid_h;
            const T p_xx = roi_start_x + pw * bin_size_w +
                           T(p_ix + 0.5) * bin_size_w / roi_bin_grid_w;
            T p_y = p_yy * cos_theta - p_xx * sin_theta + roi_center_y;
            T p_x = p_yy * sin_theta + p_xx * cos_theta + roi_center_x;
            T p_w1, p_w2, p_w3, p_w4;
            bool p_empty = false;
            int p_x_low, p_x_high, p_y_low, p_y_high;
            bilinearInterpolate(height, width, p_x, p_y, &p_w1, &p_w2, &p_w3,
                                &p_w4, &p_x_low, &p_x_high, &p_y_low, &p_y_high,
                                &p_empty);
            pongc_slice = c_slice;
            pongc_slice_align = c_slice_align;
            if (!p_empty) {
              __bang_write_value(nram_pong, SAMPLING_NUM * pongc_slice_align,
                                 (T)0);
              src_offset =
                  (batch_idx * height * width + p_y_low * width + p_x_low) *
                      channel +
                  c_offset;
              dst_offset = 0;
              __memcpy(nram_pong + dst_offset, input_dram + src_offset,
                       c_slice * sizeof(T), GDRAM2NRAM);
              src_offset =
                  (batch_idx * height * width + p_y_low * width + p_x_high) *
                      channel +
                  c_offset;
              dst_offset = pongc_slice_align;
              __memcpy(nram_pong + dst_offset, input_dram + src_offset,
                       c_slice * sizeof(T), GDRAM2NRAM);
              src_offset =
                  (batch_idx * height * width + p_y_high * width + p_x_low) *
                      channel +
                  c_offset;
              dst_offset = pongc_slice_align * 2;
              __memcpy(nram_pong + dst_offset, input_dram + src_offset,
                       c_slice * sizeof(T), GDRAM2NRAM);
              src_offset =
                  (batch_idx * height * width + p_y_high * width + p_x_high) *
                      channel +
                  c_offset;
              dst_offset = pongc_slice_align * 3;
              __memcpy(nram_pong + dst_offset, input_dram + src_offset,
                       c_slice * sizeof(T), GDRAM2NRAM);
            }
          }
          // Weighted sum of the four taps; tmp_sum aliases the w4 slice,
          // which sumpool reduces in place together with the other three.
          T *tmp_sum = nram_ping + 3 * c_slice_align;
          if (empty) {
            __bang_write_value(tmp_sum, c_slice_align, T(0));
          } else {
            __bang_mul_scalar(nram_ping, nram_ping, w1, c_slice_align);
            __bang_mul_scalar(nram_ping + c_slice_align,
                              nram_ping + c_slice_align, w2, c_slice_align);
            __bang_mul_scalar(nram_ping + 2 * c_slice_align,
                              nram_ping + 2 * c_slice_align, w3, c_slice_align);
            __bang_mul_scalar(nram_ping + 3 * c_slice_align,
                              nram_ping + 3 * c_slice_align, w4, c_slice_align);
            __bang_sumpool(tmp_sum, nram_ping, c_slice_align, 1, SAMPLING_NUM,
                           1, SAMPLING_NUM, 1, 1);
          }
          __bang_add(nram_out, nram_out, tmp_sum, c_slice_align);
          swap(nram_ping, nram_pong);
          __asm__ volatile("sync;");
          is_first_sample = false;
        }
      }
      // Average over the sampling grid and store this channel chunk.
      __bang_mul_scalar(nram_out, nram_out, zero_sign, c_slice_align);
      // store the result to dram
      int output_offset =
          ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
              channel +
          c_offset;
      __memcpy(output_dram + output_offset, nram_out, c_slice * sizeof(T),
               NRAM2GDRAM);
    }
  }
}
// Backward pass of rotated RoIAlign.  Each core handles bins taskId,
// taskId + taskDim, ...  The incoming gradient of a bin is split into
// channel chunks of c_limit elements; for every sampling point the chunk is
// scaled by weight / bin_dim and atomically added to the four bilinear taps
// of the image gradient.  A ping/pong scheme preloads the next chunk (or
// next bin) while the current chunk is scattered.
template <typename T>
__mlu_func__ void roiAlignRotatedBackward(const T *top_grad_dram,
                                          const T *rois_dram, const int batch,
                                          const int height, const int width,
                                          const int channel, const int rois_num,
                                          const RoiAlignRotatedParams &params,
                                          T *bottom_grad_dram) {
  int align_base_128 = NFU_ALIGN_SIZE / sizeof(T);
  int channel_align = CEIL_ALIGN(channel, align_base_128);
  // NRAM is quartered: {ping input, ping output, pong input, pong output}.
  unsigned int max_element = MAX_NRAM_SIZE / sizeof(T);
  int c_limit = max_element >> 2;
  c_limit = c_limit > channel_align ? channel_align : c_limit;
  T *nram_ping = (T *)nram_buffer;
  T *nram_pong = nram_ping + 2 * c_limit;
  T *nram_output = nullptr;
  int bin_first = taskId;
  int bin_end = rois_num * params.pooled_height * params.pooled_width;
  bool is_first_bin = true;
  T roi_center_x, roi_center_y, roi_width, roi_height, theta;
  int batch_idx, roi_n, pw, ph;
  T pong_roi_center_x, pong_roi_center_y, pong_roi_width, pong_roi_height,
      pong_theta;
  int pong_batch_idx, pong_roi_n, pong_pw, pong_ph;
  for (int bin_i = bin_first; bin_i < bin_end; bin_i += taskDim) {
    getRoiBinInfo(rois_dram, bin_i, params, &batch_idx, &roi_n, &pw, &ph,
                  &roi_center_x, &roi_center_y, &roi_width, &roi_height,
                  &theta);
    T bin_size_h = roi_height / params.pooled_height;
    T bin_size_w = roi_width / params.pooled_width;
    // Sampling grid: fixed when sample_ratio > 0, adaptive otherwise.
    int roi_bin_grid_h =
        (params.sample_ratio > 0)
            ? params.sample_ratio
            : __float2int_up((float)roi_height / params.pooled_height);
    int roi_bin_grid_w =
        (params.sample_ratio > 0)
            ? params.sample_ratio
            : __float2int_up((float)roi_width / params.pooled_width);
    T roi_start_y = -roi_height / 2;
    T roi_start_x = -roi_width / 2;
    const int bin_dim = roi_bin_grid_h * roi_bin_grid_w > 1
                            ? roi_bin_grid_h * roi_bin_grid_w
                            : 1;
    T cos_theta = std::cos(theta);
    T sin_theta = std::sin(theta);
    // Averaging factor folded into every tap weight.
    T zero_sign = 1.0f / bin_dim;
    // NOTE(review): c_slice is declared fresh each bin iteration but is only
    // assigned in the is_first_bin branch; for later bins the read at the
    // top of the while loop relies on the previous iteration's value
    // surviving in the same stack slot — verify against upstream.
    int c_rem, c_slice, pongc_slice, c_offset;
    c_rem = channel;
    c_offset = 0;
    /****************************************
      | ping     | pong     |
      |---------|---------|---------|---------|
      | input   | output  | input   | output  |
      |---------|---------|---------|---------|
    *****************************************/
    if (is_first_bin) {
      // load the first top_grad to nram
      c_slice = c_limit < c_rem ? c_limit : c_rem;
      int top_grad_offset =
          ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
          channel;
      __memcpy(nram_ping, top_grad_dram + top_grad_offset, c_slice * sizeof(T),
               GDRAM2NRAM);
    }
    nram_output = nram_ping + c_limit;
    while (c_rem > 0) {
      c_slice = c_slice < c_rem ? c_slice : c_rem;
      // load the next top_grad to nram
      if (c_rem - c_slice > 0) {
        // load the rest channels to nram
        pongc_slice = (c_rem - c_slice > c_slice) ? c_slice : c_rem - c_slice;
        int top_grad_offset =
            ((roi_n * params.pooled_height + ph) * params.pooled_width + pw) *
                channel +
            c_offset + c_slice;
        __memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
                       pongc_slice * sizeof(T), GDRAM2NRAM);
      } else if (bin_i + taskDim < bin_end) {
        // load next bin's data to nram
        getRoiBinInfo(rois_dram, bin_i + taskDim, params, &pong_batch_idx,
                      &pong_roi_n, &pong_pw, &pong_ph, &pong_roi_center_x,
                      &pong_roi_center_y, &pong_roi_width, &pong_roi_height,
                      &pong_theta);
        pongc_slice = c_limit < channel ? c_limit : channel;
        // NOTE(review): the copy below transfers c_slice elements, but the
        // next bin's first chunk is pongc_slice wide; verify for
        // channel % c_limit != 0.
        int top_grad_offset = ((pong_roi_n * params.pooled_height + pong_ph) *
                                   params.pooled_width +
                               pong_pw) *
                              channel;
        __memcpy_async(nram_pong, top_grad_dram + top_grad_offset,
                       c_slice * sizeof(T), GDRAM2NRAM);
      }
      // Scatter the current chunk's gradient over the sampling grid.
      for (int iy = 0; iy < roi_bin_grid_h; ++iy) {
        const T yy = roi_start_y + ph * bin_size_h +
                     T(iy + 0.5) * bin_size_h / roi_bin_grid_h;
        for (int ix = 0; ix < roi_bin_grid_w; ++ix) {
          const T xx = roi_start_x + pw * bin_size_w +
                       T(ix + 0.5) * bin_size_w / roi_bin_grid_w;
          // Rotate the sample point around the roi centre.
          T y = yy * cos_theta - xx * sin_theta + roi_center_y;
          T x = yy * sin_theta + xx * cos_theta + roi_center_x;
          T w1, w2, w3, w4;
          bool empty = false;
          int x_low, x_high, y_low, y_high;
          bilinearInterpolate(height, width, x, y, &w1, &w2, &w3, &w4, &x_low,
                              &x_high, &y_low, &y_high, &empty);
          if (empty) {
            continue;
          } else {
            __bang_mul_scalar(nram_output, nram_ping, w1 * zero_sign, c_limit);
            __bang_atomic_add(
                (T *)nram_output,
                bottom_grad_dram + batch_idx * height * width * channel +
                    y_low * width * channel + x_low * channel + c_offset,
                (T *)nram_output, c_slice);
            __bang_mul_scalar(nram_output, nram_ping, w2 * zero_sign, c_limit);
            __bang_atomic_add(
                (T *)nram_output,
                bottom_grad_dram + batch_idx * height * width * channel +
                    y_low * width * channel + x_high * channel + c_offset,
                (T *)nram_output, c_slice);
            __bang_mul_scalar(nram_output, nram_ping, w3 * zero_sign, c_limit);
            __bang_atomic_add(
                (T *)nram_output,
                bottom_grad_dram + batch_idx * height * width * channel +
                    y_high * width * channel + x_low * channel + c_offset,
                (T *)nram_output, c_slice);
            __bang_mul_scalar(nram_output, nram_ping, w4 * zero_sign, c_limit);
            __bang_atomic_add(
                (T *)nram_output,
                bottom_grad_dram + batch_idx * height * width * channel +
                    y_high * width * channel + x_high * channel + c_offset,
                (T *)nram_output, c_slice);
          }
        }
      }
      swap(nram_ping, nram_pong);
      c_rem -= c_slice;
      c_offset += c_slice;
      __asm__ volatile("sync;");
    }
    is_first_bin = false;
  }
}
// Device entry point for rotated RoIAlign forward: dispatches to the float
// or half instantiation according to `data_type`.
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedForward(
    const void *features, const void *rois, void *output, const int batch,
    const int height, const int width, const int channel, const int rois_num,
    const RoiAlignRotatedParams rroiAlignParams,
    const cnrtDataType_t data_type) {
  // The memory core takes no part in the computation.
  if (0x80 == coreId) {
    return;
  }
  if (data_type != CNRT_FLOAT32) {
    // Anything that is not fp32 is treated as fp16, as in the original.
    roiAlignRotatedForward((half *)features, (half *)rois, batch, height, width,
                           channel, rois_num, rroiAlignParams, (half *)output);
  } else {
    roiAlignRotatedForward((float *)features, (float *)rois, batch, height,
                           width, channel, rois_num, rroiAlignParams,
                           (float *)output);
  }
}
// Device entry point for rotated RoIAlign backward: dispatches to the float
// or half instantiation according to `data_type`.
__mlu_global__ void MLUUnion1KernelRoiAlignRotatedBackward(
    const void *top_grad, const void *rois, void *bottom_grad, const int batch,
    const int height, const int width, const int channel, const int rois_num,
    const RoiAlignRotatedParams rroiAlignParams,
    const cnrtDataType_t data_type) {
  // The memory core takes no part in the computation.
  if (0x80 == coreId) {
    return;
  }
  if (data_type != CNRT_FLOAT32) {
    // Anything that is not fp32 is treated as fp16, as in the original.
    roiAlignRotatedBackward((half *)top_grad, (half *)rois, batch, height,
                            width, channel, rois_num, rroiAlignParams,
                            (half *)bottom_grad);
  } else {
    roiAlignRotatedBackward((float *)top_grad, (float *)rois, batch, height,
                            width, channel, rois_num, rroiAlignParams,
                            (float *)bottom_grad);
  }
}
// Host-side launcher for the rotated RoIAlign forward kernel.
// k_dim/k_type/queue configure the MLU launch; d_type selects the half or
// float instantiation inside the kernel.  features/rois/output are device
// (GDRAM) pointers.
void KernelRoiAlignRotatedForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const void *features, const void *rois,
    void *output, const int batch, const int height, const int width,
    const int channel, const int rois_num,
    const RoiAlignRotatedParams roiAlignRotatedParams) {
  MLUUnion1KernelRoiAlignRotatedForward<<<k_dim, k_type, queue>>>(
      features, rois, output, batch, height, width, channel, rois_num,
      roiAlignRotatedParams, d_type);
}
// Host-side launcher for the rotated RoIAlign backward kernel.
// k_dim/k_type/queue configure the MLU launch; d_type selects the half or
// float instantiation inside the kernel.  top_grad/rois/bottom_grad are
// device (GDRAM) pointers.
void KernelRoiAlignRotatedBackward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const void *top_grad, const void *rois,
    void *bottom_grad, const int batch, const int height, const int width,
    const int channel, const int rois_num,
    const RoiAlignRotatedParams roiAlignRotatedParams) {
  MLUUnion1KernelRoiAlignRotatedBackward<<<k_dim, k_type, queue>>>(
      top_grad, rois, bottom_grad, batch, height, width, channel, rois_num,
      roiAlignRotatedParams, d_type);
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#ifndef ROI_ALIGN_ROTATED_UTILS_HPP_
#define ROI_ALIGN_ROTATED_UTILS_HPP_
// Launch parameters shared by the rotated RoIAlign forward/backward kernels.
struct RoiAlignRotatedParams {
  int pooled_height;    // number of output bin rows per roi
  int pooled_width;     // number of output bin columns per roi
  int sample_ratio;     // sampling points per bin axis; <= 0 means adaptive
  float spatial_scale;  // scale from roi coordinates to feature-map space
  bool aligned;         // shift sampled coordinates by -0.5 when true
  bool clockwise;       // negate the roi's rotation angle when true
};
#endif // ROI_ALIGN_ROTATED_UTILS_HPP_
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ALIGN_SIZE 64
#define PIPELINE_COMMON_NUM 2
#define PIPELINE_PINGPONG_NUM 10
__nram__ char nram_buffer[MAX_NRAM_SIZE];
namespace forward {
// Computes, for output bin `bin_i`, the clipped input-patch rectangle it
// pools over, the base pointer of the owning batch image, and whether the
// bin is empty. RoI records are laid out as {batch, x1, y1, x2, y2}.
template <typename T>
__mlu_func__ void getRoiBinInfo(T *input_v, T *rois_v, int bin_i, int height,
                                int width, int channels, int p_height,
                                int p_width, T spatial_scale, int *bin_x1,
                                int *bin_y1, int *bin_x2, int *bin_y2,
                                int *bin_wdim, int *bin_hdim, int *bin_dims,
                                T **input_base, bool *is_empty) {
  // Decompose the flat bin index into (roi, pooled-row, pooled-col).
  const int pw = bin_i % p_width;
  const int ph = (bin_i / p_width) % p_height;
  const int roi_n = bin_i / p_width / p_height;
  // Scale the RoI box into feature-map coordinates.
  const T *roi = rois_v + roi_n * 5;  // {{batch, x1, y1, x2, y2},,,}
  const int batch_idx = (int)roi[0];
  const int roi_x1 = round(roi[1] * spatial_scale);
  const int roi_y1 = round(roi[2] * spatial_scale);
  const int roi_x2 = round(roi[3] * spatial_scale);
  const int roi_y2 = round(roi[4] * spatial_scale);
  // Malformed RoIs are forced to at least 1x1.
  int roi_w = roi_x2 - roi_x1 + 1;
  roi_w = roi_w > 1 ? roi_w : 1;
  int roi_h = roi_y2 - roi_y1 + 1;
  roi_h = roi_h > 1 ? roi_h : 1;
  // Fractional bin extents.
  const T bin_w = (T)roi_w / (T)p_width;
  const T bin_h = (T)roi_h / (T)p_height;
  // Bin corners, clamped into [0, width] / [0, height].
  int x1 = (int)floor((T)pw * bin_w) + roi_x1;
  int y1 = (int)floor((T)ph * bin_h) + roi_y1;
  int x2 = (int)ceil((T)(pw + 1) * bin_w) + roi_x1;
  int y2 = (int)ceil((T)(ph + 1) * bin_h) + roi_y1;
  x1 = x1 < 0 ? 0 : (x1 > width ? width : x1);
  y1 = y1 < 0 ? 0 : (y1 > height ? height : y1);
  x2 = x2 < 0 ? 0 : (x2 > width ? width : x2);
  y2 = y2 < 0 ? 0 : (y2 > height ? height : y2);
  *bin_x1 = x1;
  *bin_y1 = y1;
  *bin_x2 = x2;
  *bin_y2 = y2;
  // Base of the batch image this RoI indexes into (HWC layout implied by
  // the offset arithmetic).
  *input_base = input_v + batch_idx * height * width * channels;
  *bin_wdim = x2 - x1;
  *bin_hdim = y2 - y1;
  *bin_dims = (*bin_hdim) * (*bin_wdim);
  *is_empty = (y2 <= y1) || (x2 <= x1);
}
// Forward RoI max-pooling (Union1 task). The rois_num * p_height * p_width
// output bins are split across the taskDim cores; for every bin the input
// patch is staged into NRAM with ping/pong double buffering (the next copy
// is issued with __memcpy_async while the current slice is pooled), reduced
// per channel with __bang_maxpool, and the maxima are written to output_v.
// When argmax is non-NULL the flat input index of each channel maximum is
// also computed and stored. half data is widened to float on chip before
// pooling and narrowed again on store.
template <typename T>
__mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
                                   int channels, int height, int width,
                                   int p_height, int p_width, int rois_num,
                                   T spatial_scale, T *output_v, int *argmax) {
  /*
   * NRAM partition
   * |---------------------------------------------------|
   * | ping |
   * |---------------------------------------------------|
   * | pong |
   * |---------------------------------------------------|
   * | out |
   * |---------------------------------------------------|
   * | argmax |
   * |---------------------------------------------------|
   * | a |
   * |---------------------------------------------------|
   * | b |
   * |---------------------------------------------------|
   */
  uint32_t is_half = sizeof(T) == sizeof(half) ? true : false;
  uint32_t t_size = sizeof(T);
  uint32_t float_div = NFU_ALIGN_SIZE / sizeof(float);
  uint32_t half_div = NFU_ALIGN_SIZE / sizeof(half);
  // Channel count rounded up to the float alignment unit.
  uint32_t channels_align = PAD_UP(channels, float_div);
  // Elements available to each of the two ping/pong buffers after reserving
  // the four channels_align-sized scratch areas (out, argmax, a, b).
  uint32_t nram_limit = PAD_DOWN(
      (MAX_NRAM_SIZE / sizeof(float) - 4 * channels_align) / 2, half_div);
  // nram PING/PONG, output, argamx, a, b
  float *nram_ping = (float *)nram_buffer;
  float *nram_pong = (float *)nram_buffer + nram_limit;
  float *nram_out = (float *)nram_buffer + 2 * nram_limit;
  float *nram_argmax = nram_out + channels_align;
  float *nram_a = nram_out + 2 * channels_align;
  float *nram_b = nram_out + 3 * channels_align;
  // Even split of all bins across cores; the first rem_bins cores each take
  // one extra bin.
  uint32_t c_bins_num = rois_num * p_height * p_width;
  uint32_t task_bins = c_bins_num / taskDim;
  uint32_t rem_bins = c_bins_num % taskDim;
  if (taskId < rem_bins) {
    task_bins += 1;
  }
  int bin_first =
      (c_bins_num / taskDim) * taskId + (taskId > rem_bins ? rem_bins : taskId);
  int bins_loop = bin_first + task_bins;
  T *input_base = NULL;
  T *output_base = output_v + bin_first * channels;
  int *argmax_base = NULL != argmax ? argmax + bin_first * channels : NULL;
  int bin_x1, bin_y1, bin_x2, bin_y2, bin_wdim, bin_hdim, bin_dims;
  int pbin_x1, pbin_y1, pbin_x2, pbin_y2, pbin_wdim, pbin_hdim, pbin_dims;
  bool is_empty = false;
  bool pong_is_empty = false;
  bool is_first_bin = true;
  uint32_t src_offset = 0;
  uint32_t dst_offset = 0;
  uint32_t nram_offset = 0;
  // For half data the raw copy lands in the upper half of the ping/pong
  // buffer so __bang_half2float can expand it in place from offset 0.
  uint32_t half_offset =
      is_half ? (nram_limit / 2 / half_div * half_div) * 2 : 0;
  float *nram_tmp = NULL;
  uint32_t c_slice = 0;
  uint32_t c_slice_align = 0;
  uint32_t pongc_slice = 0;
  uint32_t pongc_slice_align = 0;
  for (int bin_i = bin_first; bin_i < bins_loop; bin_i++) {
    getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i, height, width, channels,
                  p_height, p_width, (T)spatial_scale, &bin_x1, &bin_y1,
                  &bin_x2, &bin_y2, &bin_wdim, &bin_hdim, &bin_dims,
                  &input_base, &is_empty);
    uint32_t c_rem = channels;
    // Channels per NRAM slice for this bin, aligned down to float_div.
    // NOTE(review): bin_dims is 0 for an empty bin, so this divides by zero
    // before the is_empty branch below is reached -- confirm this is benign
    // on the target architecture.
    c_slice = nram_limit / bin_dims / float_div * float_div;
    if (is_first_bin && !is_empty) {
      // Synchronous load of the very first slice; later slices/bins arrive
      // through the async prefetches issued inside the while loop.
      c_slice = c_slice > c_rem ? c_rem : c_slice;
      c_slice_align = PAD_UP(c_slice, float_div);
      for (int h = bin_y1; h < bin_y2; h++) {
        src_offset = (h * width + bin_x1) * channels;
        nram_offset = (h - bin_y1) * bin_wdim * c_slice_align + half_offset;
        if (c_slice_align == channels) {
          // Whole rows are contiguous: one flat copy per row.
          __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
                   bin_wdim * c_slice * t_size, GDRAM2NRAM);
        } else {
          // Strided copy: c_slice channels per pixel, bin_wdim pixels.
          __memcpy((T *)nram_ping + nram_offset, (T *)input_base + src_offset,
                   c_slice * t_size, GDRAM2NRAM, c_slice_align * t_size,
                   channels * t_size, bin_wdim - 1);
        }
      }
    }
    uint32_t c_offset = 0;
    while (c_rem > 0) {
      c_slice = c_slice > c_rem ? c_rem : c_slice;
      c_slice_align = PAD_UP(c_slice, float_div);
      /*__memcpy_async*/
      if (c_rem - c_slice > 0 && !is_empty) {
        // More channels left in this bin: prefetch the next channel slice
        // into the pong buffer while the current one is being pooled.
        pongc_slice = c_rem - c_slice > c_slice ? c_slice : c_rem - c_slice;
        pongc_slice_align = PAD_UP(pongc_slice, float_div);
        for (int h = bin_y1; h < bin_y2; h++) {
          src_offset = (h * width + bin_x1) * channels + c_offset;
          nram_offset =
              (h - bin_y1) * bin_wdim * pongc_slice_align + half_offset;
          __memcpy_async((T *)nram_pong + nram_offset,
                         (T *)input_base + src_offset + c_slice,
                         pongc_slice * t_size, GDRAM2NRAM,
                         pongc_slice_align * t_size, channels * t_size,
                         bin_wdim - 1);
        }
      } else if (bin_i + 1 < bins_loop) {
        // This bin finishes with the current slice: prefetch the first
        // slice of the next bin instead (also updates input_base).
        getRoiBinInfo((T *)input_v, (T *)rois_v, bin_i + 1, height, width,
                      channels, p_height, p_width, (T)spatial_scale, &pbin_x1,
                      &pbin_y1, &pbin_x2, &pbin_y2, &pbin_wdim, &pbin_hdim,
                      &pbin_dims, &input_base, &pong_is_empty);
        pongc_slice = PAD_DOWN(nram_limit / pbin_dims, float_div);
        pongc_slice = pongc_slice > channels ? channels : pongc_slice;
        pongc_slice_align = PAD_UP(pongc_slice, float_div);
        if (!pong_is_empty) {
          for (int h = pbin_y1; h < pbin_y2; h++) {
            src_offset = (h * width + pbin_x1) * channels;
            nram_offset =
                (h - pbin_y1) * pbin_wdim * pongc_slice_align + half_offset;
            if (pongc_slice_align == channels) {
              __memcpy_async((T *)nram_pong + nram_offset,
                             (T *)input_base + src_offset,
                             pbin_wdim * pongc_slice * t_size, GDRAM2NRAM);
            } else {
              __memcpy_async((T *)nram_pong + nram_offset,
                             (T *)input_base + src_offset, pongc_slice * t_size,
                             GDRAM2NRAM, pongc_slice_align * t_size,
                             channels * t_size, pbin_wdim - 1);
            }
          }
        }
      }
      if (is_empty) {
        // Empty bin: write zeros to the output (and -1 to argmax).
        __bang_write_value((T *)nram_out, c_slice_align, (T)0);
        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
                 c_slice * t_size, NRAM2GDRAM);
        if (NULL != argmax) {
          __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
                   (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
        }
      } else {
        if (is_half) {
          // Widen the staged half data to float in place before pooling.
          uint32_t bin_align64 = PAD_UP(bin_dims * c_slice_align, half_div);
          __bang_half2float((float *)nram_ping, (half *)nram_ping + half_offset,
                            bin_align64);
        }
        // Per-channel max over the whole bin_hdim x bin_wdim patch.
        __bang_maxpool((float *)nram_out, (float *)nram_ping, c_slice_align,
                       bin_hdim, bin_wdim, bin_hdim, bin_wdim, 1, 1);
        if (is_half) {
          uint32_t c_align64 = PAD_UP(c_slice_align, half_div);
          __bang_float2half_rd((half *)nram_out, (float *)nram_out, c_align64);
        }
        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
                 c_slice * t_size, NRAM2GDRAM);
        if (NULL != argmax) {
          /*compute max_index*/
          // Patch-local flat index of each channel's maximum.
          __bang_maxpool_index((uint32_t *)nram_out, (float *)nram_ping,
                               c_slice_align, bin_hdim, bin_wdim, bin_hdim,
                               bin_wdim, 1, 1);
          convertInt2Float((float *)nram_argmax, (float *)nram_a,
                           (int32_t *)nram_out, (float *)nram_b, c_slice_align);
          /*compute input_h*/
          // Scalar integer division per channel: local_h = idx / bin_wdim.
          for (int i = 0; i < c_slice; i++) {
            nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
          }
          // global index = (local_h + bin_y1) * width + (local_w + bin_x1)
          __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
                            c_slice_align);
          __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
                            c_slice_align);
          /*compute input_w*/
          __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
                            c_slice_align);
          __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
                     c_slice_align);
          __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
                            c_slice_align);
          __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
                     c_slice_align);
          convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
                           (float *)nram_out, (float *)nram_b, c_slice_align);
          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
                   (int32_t *)nram_argmax, c_slice * sizeof(int32_t),
                   NRAM2GDRAM);
        }
      }
      // Swap ping/pong and advance to the next channel slice.
      nram_tmp = nram_ping;
      nram_ping = nram_pong;
      nram_pong = nram_tmp;
      c_offset += c_slice;
      c_rem -= c_slice;
      // Barrier: make sure the async copy into the pong buffer finished
      // before the next iteration consumes it.
      __asm__ volatile("sync;");
    }
    dst_offset += channels;
    is_first_bin = false;
  }
}
// Kernel entry point: dispatches to the typed Union1 RoiPool implementation
// according to the runtime dtype. Unsupported dtypes are a silent no-op.
__mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
                                     const void *input_data,
                                     const void *input_rois, int batch,
                                     int channels, int height, int width,
                                     int pooled_height, int pooled_width,
                                     int rois_num, float spatial_scale,
                                     void *output_data, int *argmax) {
  if (data_type == CNRT_FLOAT16) {
    MLUUnion1Roipool((half *)input_data, (half *)input_rois, batch, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (half)spatial_scale, (half *)output_data, argmax);
  } else if (data_type == CNRT_FLOAT32) {
    MLUUnion1Roipool((float *)input_data, (float *)input_rois, batch, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (float)spatial_scale, (float *)output_data, argmax);
  }
}
} // namespace forward
namespace backward {
// Convert index of argmax from global grads_image to local bin in RoI. Vector
// operations do not support int type, so conversion from int to float is
// performed here.
// Input:  nram_argmax holds global indices of the form h * width + w.
// Output: nram_argmax_int holds bin-local indices
//         (h - hstart) * w_compute + (w - wstart), as consumed by
//         __bang_maxpool_bp. When loop_flag == 1 the bin is processed in
//         chunks of true_limit elements and the index is rebased into chunk
//         loop_id. The *_fp / *_bk buffers are float scratch areas.
__mlu_func__ void convertIndex(
    int32_t *nram_argmax, int32_t *nram_argmax_fp, int32_t *nram_argmax_fp_bk1,
    int32_t *nram_argmax_fp_bk2, int32_t *nram_argmax_int,
    int32_t *nram_argmax_int_h, int32_t *nram_argmax_int_w,
    int32_t *nram_argmax_fp_h, int32_t *nram_argmax_fp_w,
    float *nram_atomic_add, float *nram_grads_image, int width, int height,
    int wstart, int hstart, int w_compute, int h_compute, int align_c,
    int channels, int loop_flag, int loop_id, int true_limit) {
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
  // This step uses scalar division, because the above vector division causes
  // rounding accuracy problem.
  for (int i = 0; i < channels; ++i) {
    *((float *)nram_argmax_fp + i) = *((float *)nram_argmax_fp + i) / width;
  }
  // Use 'float2int_tz' to perform '*((int32_t*)nram_argmax + i) / width'
  // operation.
  convertFloat2Int((int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk1,
                   (float *)nram_argmax_fp, (float *)nram_argmax_fp_bk2,
                   align_c);
  // Back to float: nram_argmax_fp now holds the global row index h.
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax_int_h, (float *)nram_argmax_fp_bk2,
                   align_c);
  // Perform 'temp_result - hstart' operation
  __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
                    align_c);
  // Perform 'temp_result1 - temp_result2 * width' operation
  __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
                    align_c);
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
  __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
             (float *)nram_argmax_fp_w, align_c);
  // Perform 'temp_result - wstart' operation
  __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
                    wstart, align_c);
  // Perform 'temp_result = h * w_compute + w' operation
  __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
                    w_compute, align_c);
  __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
             (float *)nram_argmax_fp_w, align_c);
  if (loop_flag == 1) {
    // Chunked processing: shift the index into the current chunk.
    __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
                      (loop_id * true_limit), align_c);
  }
  convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
                   (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
                   align_c);
}
// Backward RoI max-pooling (Union1 task). Each core processes `loop`
// consecutive output bins. For every non-empty bin the pooled gradients and
// the stored argmax indices are loaded to NRAM, the global argmax indices
// are rebased to bin-local positions (convertIndex), __bang_maxpool_bp
// scatters each channel's gradient to its argmax position, and the result
// is accumulated into grads_image with __bang_atomic_add (bins from
// different RoIs may overlap on the same input pixels). Three NRAM layouts
// are used depending on what fits on chip: the whole bin, a chunk of bin
// positions, or a chunk of channels. high_precision == 1 widens half data
// to float around the pooling instruction.
template <typename T>
__mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
                                   const int32_t *argmax, T *grads_image,
                                   int channels, int height, int width,
                                   int pooled_height, int pooled_width,
                                   int rois_num, const T spatial_scale,
                                   int high_precision) {
  // Calculate the number of rois processed by each core
  int bin_num = rois_num * pooled_height * pooled_width;
  int loop =
      (bin_num % taskDim) ? (bin_num / taskDim + 1) : (bin_num / taskDim);
  int tid = taskId * loop;
  if (bin_num % taskDim != 0) {
    if (tid >= bin_num) {
      return;
    } else {
      // last part is (bin_num - tid).
      loop = bin_num - tid < loop ? bin_num - tid : loop;
    }
  }
  int align_c = PAD_UP(channels, ALIGN_SIZE);
  // Common part has 2: grads, argmax; ping-pong each is PIPELINE_PINGPONG_NUM.
  int data_size =
      PAD_DOWN(((MAX_NRAM_SIZE / sizeof(float) - PIPELINE_COMMON_NUM * align_c -
                 (PIPELINE_PINGPONG_NUM - 1) * align_c * 2) /
                2),
               ALIGN_SIZE);
  // Max number of bin positions whose aligned channel vectors fit at once.
  int hw_limit = data_size / align_c;
  float *nram_grads = (float *)nram_buffer;
  for (int idx = tid; idx < tid + loop; ++idx) {
    // (n, ph, pw) is a C in the pooled output
    int pw = idx % pooled_width;
    int ph = (idx / pooled_width) % pooled_height;
    int n = idx / pooled_width / pooled_height;
    const T *offset_rois = (const T *)(rois + n * 5);
    int roi_batch_ind = int(offset_rois[0]);
    // Calculate the roi region on feature maps
    int roi_start_w = round(offset_rois[1] * spatial_scale);
    int roi_start_h = round(offset_rois[2] * spatial_scale);
    int roi_end_w = round(offset_rois[3] * spatial_scale);
    int roi_end_h = round(offset_rois[4] * spatial_scale);
    // Force malformed rois to 1x1
    int roi_width =
        roi_end_w - roi_start_w + 1 > 1 ? roi_end_w - roi_start_w + 1 : 1;
    int roi_height =
        roi_end_h - roi_start_h + 1 > 1 ? roi_end_h - roi_start_h + 1 : 1;
    T bin_size_h = (T)roi_height / (T)pooled_height;
    T bin_size_w = (T)roi_width / (T)pooled_width;
    // The corresponding bin region
    int hstart = int(floor((T)ph * bin_size_h));
    int wstart = int(floor((T)pw * bin_size_w));
    int hend = int(ceil((T)(ph + 1) * bin_size_h));
    int wend = int(ceil((T)(pw + 1) * bin_size_w));
    // Add roi offsets and clip to input boundaries, min(max(A, B), C);
    hstart = hstart + roi_start_h > 0 ? hstart + roi_start_h : 0;
    hstart = hstart < height ? hstart : height;
    hend = hend + roi_start_h > 0 ? hend + roi_start_h : 0;
    hend = hend < height ? hend : height;
    wstart = wstart + roi_start_w > 0 ? wstart + roi_start_w : 0;
    wstart = wstart < width ? wstart : width;
    wend = wend + roi_start_w > 0 ? wend + roi_start_w : 0;
    wend = wend < width ? wend : width;
    bool is_empty = (hend <= hstart) || (wend <= wstart);
    if (!is_empty) {
      int h_compute = hend - hstart;
      int w_compute = wend - wstart;
      // Positions of this bin that fit in NRAM at once.
      int true_limit =
          hw_limit < h_compute * w_compute ? hw_limit : h_compute * w_compute;
      int loop_int = (h_compute * w_compute) / true_limit;
      int rem = (h_compute * w_compute) % true_limit;
      // Scratch areas carved out of nram_buffer, align_c apart.
      int32_t *nram_argmax = (int32_t *)nram_grads + align_c;
      int32_t *nram_argmax_fp = (int32_t *)nram_argmax + align_c;
      int32_t *nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
      int32_t *nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
      int32_t *nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
      int32_t *nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
      int32_t *nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
      int32_t *nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
      int32_t *nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
      float *nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
      float *nram_grads_image = (float *)nram_atomic_add + align_c;
      // Case 1: every position of the bin fits in NRAM in one shot.
      if (true_limit == h_compute * w_compute) {
        /*
         * NRAM partition
         * |---------------------------------------------------|
         * | grads |
         * |---------------------------------------------------|
         * | argmax |
         * |---------------------------------------------------|
         * | argmax_temp |
         * |---------------------------------------------------|
         * | atomic_add |
         * |---------------------------------------------------|
         * | grads_image |
         * |---------------------------------------------------|
         */
        // Load the data from GDRAM to NRAM.
        __memcpy(
            (T *)nram_grads + align_c * high_precision,
            (const T *)grads +
                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
                    channels,
            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          // half was staged at offset align_c; widen in place to float.
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }
        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
                                             (n * pooled_height * pooled_width +
                                              ph * pooled_width + pw) *
                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);
        // Perform pooling operation on NRAM.
        convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
                     nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
                     nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
                     nram_atomic_add, nram_grads_image, width, height, wstart,
                     hstart, w_compute, h_compute, align_c, channels, 0, 0, 0);
        __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
                          (int32_t *)nram_argmax_int, align_c, h_compute,
                          w_compute, h_compute, w_compute, h_compute,
                          w_compute);
        if (high_precision) {
          __bang_float2half_rd((half *)nram_grads_image,
                               (float *)nram_grads_image,
                               h_compute * w_compute * align_c);
        }
        // Store the result on NRAM back to GDRAM.
        for (int hc = 0; hc < h_compute; ++hc) {
          for (int wc = 0; wc < w_compute; ++wc) {
            T *dst = (T *)nram_atomic_add;
            int grad_image_offset = (roi_batch_ind * height * width +
                                     (hc + hstart) * width + wc + wstart) *
                                    channels;
            T *src1 = (T *)grads_image + grad_image_offset;
            int nram_grads_image_offset = (hc * w_compute + wc) * align_c;
            T *src2 = (T *)nram_grads_image + nram_grads_image_offset;
            // Atomic accumulation: other bins may target the same pixel.
            __bang_atomic_add(dst, src1, src2, channels);
          }
        }
      // Case 2: all channels fit, but positions are processed in chunks of
      // true_limit, ping-ponging the scratch area between iterations.
      } else if (true_limit > 0) {
        /*
         * NRAM partition
         * |---------------------------------------------------|
         * | grads |
         * |---------------------------------------------------|
         * | argmax |
         * |--------------------ping_pong----------------------|
         * | argmax_temp | argmax_temp |
         * |------------------------|--------------------------|
         * | atomic_add | atomic_add |
         * |------------------------|--------------------------|
         * | grads_image | grads_image |
         * |---------------------------------------------------|
         */
        // Load the data from GDRAM to NRAM.
        __memcpy(
            (T *)nram_grads + align_c * high_precision,
            (const T *)grads +
                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
                    channels,
            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }
        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
                                             (n * pooled_height * pooled_width +
                                              ph * pooled_width + pw) *
                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);
        int ping_pong = 0;
        int ping_pong_offset =
            (MAX_NRAM_SIZE / sizeof(float) - align_c * PIPELINE_COMMON_NUM) / 2;
        for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
          int size = (loop_id == loop_int) ? rem : true_limit;
          if (size == 0) {
            break;
          }
          // Perform pooling operation on NRAM.
          // Rebase the scratch pointers into the current ping/pong half.
          nram_argmax_fp =
              (int32_t *)nram_argmax + align_c + ping_pong * ping_pong_offset;
          nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + align_c;
          nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + align_c;
          nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + align_c;
          nram_argmax_int_h = (int32_t *)nram_argmax_int + align_c;
          nram_argmax_int_w = (int32_t *)nram_argmax_int_h + align_c;
          nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + align_c;
          nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + align_c;
          nram_atomic_add = (float *)nram_argmax_fp_w + align_c;
          nram_grads_image = (float *)nram_atomic_add + align_c;
          int loop_id_1 = loop_id;
          int size_1 = ((loop_id_1) == loop_int) ? rem : true_limit;
          if (size_1 == 0) {
            break;
          }
          convertIndex(nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
                       nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
                       nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
                       nram_atomic_add, nram_grads_image, width, height, wstart,
                       hstart, w_compute, h_compute, align_c, channels, 1,
                       loop_id_1, true_limit);
          // Scatter gradients over the size_1 positions of this chunk.
          __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
                            (int32_t *)nram_argmax_int, align_c, size_1, 1,
                            size_1, 1, size_1, 1);
          if (high_precision) {
            __bang_float2half_rd((half *)nram_grads_image,
                                 (float *)nram_grads_image, size_1 * align_c);
          }
          // Store the result on NRAM back to GDRAM.
          for (int index_size = 0; index_size < size; ++index_size) {
            int h = (loop_id * true_limit + index_size) / w_compute;
            int w = (loop_id * true_limit + index_size) % w_compute;
            T *dst = (T *)nram_atomic_add;
            T *grads_image_n =
                (T *)grads_image + roi_batch_ind * height * width * channels;
            T *src1 = (T *)grads_image_n +
                      ((h + hstart) * width + (w + wstart)) * channels;
            T *src2 = (T *)nram_grads_image + index_size * align_c;
            __bang_atomic_add(dst, src1, src2, channels);
          }
          ping_pong = 1 - ping_pong;
        }
      // Case 3: even one position's channels exceed NRAM; chunk over
      // channels (c_limit at a time) and walk the bin position by position.
      } else {
        /*
         * NRAM partition
         * |---------------------------------------------------|
         * | grads |
         * |---------------------------------------------------|
         * | argmax |
         * |--------------------ping_pong----------------------|
         * | argmax_temp | argmax_temp |
         * |------------------------|--------------------------|
         * | atomic_add | atomic_add |
         * |------------------------|--------------------------|
         * | grads_image | grads_image |
         * |---------------------------------------------------|
         */
        int c_limit =
            PAD_DOWN(MAX_NRAM_SIZE / sizeof(float) /
                         (PIPELINE_COMMON_NUM + PIPELINE_PINGPONG_NUM * 2),
                     ALIGN_SIZE);
        int loop_int = channels / c_limit;
        int rem = channels % c_limit;
        int ping_pong = 0;
        int ping_pong_offset =
            (MAX_NRAM_SIZE / sizeof(float) - c_limit * PIPELINE_COMMON_NUM) / 2;
        for (int loop_id = 0; loop_id <= loop_int; ++loop_id) {
          int size = (loop_id == loop_int) ? rem : c_limit;
          if (size == 0) {
            break;
          }
          nram_argmax_fp =
              (int32_t *)nram_argmax + c_limit + ping_pong * ping_pong_offset;
          nram_argmax_fp_bk1 = (int32_t *)nram_argmax_fp + c_limit;
          nram_argmax_fp_bk2 = (int32_t *)nram_argmax_fp_bk1 + c_limit;
          nram_argmax_int = (int32_t *)nram_argmax_fp_bk2 + c_limit;
          nram_argmax_int_h = (int32_t *)nram_argmax_int + c_limit;
          nram_argmax_int_w = (int32_t *)nram_argmax_int_h + c_limit;
          nram_argmax_fp_h = (int32_t *)nram_argmax_int_w + c_limit;
          nram_argmax_fp_w = (int32_t *)nram_argmax_fp_h + c_limit;
          nram_atomic_add = (float *)nram_argmax_fp_w + c_limit;
          nram_grads_image = (float *)nram_atomic_add + c_limit;
          // This pipeline loads the data from GDRAM to NRAM.
          __memcpy((T *)nram_grads + c_limit * high_precision,
                   (const T *)grads +
                       n * pooled_height * pooled_width * channels +
                       ph * pooled_width * channels + pw * channels +
                       loop_id * c_limit,
                   size * sizeof(T), GDRAM2NRAM);
          if (high_precision) {
            __bang_half2float((float *)nram_grads,
                              (half *)nram_grads + c_limit * high_precision,
                              c_limit);
          }
          __memcpy((int32_t *)nram_argmax,
                   (const int32_t *)argmax +
                       n * pooled_height * pooled_width * channels +
                       ph * pooled_width * channels + pw * channels +
                       loop_id * c_limit,
                   size * sizeof(int32_t), GDRAM2NRAM);
          for (int hc = 0; hc < h_compute; ++hc) {
            for (int wc = 0; wc < w_compute; ++wc) {
              // This pipeline performs pooling operation on NRAM.
              convertIndex(
                  nram_argmax, nram_argmax_fp, nram_argmax_fp_bk1,
                  nram_argmax_fp_bk2, nram_argmax_int, nram_argmax_int_h,
                  nram_argmax_int_w, nram_argmax_fp_h, nram_argmax_fp_w,
                  nram_atomic_add, nram_grads_image, width, height, wstart + wc,
                  hstart + hc, h_compute, w_compute, c_limit, size, 0, 0, 0);
              __bang_maxpool_bp((float *)nram_grads_image, (float *)nram_grads,
                                (int32_t *)nram_argmax_int, c_limit, 1, 1, 1, 1,
                                1, 1);
              if (high_precision) {
                __bang_float2half_rd((half *)nram_grads_image,
                                     (float *)nram_grads_image, c_limit);
              }
              // This pipeline stores the result on NRAM back to GDRAM.
              T *dst = (T *)nram_atomic_add;
              T *grads_image_n =
                  (T *)grads_image + roi_batch_ind * height * width * channels;
              T *src1 = (T *)grads_image_n +
                        ((hc + hstart) * width + (wc + wstart)) * channels +
                        loop_id * c_limit;
              T *src2 = (T *)nram_grads_image;
              __bang_atomic_add(dst, src1, src2, size);
            }
          }
          ping_pong = 1 - ping_pong;
        }
      }
    }
  }
}
// Kernel entry point for the RoiPool backward pass: skips the memory core
// and dispatches to the typed Union1 implementation. Unsupported dtypes are
// a silent no-op.
__mlu_global__ void MLUKernelRoiPoolBackward(
    const void *grads, const void *rois, const int *argmax, void *grads_image,
    int rois_num, int pooled_height, int pooled_width, int channels, int no,
    int height, int width, const float spatial_scale,
    const cnrtDataType_t k_dtype) {
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }
  if (k_dtype == CNRT_FLOAT16) {
    // high_precision = 1: half data is widened to float around the
    // '__bang_max_pool_bp' instruction to increase the bit width.
    MLUUnion1Roipool((const half *)rois, (const half *)grads,
                     (const int32_t *)argmax, (half *)grads_image, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (const half)spatial_scale, 1);
  } else if (k_dtype == CNRT_FLOAT32) {
    MLUUnion1Roipool((const float *)rois, (const float *)grads,
                     (const int32_t *)argmax, (float *)grads_image, channels,
                     height, width, pooled_height, pooled_width, rois_num,
                     (const float)spatial_scale, 0);
  }
}
} // namespace backward
// Host-side launcher: enqueues the RoiPool forward kernel on `queue` with
// the given task dimensions and function type. `argmax` is forwarded as-is
// and may be NULL when max indices are not needed (the kernel checks).
void KernelRoiPoolForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, cnrtDataType_t data_type,
                          const void *input_data, const void *input_rois,
                          const int batch, const int channels, const int height,
                          const int width, const int pooled_height,
                          const int pooled_width, const int rois_num,
                          const float spatial_scale, void *output_data,
                          int *argmax) {
  forward::MLUKernelRoiPool<<<k_dim, k_type, queue>>>(
      data_type, input_data, input_rois, batch, channels, height, width,
      pooled_height, pooled_width, rois_num, spatial_scale, output_data,
      argmax);
}
// Host-side launcher: enqueues the RoiPool backward kernel on `queue`.
// Gradients from `grad_output_ptr` are scattered to `grad_input_ptr`
// according to the `argmax_ptr` indices recorded by the forward pass.
void KernelRoiPoolBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                           cnrtQueue_t queue, cnrtDataType_t k_dtype,
                           const void *grad_output_ptr, const void *rois_ptr,
                           const int *argmax_ptr, void *grad_input_ptr,
                           const int box_num, const int pooled_height,
                           const int pooled_width, const int channels,
                           const int batch, const int height, const int width,
                           const float spatial_scale) {
  backward::MLUKernelRoiPoolBackward<<<k_dim, k_type, queue>>>(
      grad_output_ptr, rois_ptr, argmax_ptr, grad_input_ptr, box_num,
      pooled_height, pooled_width, channels, batch, height, width,
      spatial_scale, k_dtype);
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#define ROI_OFFSET 7
#define FLOAT_NRAM_BUFFER_NUM 14
#define HALF_NRAM_BUFFER_NUM 25
#define ALIGN_NUM 64
__nram__ char data_nram[MAX_NRAM_SIZE];
template <typename T>
__mlu_global__ void MLUUnion1KernelPtsIdxOfVoxels(
const int pool_method, const int boxes_num, const int pts_num,
const int max_pts_each_voxel, const int out_x, const int out_y,
const int out_z, const T *rois, const T *pts, int *pts_idx_of_voxels) {
// params (T)rois: (boxes_num, 7)
// params (T)pts: (3, pts_num)
// params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
// max_pts_each_voxel)
// make sure that memcore is not used
if (coreId == 0x80) {
return;
}
int nram_pts_num = 0;
if (sizeof(T) == sizeof(float)) {
nram_pts_num = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(float) / FLOAT_NRAM_BUFFER_NUM), ALIGN_NUM);
} else {
nram_pts_num = PAD_DOWN(
(MAX_NRAM_SIZE / sizeof(half) / HALF_NRAM_BUFFER_NUM), ALIGN_NUM);
}
char *X = NULL;
char *Y = NULL;
char *Z = NULL;
char *local_X = NULL;
char *local_Y = NULL;
char *local_Z = NULL;
char *nram_pts_in_flag = NULL;
float *temp_buffer1 = NULL;
float *temp_buffer2 = NULL;
float *temp_buffer3 = NULL;
float *temp_buffer4 = NULL;
float *temp_buffer5 = NULL;
float *nram_voxel_offset = NULL;
int *nram_pts_idx_seq = NULL;
float *fp_local_X = NULL;
float *fp_local_Y = NULL;
float *fp_local_Z = NULL;
float *fp_nram_pts_in_flag = NULL;
if (sizeof(T) == sizeof(float)) {
X = (char *)((float *)data_nram);
Y = (char *)((float *)data_nram + nram_pts_num);
Z = (char *)((float *)data_nram + nram_pts_num * 2);
local_X = (char *)((float *)data_nram + nram_pts_num * 3);
local_Y = (char *)((float *)data_nram + nram_pts_num * 4);
local_Z = (char *)((float *)data_nram + nram_pts_num * 5);
nram_pts_in_flag = (char *)((float *)data_nram + nram_pts_num * 6);
temp_buffer1 = (float *)data_nram + nram_pts_num * 7;
temp_buffer2 = (float *)data_nram + nram_pts_num * 8;
temp_buffer3 = (float *)data_nram + nram_pts_num * 9;
temp_buffer4 = (float *)data_nram + nram_pts_num * 10;
temp_buffer5 = (float *)data_nram + nram_pts_num * 11;
nram_voxel_offset = (float *)data_nram + nram_pts_num * 12;
nram_pts_idx_seq = (int *)((float *)data_nram + nram_pts_num * 13);
fp_local_X = (float *)local_X;
fp_local_Y = (float *)local_Y;
fp_local_Z = (float *)local_Z;
fp_nram_pts_in_flag = (float *)nram_pts_in_flag;
} else {
X = (char *)((half *)data_nram);
Y = (char *)((half *)data_nram + nram_pts_num);
Z = (char *)((half *)data_nram + nram_pts_num * 2);
local_X = (char *)((half *)data_nram + nram_pts_num * 4);
local_Y = (char *)((half *)data_nram + nram_pts_num * 6);
local_Z = (char *)((half *)data_nram + nram_pts_num * 8);
nram_pts_in_flag = (char *)((half *)data_nram + nram_pts_num * 10);
temp_buffer1 = (float *)((half *)data_nram + nram_pts_num * 11);
temp_buffer2 = (float *)((half *)data_nram + nram_pts_num * 13);
temp_buffer3 = (float *)((half *)data_nram + nram_pts_num * 15);
temp_buffer4 = (float *)((half *)data_nram + nram_pts_num * 17);
temp_buffer5 = (float *)((half *)data_nram + nram_pts_num * 19);
nram_voxel_offset = (float *)((half *)data_nram + nram_pts_num * 21);
nram_pts_idx_seq = (int *)((half *)data_nram + nram_pts_num * 23);
fp_local_X = (float *)((half *)local_X - nram_pts_num);
fp_local_Y = (float *)((half *)local_Y - nram_pts_num);
fp_local_Z = (float *)((half *)local_Z - nram_pts_num);
fp_nram_pts_in_flag = (float *)((half *)nram_pts_in_flag - nram_pts_num);
}
for (int i = 0; i < nram_pts_num; i++) {
nram_pts_idx_seq[i] = i;
}
int nram_pts_loop_times = pts_num / nram_pts_num;
int rem_nram_num = pts_num % nram_pts_num;
for (int roi_index = taskId; roi_index < boxes_num; roi_index += taskDim) {
const T *cur_roi = rois + roi_index * ROI_OFFSET;
T cx = cur_roi[0];
T cy = cur_roi[1];
T cz = cur_roi[2];
T dx = cur_roi[3];
T dy = cur_roi[4];
T dz = cur_roi[5];
T rz = cur_roi[6];
T dx_2 = dx / 2.0;
T dy_2 = dy / 2.0;
T dz_2 = dz / 2.0;
for (int loop_idx = 0; loop_idx <= nram_pts_loop_times; loop_idx++) {
int load_pts_num =
(loop_idx == nram_pts_loop_times) ? rem_nram_num : nram_pts_num;
if (load_pts_num == 0) {
break;
}
int pts_offset_cur_loop = nram_pts_num * loop_idx;
int compute_pts_num = (loop_idx == nram_pts_loop_times)
? PAD_UP(rem_nram_num, ALIGN_NUM)
: nram_pts_num;
// load pts
__memcpy((void *)X, (T *)pts + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
__memcpy((void *)Y, (T *)pts + pts_num + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
__memcpy((void *)Z, (T *)pts + pts_num * 2 + pts_offset_cur_loop,
load_pts_num * sizeof(T), GDRAM2NRAM);
// fabs(local_z)
__bang_sub_scalar((T *)local_Z, (T *)Z, (T)cz, compute_pts_num);
__bang_sub_scalar((T *)temp_buffer1, (T *)Z, (T)(cz + dz_2),
compute_pts_num);
__bang_active_abs((T *)temp_buffer1, (T *)temp_buffer1, compute_pts_num);
#if __BANG_ARCH__ >= 322
__bang_le_scalar((T *)nram_pts_in_flag, (T *)temp_buffer1, (T)(dz_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dz_2));
__bang_le((T *)nram_pts_in_flag, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
T cosa = std::cos(-rz);
T sina = std::sin(-rz);
__bang_sub_scalar((T *)temp_buffer3, (T *)X, (T)cx, compute_pts_num);
__bang_sub_scalar((T *)temp_buffer4, (T *)Y, (T)cy, compute_pts_num);
__bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)cosa,
compute_pts_num);
__bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)sina,
compute_pts_num);
// local_x
__bang_sub((T *)local_X, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
// fabs(local_x)
__bang_active_abs((T *)temp_buffer1, (T *)local_X, compute_pts_num);
// fabs(local_x) < dx/2 ? 1 : 0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dx_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dx_2));
__bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
__bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag,
(T *)temp_buffer1,
compute_pts_num); // flush res
__bang_mul_scalar((T *)temp_buffer1, (T *)temp_buffer3, (T)sina,
compute_pts_num);
__bang_mul_scalar((T *)temp_buffer2, (T *)temp_buffer4, (T)cosa,
compute_pts_num);
// local_y
__bang_add((T *)local_Y, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
// fabs(local_y)
__bang_active_abs((T *)temp_buffer1, (T *)local_Y, compute_pts_num);
// fabs(local_y) < dy/2 ? 1 : 0
#if __BANG_ARCH__ >= 322
__bang_lt_scalar((T *)temp_buffer1, (T *)temp_buffer1, (T)(dy_2),
compute_pts_num);
#else
__bang_write_value((void *)temp_buffer2, compute_pts_num, (T)(dy_2));
__bang_lt((T *)temp_buffer1, (T *)temp_buffer1, (T *)temp_buffer2,
compute_pts_num);
#endif
__bang_and((T *)nram_pts_in_flag, (T *)nram_pts_in_flag,
(T *)temp_buffer1,
compute_pts_num); // flush res
T x_res = dx / out_x;
T y_res = dy / out_y;
T z_res = dz / out_z;
__bang_add_scalar((T *)local_X, (T *)local_X, (T)(dx_2), compute_pts_num);
__bang_add_scalar((T *)local_Y, (T *)local_Y, (T)(dy_2), compute_pts_num);
// local_Z do not need to add dz/2.0
#if (__BANG_ARCH__ >= 322) && (__BANG_ARCH__ != 372)
__bang_div((T *)local_X, (T *)local_X, (T)x_res, compute_pts_num);
__bang_div((T *)local_Y, (T *)local_Y, (T)y_res, compute_pts_num);
__bang_div((T *)local_Z, (T *)local_Z, (T)z_res, compute_pts_num);
#else
__bang_mul_scalar((T *)local_X, (T *)local_X, (T)(1 / x_res),
compute_pts_num);
__bang_mul_scalar((T *)local_Y, (T *)local_Y, (T)(1 / y_res),
compute_pts_num);
__bang_mul_scalar((T *)local_Z, (T *)local_Z, (T)(1 / z_res),
compute_pts_num);
#endif
// float = float2int + int2float, half = half2int + int2float
if (sizeof(T) == sizeof(float)) {
#if __BANG_ARCH__ >= 322
__bang_float2int32_tz((int *)temp_buffer1, (float *)local_X,
compute_pts_num, 0);
__bang_float2int32_tz((int *)temp_buffer2, (float *)local_Y,
compute_pts_num, 0);
__bang_float2int32_tz((int *)temp_buffer3, (float *)local_Z,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3,
compute_pts_num, 0);
#else
convertFloat2Int((int *)temp_buffer1, (float *)temp_buffer2,
(float *)fp_local_X, (float *)temp_buffer3,
compute_pts_num);
convertFloat2Int((int *)temp_buffer2, (float *)temp_buffer3,
(float *)fp_local_Y, (float *)temp_buffer4,
compute_pts_num);
convertFloat2Int((int *)temp_buffer3, (float *)temp_buffer4,
(float *)fp_local_Z, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_X, (float *)temp_buffer4,
(int *)temp_buffer1, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_Y, (float *)temp_buffer4,
(int *)temp_buffer2, (float *)temp_buffer5,
compute_pts_num);
convertInt2Float((float *)fp_local_Z, (float *)temp_buffer4,
(int *)temp_buffer3, (float *)temp_buffer5,
compute_pts_num);
#endif
} else {
__bang_half2float((float *)temp_buffer4, (half *)nram_pts_in_flag,
compute_pts_num);
__bang_move((void *)fp_nram_pts_in_flag, (void *)temp_buffer4,
compute_pts_num * sizeof(float));
#if __BANG_ARCH__ >= 322
__bang_half2int32_tz((int *)temp_buffer1, (half *)local_X,
compute_pts_num, 0);
__bang_half2int32_tz((int *)temp_buffer2, (half *)local_Y,
compute_pts_num, 0);
__bang_half2int32_tz((int *)temp_buffer3, (half *)local_Z,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_X, (int *)temp_buffer1,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Y, (int *)temp_buffer2,
compute_pts_num, 0);
__bang_int322float_rn((float *)fp_local_Z, (int *)temp_buffer3,
compute_pts_num, 0);
#else
__bang_half2int16_tz((int16_t *)temp_buffer1, (half *)local_X,
compute_pts_num, 0);
__bang_half2int16_tz((int16_t *)temp_buffer2, (half *)local_Y,
compute_pts_num, 0);
__bang_half2int16_tz((int16_t *)temp_buffer3, (half *)local_Z,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_X, (int16_t *)temp_buffer1,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_Y, (int16_t *)temp_buffer2,
compute_pts_num, 0);
__bang_int162float((float *)fp_local_Z, (int16_t *)temp_buffer3,
compute_pts_num, 0);
#endif
}
// process index >= 0
__bang_write_value((float *)temp_buffer4, compute_pts_num, (float)0.0f);
__bang_maxequal((float *)fp_local_X, (float *)fp_local_X,
(float *)temp_buffer4, compute_pts_num);
__bang_maxequal((float *)fp_local_Y, (float *)fp_local_Y,
(float *)temp_buffer4, compute_pts_num);
__bang_maxequal((float *)fp_local_Z, (float *)fp_local_Z,
(float *)temp_buffer4, compute_pts_num);
// process index <= (out_x - 1)
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_x - 1));
__bang_minequal((float *)fp_local_X, (float *)fp_local_X,
(float *)temp_buffer5, compute_pts_num);
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_y - 1));
__bang_minequal((float *)fp_local_Y, (float *)fp_local_Y,
(float *)temp_buffer5, compute_pts_num);
__bang_write_value((float *)temp_buffer5, compute_pts_num,
(float)(out_z - 1));
__bang_minequal((float *)fp_local_Z, (float *)fp_local_Z,
(float *)temp_buffer5, compute_pts_num);
__bang_mul_scalar((float *)temp_buffer1, (float *)fp_local_X,
(float)(out_y * out_z), compute_pts_num);
__bang_mul_scalar((float *)temp_buffer2, (float *)fp_local_Y,
(float)out_z, compute_pts_num);
__bang_mul_scalar((float *)temp_buffer3, (float *)fp_local_Z, (float)1.0,
compute_pts_num);
__bang_add((float *)nram_voxel_offset, (float *)temp_buffer1,
(float *)temp_buffer2, compute_pts_num);
__bang_add((float *)nram_voxel_offset, (float *)nram_voxel_offset,
(float *)temp_buffer3, compute_pts_num);
__bang_mul_scalar((float *)nram_voxel_offset, (float *)nram_voxel_offset,
(float)max_pts_each_voxel, compute_pts_num);
if (compute_pts_num != load_pts_num) {
__memset_nram((float *)fp_nram_pts_in_flag + load_pts_num,
compute_pts_num - load_pts_num, (float)0.0);
}
__bang_collect((float *)temp_buffer4, (float *)nram_pts_idx_seq,
(float *)fp_nram_pts_in_flag, compute_pts_num);
int pts_num_in_cur_roi =
(int)__bang_count((float *)fp_nram_pts_in_flag, compute_pts_num);
int *pts_idx_cur_voxels =
(int *)pts_idx_of_voxels +
roi_index * out_x * out_y * out_z * max_pts_each_voxel;
for (int idx = 0; idx < pts_num_in_cur_roi; idx++) {
int cur_pts_idx = *((int *)temp_buffer4 + idx);
int offset = (int)(*((float *)nram_voxel_offset + cur_pts_idx));
int cnt = pts_idx_cur_voxels[offset];
if (cnt < max_pts_each_voxel - 1) {
pts_idx_cur_voxels[offset + cnt + 1] =
cur_pts_idx + loop_idx * nram_pts_num;
pts_idx_cur_voxels[offset]++;
}
}
}
}
}
// Forward pooling kernel: for every voxel of every 3-D RoI, gathers the
// features of the points assigned to that voxel (indices precomputed in
// pts_idx_of_voxels) and reduces them per channel.
//   pool_method == 0 -> max pool (also records the winning point index in
//                       argmax, or -1 for an empty/all -inf voxel)
//   pool_method == 1 -> average pool
// Voxels are distributed across cores by taskId/taskDim; channels are
// processed in chunks of nram_channels_limit to fit NRAM.
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawarePool3dForward(
    const int pool_method, const int boxes_num, const int pts_num,
    const int channels, const int max_pts_each_voxel, const int out_x,
    const int out_y, const int out_z, const T *pts_feature,
    const int *pts_idx_of_voxels, T *pooled_features, int *argmax) {
  // params (T)pts_feature: (channels, pts_num)
  // params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
  // max_pts_each_voxel) params (int)argmax: (boxes_num, out_x, out_y, out_z,
  // channels) params (T)pooled_features: (boxes_num, out_x, out_y, out_z,
  // channels)
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }
  // Channel chunk size that fits the NRAM layout sketched below, rounded
  // down to the NFU alignment unit. The 128 bytes are reserved for
  // one_pooled_feature (the __bang_argmax/__bang_max result slot).
  int align_num = NFU_ALIGN_SIZE / sizeof(T);
  int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num);
  int nram_channels_limit =
      PAD_DOWN((MAX_NRAM_SIZE - 128 -
                align_max_pts_each_voxel * (sizeof(int) + sizeof(T))) /
                   ((align_max_pts_each_voxel + 1) * sizeof(T) + sizeof(int)),
               align_num);
  int *nram_pts_idx_cur_voxel = (int *)data_nram;
  // nram_pts_idx_cur_voxel [align_max_pts_each_voxel]
  T *nram_max_pts_feature_tmp =
      (T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel);
  // nram_max_pts_feature_tmp [align_max_pts_each_voxel]
  T *nram_pts_feature_in_voxel =
      ((T *)nram_max_pts_feature_tmp + align_max_pts_each_voxel);
  // nram_pts_feature_in_voxel [nram_channels_limit, align_max_pts_each_voxel]
  T *nram_pooled_features_cur_voxel =
      ((T *)nram_pts_feature_in_voxel +
       nram_channels_limit * align_max_pts_each_voxel);
  // nram_pooled_features_cur_voxel [nram_channels_limit]
  int *nram_argmax_cur_voxel =
      (int *)((T *)nram_pooled_features_cur_voxel + nram_channels_limit);
  // nram_argmax_cur_voxel [nram_channels_limit]
  char *one_pooled_feature =
      (char *)((int *)nram_argmax_cur_voxel + nram_channels_limit);
  // one_pooled_feature [128]
  int channels_loop_times = channels / nram_channels_limit;
  int rem_channels = channels % nram_channels_limit;
  // Each core owns a strided subset of all voxels.
  for (int voxel_index = taskId;
       voxel_index < boxes_num * out_x * out_y * out_z;
       voxel_index += taskDim) {
    int *pts_idx_cur_voxels =
        (int *)pts_idx_of_voxels + voxel_index * max_pts_each_voxel;
    __memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxels,
             max_pts_each_voxel * sizeof(int), GDRAM2NRAM);
    // Element [0] stores the point count; point indices start at [1].
    int pts_num_cur_voxel = nram_pts_idx_cur_voxel[0];
    if (pts_num_cur_voxel == 0) {
      continue;
    }
    for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
         channels_loop_idx++) {
      int actual_channels_num = (channels_loop_idx == channels_loop_times)
                                    ? rem_channels
                                    : nram_channels_limit;
      if (actual_channels_num == 0) {
        break;
      }
      int channels_offset = nram_channels_limit * channels_loop_idx;
#if ((__BANG_ARCH__ >= 200) && (__BANG_ARCH__ < 300))
      // mlu200 series: pre-fill with -inf so the aligned tail (slots past
      // pts_num_cur_voxel) never wins the max reduction below.
      int compute_channels_num = (channels_loop_idx == channels_loop_times)
                                     ? PAD_UP(rem_channels, align_num)
                                     : nram_channels_limit;
      if (pool_method == 0) {
        __bang_write_value((void *)nram_pts_feature_in_voxel,
                           compute_channels_num * align_max_pts_each_voxel,
                           (T)-INFINITY);
      }
#endif
      T *pts_feature_cur_loop = (T *)pts_feature + channels_offset * pts_num;
      // Gather: for each point in the voxel, copy its feature value across
      // all channels of this chunk in one strided memcpy (src stride =
      // pts_num elements, dst stride = one padded voxel row).
      for (int idx = 0; idx < pts_num_cur_voxel; idx++) {
        __memcpy((T *)nram_pts_feature_in_voxel + idx,
                 (T *)pts_feature_cur_loop + nram_pts_idx_cur_voxel[idx + 1],
                 sizeof(T), GDRAM2NRAM, align_max_pts_each_voxel * sizeof(T),
                 pts_num * sizeof(T), actual_channels_num - 1);
      }
      for (int channel_idx = 0; channel_idx < actual_channels_num;
           channel_idx++) {
        if (pool_method == 0) {
#if __BANG_ARCH__ >= 322
          // __bang_argmax packs {max value, max index} into the result slot.
          __bang_argmax((T *)one_pooled_feature,
                        (T *)nram_pts_feature_in_voxel +
                            channel_idx * align_max_pts_each_voxel,
                        pts_num_cur_voxel);
          T max_val = ((T *)one_pooled_feature)[0];
          int max_idx = (int)(*(uint32_t *)((T *)one_pooled_feature + 1));
          nram_pooled_features_cur_voxel[channel_idx] =
              (max_val == -INFINITY) ? 0 : max_val;
          nram_argmax_cur_voxel[channel_idx] =
              (max_val == -INFINITY) ? -1 : nram_pts_idx_cur_voxel[max_idx + 1];
#else
          // __bang_max need align num on mlu200 series
          if (sizeof(T) == sizeof(float)) {
            // No argmax intrinsic here: find the max, then locate its first
            // occurrence with an equality mask + findfirst1.
            __bang_max((float *)one_pooled_feature,
                       (float *)nram_pts_feature_in_voxel +
                           channel_idx * align_max_pts_each_voxel,
                       align_max_pts_each_voxel);
            float max_val = ((float *)one_pooled_feature)[0];
            __bang_write_value((void *)nram_max_pts_feature_tmp,
                               align_max_pts_each_voxel, (float)max_val);
            __bang_eq((float *)nram_max_pts_feature_tmp,
                      (float *)nram_pts_feature_in_voxel +
                          channel_idx * align_max_pts_each_voxel,
                      (float *)nram_max_pts_feature_tmp,
                      align_max_pts_each_voxel);
            int max_idx = (int)__bang_findfirst1(
                (float *)nram_max_pts_feature_tmp, align_max_pts_each_voxel);
            nram_pooled_features_cur_voxel[channel_idx] =
                (max_val == -INFINITY) ? 0 : max_val;
            nram_argmax_cur_voxel[channel_idx] =
                (max_val == -INFINITY) ? -1
                                       : nram_pts_idx_cur_voxel[max_idx + 1];
          } else {
            // half path: scalar scan (converting each element to float).
            int max_idx = -1;
            float max_val = -INFINITY;
            for (int k = 0; k < pts_num_cur_voxel; k++) {
              float pts_feature_cur_channel = __half2float_rd(
                  *((half *)nram_pts_feature_in_voxel +
                    channel_idx * align_max_pts_each_voxel + k));
              if (pts_feature_cur_channel > max_val) {
                max_val = pts_feature_cur_channel;
                max_idx = k;
              }
            }
            nram_pooled_features_cur_voxel[channel_idx] =
                (max_idx == -1) ? 0 : max_val;
            nram_argmax_cur_voxel[channel_idx] =
                (max_idx == -1) ? -1 : nram_pts_idx_cur_voxel[max_idx + 1];
          }
#endif
        } else if (pool_method == 1) {
          // Average pool: accumulate in float for precision, divide by the
          // actual (unpadded) point count.
          float sum_val_cur_channel = 0;
          for (int k = 0; k < pts_num_cur_voxel; k++) {
            sum_val_cur_channel += static_cast<float>(
                ((T *)nram_pts_feature_in_voxel)[channel_idx *
                                                     align_max_pts_each_voxel +
                                                 k]);
          }
          nram_pooled_features_cur_voxel[channel_idx] =
              (T)(sum_val_cur_channel / pts_num_cur_voxel);
        }
      }
      // store
      __memcpy((T *)pooled_features + voxel_index * channels + channels_offset,
               (void *)nram_pooled_features_cur_voxel,
               actual_channels_num * sizeof(T), NRAM2GDRAM);
      if (pool_method == 0) {
        __memcpy((int *)argmax + voxel_index * channels + channels_offset,
                 (void *)nram_argmax_cur_voxel,
                 actual_channels_num * sizeof(int), NRAM2GDRAM);
      }
    }
  }
}
// Host-side launcher for the pts-idx-of-voxels kernel: picks the template
// instantiation that matches the runtime tensor dtype. Any other dtype is a
// no-op, mirroring the unsupported-type behavior of the dispatch table.
void KernelPtsIdxOfVoxels(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, const cnrtDataType_t d_type,
                          const int pool_method, const int boxes_num,
                          const int pts_num, const int max_pts_each_voxel,
                          const int out_x, const int out_y, const int out_z,
                          const void *rois, const void *pts,
                          int *pts_idx_of_voxels) {
  if (d_type == CNRT_FLOAT32) {
    MLUUnion1KernelPtsIdxOfVoxels<float><<<k_dim, k_type, queue>>>(
        pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y,
        out_z, (float *)rois, (float *)pts, (int *)pts_idx_of_voxels);
  } else if (d_type == CNRT_FLOAT16) {
    MLUUnion1KernelPtsIdxOfVoxels<half><<<k_dim, k_type, queue>>>(
        pool_method, boxes_num, pts_num, max_pts_each_voxel, out_x, out_y,
        out_z, (half *)rois, (half *)pts, (int *)pts_idx_of_voxels);
  }
}
// Host-side launcher for the forward pooling kernel: selects the float or
// half instantiation from the runtime dtype and launches it on the given
// queue. Unsupported dtypes fall through without launching anything.
void KernelRoiawarePool3dForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
    const int pts_num, const int channels, const int max_pts_each_voxel,
    const int out_x, const int out_y, const int out_z, const void *pts_feature,
    const int *pts_idx_of_voxels, void *pooled_features, int *argmax) {
  if (d_type == CNRT_FLOAT32) {
    MLUUnion1KernelRoiawarePool3dForward<float><<<k_dim, k_type, queue>>>(
        pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,
        out_y, out_z, (float *)pts_feature, (int *)pts_idx_of_voxels,
        (float *)pooled_features, (int *)argmax);
  } else if (d_type == CNRT_FLOAT16) {
    MLUUnion1KernelRoiawarePool3dForward<half><<<k_dim, k_type, queue>>>(
        pool_method, boxes_num, pts_num, channels, max_pts_each_voxel, out_x,
        out_y, out_z, (half *)pts_feature, (int *)pts_idx_of_voxels,
        (half *)pooled_features, (int *)argmax);
  }
}
// Backward kernel for max pooling: for each voxel/channel, routes the
// upstream gradient to the single input point recorded in argmax, using an
// atomic add because several voxels may select the same point.
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawareMaxPool3dBackward(
    const int boxes_num, const int out_x, const int out_y, const int out_z,
    const int channels, const int *argmax, const T *grad_out, T *grad_in) {
  // params (int)argmax: (boxes_num, out_x, out_y, out_z, channels)
  // params (T)grad_out: (boxes_num, out_x, out_y, out_z, channels)
  // params (T)grad_in: (pts_num, channels)
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }
  // NRAM holds one int (argmax) + one T (grad_out) per channel, plus a
  // single-T scratch slot used by __bang_atomic_add.
  int nram_channels_limit =
      (MAX_NRAM_SIZE - sizeof(T) * 1) / (sizeof(T) + sizeof(int));
  int *nram_argmax_cur_loop = (int *)data_nram;
  // nram_argmax_cur_loop [nram_channels_limit]
  T *nram_grad_out_cur_loop =
      (T *)((int *)nram_argmax_cur_loop + nram_channels_limit);
  // nram_grad_out_cur_loop [nram_channels_limit]
  T *nram_grad_in_cur_channel =
      (T *)nram_grad_out_cur_loop + nram_channels_limit;
  // nram_grad_in_cur_channel [1]
  int channels_loop_times = channels / nram_channels_limit;
  int rem_channels = channels % nram_channels_limit;
  int voxels_num = boxes_num * out_x * out_y * out_z;
  // Voxels are split across cores; channels are chunked to fit NRAM.
  for (int voxel_index = taskId; voxel_index < voxels_num;
       voxel_index += taskDim) {
    const int *argmax_cur_voxel = argmax + voxel_index * channels;
    const T *grad_out_cur_voxel = grad_out + voxel_index * channels;
    for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
         channels_loop_idx++) {
      int actual_channels_num = (channels_loop_idx == channels_loop_times)
                                    ? rem_channels
                                    : nram_channels_limit;
      if (actual_channels_num == 0) {
        break;
      }
      const int *argmax_cur_loop =
          argmax_cur_voxel + nram_channels_limit * channels_loop_idx;
      const T *grad_out_cur_loop =
          grad_out_cur_voxel + nram_channels_limit * channels_loop_idx;
      __memcpy((void *)nram_argmax_cur_loop, (void *)argmax_cur_loop,
               actual_channels_num * sizeof(int), GDRAM2NRAM);
      __memcpy((void *)nram_grad_out_cur_loop, (void *)grad_out_cur_loop,
               actual_channels_num * sizeof(T), GDRAM2NRAM);
      for (int channel_idx = 0; channel_idx < actual_channels_num;
           channel_idx++) {
        int *nram_argmax_cur_channel = nram_argmax_cur_loop + channel_idx;
        T *nram_grad_out_cur_channel = nram_grad_out_cur_loop + channel_idx;
        // -1 marks an empty voxel (no winning point) -> nothing to scatter.
        if (nram_argmax_cur_channel[0] == -1) {
          continue;
        }
        T *grad_in_cur_channel =
            grad_in + nram_argmax_cur_channel[0] * channels +
            nram_channels_limit * channels_loop_idx + channel_idx;
        // Atomic: multiple voxels/cores may accumulate into the same point.
        __bang_atomic_add((T *)nram_grad_in_cur_channel,
                          (T *)grad_in_cur_channel,
                          (T *)(nram_grad_out_cur_channel), 1);
      }
    }
  }
}
// Backward kernel for average pooling: each voxel's upstream gradient is
// scaled by 1/point_count and atomically accumulated into the grad_in row of
// every point listed for that voxel in pts_idx_of_voxels.
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiawareAvgPool3dBackward(
    const int boxes_num, const int out_x, const int out_y, const int out_z,
    const int channels, const int max_pts_each_voxel,
    const int *pts_idx_of_voxels, const T *grad_out, T *grad_in) {
  // params (int)pts_idx_of_voxels: (boxes_num, out_x, out_y, out_z,
  // max_pts_each_voxel) params (T)grad_out: (boxes_num, out_x, out_y, out_z,
  // channels) params (T)grad_in: (pts_num, channels)
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }
  // NRAM layout: the voxel's index list, then two channel-chunk buffers
  // (one for the scaled gradient, one as load/convert scratch).
  int align_num = NFU_ALIGN_SIZE / sizeof(T);
  int align_max_pts_each_voxel = PAD_UP(max_pts_each_voxel, align_num);
  int nram_channels_limit = PAD_DOWN(
      (MAX_NRAM_SIZE - align_max_pts_each_voxel * sizeof(int)) / 2 / sizeof(T),
      align_num);
  int *nram_pts_idx_cur_voxel = (int *)data_nram;
  // nram_pts_idx_cur_voxel [align_max_pts_each_voxel]
  T *nram_grad_out_cur_loop =
      (T *)((int *)nram_pts_idx_cur_voxel + align_max_pts_each_voxel);
  // nram_grad_out_cur_loop [nram_channels_limit]
  T *nram_grad_in_cur_loop = (T *)nram_grad_out_cur_loop + nram_channels_limit;
  // nram_grad_in_cur_loop [nram_channels_limit]
  int channels_loop_times = channels / nram_channels_limit;
  int rem_channels = channels % nram_channels_limit;
  int voxels_num = boxes_num * out_x * out_y * out_z;
  for (int voxel_index = taskId; voxel_index < voxels_num;
       voxel_index += taskDim) {
    const T *grad_out_cur_voxel = grad_out + voxel_index * channels;
    const int *pts_idx_cur_voxel =
        pts_idx_of_voxels + voxel_index * max_pts_each_voxel;
    __memcpy((void *)nram_pts_idx_cur_voxel, (void *)pts_idx_cur_voxel,
             max_pts_each_voxel * sizeof(int), GDRAM2NRAM);
    // Element [0] stores the point count; point indices start at [1].
    int total_pts_of_voxel = nram_pts_idx_cur_voxel[0];
    if (total_pts_of_voxel <= 0) {
      continue;
    }
    // Average-pool gradient per point = grad_out / point_count.
    float cur_grad = 1.0 / ((float)total_pts_of_voxel);
    for (int channels_loop_idx = 0; channels_loop_idx <= channels_loop_times;
         channels_loop_idx++) {
      int actual_channels_num = (channels_loop_idx == channels_loop_times)
                                    ? rem_channels
                                    : nram_channels_limit;
      if (actual_channels_num == 0) {
        break;
      }
      const T *grad_out_cur_loop =
          grad_out_cur_voxel + nram_channels_limit * channels_loop_idx;
      __memcpy((void *)nram_grad_in_cur_loop, (void *)grad_out_cur_loop,
               actual_channels_num * sizeof(T), GDRAM2NRAM);
      int align_actual_channels_num = PAD_UP(actual_channels_num, align_num);
      if (sizeof(T) == sizeof(half)) {
        // half path: widen to float, scale, narrow back — all in
        // nram_grad_out_cur_loop; the result feeds the atomic add below.
        __bang_half2float((float *)nram_grad_out_cur_loop,
                          (half *)nram_grad_in_cur_loop,
                          align_actual_channels_num);
        __bang_mul_scalar((float *)nram_grad_out_cur_loop,
                          (float *)nram_grad_out_cur_loop, (float)cur_grad,
                          align_actual_channels_num);
        convertFloat2half((half *)nram_grad_out_cur_loop,
                          (float *)nram_grad_out_cur_loop,
                          align_actual_channels_num);
      } else {
        // float path: scale in place from the load buffer.
        __bang_mul_scalar((float *)nram_grad_out_cur_loop,
                          (float *)nram_grad_in_cur_loop, (float)cur_grad,
                          align_actual_channels_num);
      }
      // Scatter the scaled chunk to every point of the voxel; atomic since
      // a point can belong to several voxels/RoIs handled by other cores.
      for (int k = 1; k <= total_pts_of_voxel; k++) {
        T *grad_in_cur_loop = grad_in + nram_pts_idx_cur_voxel[k] * channels +
                              nram_channels_limit * channels_loop_idx;
        __bang_atomic_add((T *)nram_grad_in_cur_loop, (T *)grad_in_cur_loop,
                          (T *)nram_grad_out_cur_loop, actual_channels_num);
      }
    }
  }
}
// Host-side launcher for the backward pass. Routes on pool_method first
// (0 -> max-pool backward via argmax; otherwise avg-pool backward via the
// per-voxel point lists), then on dtype. Unsupported dtypes launch nothing.
void KernelRoiawarePool3dBackward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const cnrtDataType_t d_type, const int pool_method, const int boxes_num,
    const int out_x, const int out_y, const int out_z, const int channels,
    const int max_pts_each_voxel, const int *pts_idx_of_voxels,
    const int *argmax, const void *grad_out, void *grad_in) {
  const bool is_max_pool = (pool_method == 0);
  if (d_type == CNRT_FLOAT32) {
    if (is_max_pool) {
      MLUUnion1KernelRoiawareMaxPool3dBackward<float>
          <<<k_dim, k_type, queue>>>(boxes_num, out_x, out_y, out_z, channels,
                                     (int *)argmax, (float *)grad_out,
                                     (float *)grad_in);
    } else {
      MLUUnion1KernelRoiawareAvgPool3dBackward<float>
          <<<k_dim, k_type, queue>>>(
              boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
              (int *)pts_idx_of_voxels, (float *)grad_out, (float *)grad_in);
    }
  } else if (d_type == CNRT_FLOAT16) {
    if (is_max_pool) {
      MLUUnion1KernelRoiawareMaxPool3dBackward<half>
          <<<k_dim, k_type, queue>>>(boxes_num, out_x, out_y, out_z, channels,
                                     (int *)argmax, (half *)grad_out,
                                     (half *)grad_in);
    } else {
      MLUUnion1KernelRoiawareAvgPool3dBackward<half>
          <<<k_dim, k_type, queue>>>(
              boxes_num, out_x, out_y, out_z, channels, max_pts_each_voxel,
              (int *)pts_idx_of_voxels, (half *)grad_out, (half *)grad_in);
    }
  }
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/*************************************************************************
*
* NRAM partition:
* | boxes3d | ping points + pong points | aux_a ~ aux_f |
* | 7 * sizeof(T) | 6 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
*************************************************************************/
#define TWELVE_SPLIT 12
__nram__ char nram_buffer[MAX_NRAM_SIZE];
// Computes, for every point of the current tile, whether it lies inside the
// given rotated 3-D box. Writes a 0/1 mask to pts_assign (1 = inside).
// The test rotates each point by -rz into the box frame and compares the
// local |x|, |y|, |z| against the half-extents. auxiliary_a..f are caller-
// provided NRAM scratch buffers of deal_num elements; a..c hold the shifted
// coordinates, d..f hold per-axis comparison results.
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
                                     const size_t deal_num,
                                     T *x,
                                     T *y,
                                     T *z,
                                     T *auxiliary_a,
                                     T *auxiliary_b,
                                     T *auxiliary_c,
                                     T *auxiliary_d,
                                     T *auxiliary_e,
                                     T *auxiliary_f,
                                     T *pts_assign) {
  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate
  T cx = boxes3d[0];
  T cy = boxes3d[1];
  T cz = boxes3d[2];
  T dx = boxes3d[3];
  T dy = boxes3d[4];
  T dz = boxes3d[5];
  T rz = boxes3d[6];
  // shift to the center since cz in box3d is the bottom center
  cz += 0.5 * dz;
  T cosa = (T)std::cos(-rz);
  T sina = (T)std::sin(-rz);
  // x - cx
  __bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
  // y - cy
  __bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
  // z - cz
  __bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
  // |z - cz|
  __bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
  // |z - cz| > dz / 2.0
  // (old-arch fallback swaps operands: scalar < vector == vector > scalar)
#if __BANG_ARCH__ >= 322
  __bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
  __bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
  __bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
  // !(|z - cz| > dz / 2.0)
  __bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
  // (x - cx) * cos(-rz)
  __bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
  // (y - cy) * sin(-rz)
  __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
  // local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
  __bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
  // |local_x|
  __bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
  // |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
  __bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
  __bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
  __bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
  // (x - cx) * sin(-rz)
  __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
  // (y - cy) * cos(-rz)
  __bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
  // local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
  __bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
  // |local_y|
  __bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
  // |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
  __bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
  __bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
  __bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
  // pts_assign = |x - cx| < dx / 2.0 && |y - cy| < dy / 2.0 && |z - cz| <= dz / 2.0
  // (AND of 0/1 masks realized as element-wise multiply)
  __bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
  __bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
// Processes one full tile of span_num_deal points for one box: masks the
// points inside the box (checkPointsInBox3d), compacts the selected x/y/z
// coordinates and per-channel features, and appends up to the remaining
// sampled_pts_num slots of this box's row in pooled_features_gdram
// (layout: (box, sample, 3 + feature_in_len)). *cnt tracks how many samples
// have been written for this box and is advanced by the selection count.
// NOTE(review): pooled_empty_flag_gdram is unused here — the last-block
// variant writes the empty flag.
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
                                             int *cnt,
                                             char *points_x,
                                             char *points_y,
                                             char *points_z,
                                             const char *point_features,
                                             char *auxiliary_a,
                                             char *auxiliary_b,
                                             char *auxiliary_c,
                                             char *auxiliary_d,
                                             char *auxiliary_e,
                                             char *auxiliary_f,
                                             const int box_idx,
                                             const int pts_num,
                                             const int feature_in_len,
                                             const int sampled_pts_num,
                                             const size_t span_num_deal,
                                             char *pooled_features_gdram,
                                             char *pooled_empty_flag_gdram) {
  // The in-box mask is built in auxiliary_a (aliased as pts_assign).
  char *pts_assign = auxiliary_a;
  // Box already has enough samples: nothing to do for this tile.
  if (*cnt >= sampled_pts_num) {
    return;
  }
  checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
                     (T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
                     (T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
  // __bang_select returns selected elements vector and the number of selected elements
  __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
  uint32_t select_num = *((uint32_t *)auxiliary_b);
  if (select_num == 0) {
    return;
  }
  // Clamp to the free slots left for this box; segnum is the memcpy segment
  // count minus one (the __memcpy repeat convention below).
  int sampled_pts_num_rem = sampled_pts_num - *cnt;
  int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
  // copy x to pooled_features_gdram
  // The result of __bang_select is composed of three parts:
  // The first 4-byte is the number of selected element, whose data type is unsigned int.
  // The next 124-byte is zero. The rest bytes are the selected elements.
  int select_num_size = 128;
  // Strided store: one T per sample row (row pitch = (3+feature_in_len)*T).
  __memcpy(
      pooled_features_gdram + (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
      (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
      (3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
  // copy y to pooled_features_gdram
  __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
  __memcpy(pooled_features_gdram +
               (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
               1 * sizeof(T),
           (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
           segnum);
  // copy z to pooled_features_gdram
  __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
  __memcpy(pooled_features_gdram +
               (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
               2 * sizeof(T),
           (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
           segnum);
  // copy features to pooled_features_gdram
  // One channel at a time: load the channel's tile, compact it with the same
  // mask, and scatter into column (3 + c_idx) of each sample row.
  for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
    __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
             GDRAM2NRAM);
    __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
    __memcpy(pooled_features_gdram +
                 (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
                 (3 + c_idx) * sizeof(T),
             auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
             segnum);
  }
  *cnt += select_num;
}
// Same tile processing as computeStoreRoipointPool3d, but for the LAST tile
// of a box, so it also finalizes the box: writes pooled_empty_flag_gdram
// (1 = no point ever selected, and the box's output row is zero-filled;
// 0 otherwise) and, when fewer than sampled_pts_num points were collected,
// pads the remaining sample slots by duplicating the collected ones.
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
                                                      int *cnt,
                                                      char *points_x,
                                                      char *points_y,
                                                      char *points_z,
                                                      const char *point_features,
                                                      char *auxiliary_a,
                                                      char *auxiliary_b,
                                                      char *auxiliary_c,
                                                      char *auxiliary_d,
                                                      char *auxiliary_e,
                                                      char *auxiliary_f,
                                                      const int box_idx,
                                                      const int pts_num,
                                                      const int feature_in_len,
                                                      const int sampled_pts_num,
                                                      const size_t span_num_deal,
                                                      const size_t auxiliary_num_deal,
                                                      char *pooled_features_gdram,
                                                      char *pooled_empty_flag_gdram) {
  // The in-box mask is built in auxiliary_a (aliased as pts_assign).
  char *pts_assign = auxiliary_a;
  if (*cnt >= sampled_pts_num) {
    // pooled_empty_flag_gdram set 0
    *((int *)auxiliary_a) = 0;
    __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
    return;
  }
  checkPointsInBox3d((T *)boxes3d, span_num_deal, (T *)points_x, (T *)points_y, (T *)points_z,
                     (T *)auxiliary_a, (T *)auxiliary_b, (T *)auxiliary_c, (T *)auxiliary_d,
                     (T *)auxiliary_e, (T *)auxiliary_f, (T *)pts_assign);
  // __bang_select returns selected elements vector and the number of selected elements
  __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
  uint32_t select_num = *((uint32_t *)auxiliary_b);
  if (*cnt + select_num == 0) {
    // No point of any tile fell inside this box: flag it empty and zero its
    // whole output row.
    // pooled_empty_flag_gdram set 1
    *((int *)auxiliary_a) = 1;
    __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
    // pooled_features_gdram set 0
    // Zero in chunks of the six auxiliary buffers (auxiliary_num_deal * 6 T).
    int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
    int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
    // use auxiliary_a to auxiliary_f
    __bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
    if (repeat > 0) {
      __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
               auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
               auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
    }
    if (rem > 0) {
      __memcpy(pooled_features_gdram +
                   box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
                   repeat * auxiliary_num_deal * 6 * sizeof(T),
               auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
    }
    return;
  }
  if (select_num > 0) {
    // Append this tile's selected points, exactly as in the non-last-block
    // variant: clamp to the free slots, then strided-store x/y/z/features.
    int sampled_pts_num_rem = sampled_pts_num - *cnt;
    int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
    // copy x to pooled_features_gdram
    // The result of __bang_select is composed of three parts:
    // The first 4-byte is the number of selected element, whose data type is unsigned int.
    // The next 124-byte is zero. The rest bytes are the selected elements.
    int select_num_size = 128;
    __memcpy(pooled_features_gdram +
                 (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
             (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
             (3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
    // copy y to pooled_features_gdram
    __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
    __memcpy(pooled_features_gdram +
                 (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
                 1 * sizeof(T),
             (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
             segnum);
    // copy z to pooled_features_gdram
    __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
    __memcpy(pooled_features_gdram +
                 (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
                 2 * sizeof(T),
             (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
             segnum);
    // copy features to pooled_features_gdram
    for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
      __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
               GDRAM2NRAM);
      __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
      __memcpy(pooled_features_gdram +
                   (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T) +
                   (3 + c_idx) * sizeof(T),
               auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
               segnum);
    }
  }
  // pooled_empty_flag_gdram set 0
  *((int *)auxiliary_a) = 0;
  __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
  *cnt += select_num;
  if (*cnt < sampled_pts_num) {
    // duplicate same points for sampling
    // GDRAM-to-GDRAM copies replicate the first *cnt rows until the box's
    // sampled_pts_num rows are filled.
    int repeat = sampled_pts_num / (*cnt) - 1;
    int rem = sampled_pts_num % (*cnt);
    if (repeat > 0) {
      __memcpy(pooled_features_gdram +
                   (box_idx * sampled_pts_num + *cnt) * (3 + feature_in_len) * sizeof(T),
               pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
               (*cnt) * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
               (*cnt) * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
    }
    if (rem > 0) {
      __memcpy(
          pooled_features_gdram +
              (box_idx * sampled_pts_num + (repeat + 1) * (*cnt)) * (3 + feature_in_len) *
                  sizeof(T),
          pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
          rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
    }
  }
}
/**
 * RoIPointPool3d forward kernel for the large-boxes_num case.
 *
 * The batch_size * boxes_num boxes are distributed over the Union1 task
 * cores; each core walks its slice box by box and streams the point cloud
 * through a ping-pong NRAM pipeline (load span i+1 while computing span i).
 *
 * Layouts (as used by the address arithmetic below):
 *   points_xyz_gdram      : [3, B, N]  (x, y, z planes are contiguous)
 *   point_features_gdram  : [B, feature_in_len, N]
 *   boxes3d_gdram         : [B, boxes_num, 7]
 *   pooled_features_gdram : [B, boxes_num, sampled_pts_num, 3 + feature_in_len]
 *   pooled_empty_flag_gdram : [B, boxes_num] (int)
 *
 * Fix vs. previous revision: the float/half branches that padded the
 * remainder span were byte-identical, so the dead `sizeof(T)` split has been
 * collapsed into a single sequence (no behavior change).
 */
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward(
    const int batch_size,
    const int pts_num,
    const int boxes_num,
    const int feature_in_len,
    const int sampled_pts_num,
    const char *points_xyz_gdram,
    const char *point_features_gdram,
    const char *boxes3d_gdram,
    char *pooled_features_gdram,
    char *pooled_empty_flag_gdram) {
  // The MPU core (coreId 0x80) takes no part in the computation.
  if (coreId == 0x80) {
    return;
  }
  // Distribute boxes evenly; the first boxes_rem cores take one extra box.
  size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
  size_t boxes_rem = (batch_size * boxes_num) % taskDim;
  // calc batch_start, batch_end, first_batch_box_start, last batch_box_end for each core
  int32_t batch_start = taskId < (boxes_rem + 1) ?
                        (taskId * (boxes_per_core + 1)) / boxes_num :
                        (taskId * boxes_per_core + boxes_rem) / boxes_num;
  int32_t batch_end = taskId < boxes_rem ?
                      ((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
                      ((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
  size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
                                 (taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
                                 taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
  size_t last_batch_box_end = taskId < boxes_rem ?
                              (taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
                              ((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
  // points_xyz : [3, B, N]
  const char *points_x_gdram = points_xyz_gdram;
  const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
  const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
  size_t boxes3d_size = PAD_UP(7, NFU_ALIGN_SIZE) * sizeof(T);
  // NRAM is split into 12 equal spans: 2 (ping/pong) * 3 (x/y/z) + 6 scratch.
  size_t span_num_deal = PAD_DOWN(MAX_NRAM_SIZE / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
  size_t align_num = NFU_ALIGN_SIZE;
  int32_t repeat = pts_num / span_num_deal;
  size_t rem = pts_num % span_num_deal;
  size_t align_rem = CEIL_ALIGN(rem, align_num);
  char *boxes3d = nram_buffer;
  char *ping_points_x = nram_buffer + boxes3d_size;
  char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
  char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
  size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
  char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
  char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
  char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
  char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
  char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
  char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
  size_t span_load_input1_size = span_num_deal * sizeof(T);
  size_t span_load_input2_size = span_num_deal * sizeof(T);
  size_t span_load_input3_size = span_num_deal * sizeof(T);
  size_t span_load_input4_size = span_num_deal * sizeof(T);
  int cnt = 0;
  for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
    const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
    const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
    const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
    const char *point_features_start =
        point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
    char *pooled_features_start =
        pooled_features_gdram +
        (bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
    char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
    size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
    size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
    for (int box_idx = box_start; box_idx < box_end; box_idx++) {
      // Load the 7 parameters of the current box; the sync below (or the one
      // before the remainder compute) orders it before first use.
      __memcpy_async(boxes3d,
                     boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T) + box_idx * 7 * sizeof(T),
                     7 * sizeof(T), GDRAM2NRAM);
      cnt = 0;
      if (repeat > 0) {
        __memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
        __memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
        __memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
        __asm__ volatile("sync;");
      }
      // Software pipeline: prefetch span i+1 while computing on span i.
      for (int i = 0; i < repeat - 1; i++) {
        __memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
                       points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
                       GDRAM2NRAM);
        __memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
                       points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
                       GDRAM2NRAM);
        __memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
                       points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
                       GDRAM2NRAM);
        computeStoreRoipointPool3d<T>(
            boxes3d, &cnt, ping_points_x + (i % 2) * ping_pong_gap,
            ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
            point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
            auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
        __asm__ volatile("sync;");
      }
      if (rem > 0) {
        // Pad the aligned tail of the remainder span with NAN so the padded
        // lanes never satisfy the inside-box comparisons in
        // checkPointsInBox3d. (The former float/half split here had
        // identical branches and was removed.)
        __bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
                                 PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                           NFU_ALIGN_SIZE, (T)NAN);
        __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
                                 PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                           NFU_ALIGN_SIZE, (T)NAN);
        __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
                                 PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                           NFU_ALIGN_SIZE, (T)NAN);
        __memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
                       points_x_start + repeat * span_load_input1_size, rem * sizeof(T),
                       GDRAM2NRAM);
        __memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
                       points_y_start + repeat * span_load_input2_size, rem * sizeof(T),
                       GDRAM2NRAM);
        __memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
                       points_z_start + repeat * span_load_input3_size, rem * sizeof(T),
                       GDRAM2NRAM);
      }
      // Last full span: plain compute if a remainder follows, otherwise the
      // finalizing variant (writes empty flag / duplicates samples).
      if (repeat > 0 && rem > 0) {
        computeStoreRoipointPool3d<T>(
            boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
            point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
            auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
      } else if (repeat > 0 && rem == 0) {
        computeStoreLastBlockRoipointPool3d<T>(
            boxes3d, &cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
            point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
            auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
            pooled_empty_flag_start);
      }
      if (rem > 0) {
        __asm__ volatile("sync;");
        computeStoreLastBlockRoipointPool3d<T>(
            boxes3d, &cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
            ping_points_y + (repeat % 2) * ping_pong_gap,
            ping_points_z + (repeat % 2) * ping_pong_gap,
            point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
            auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
            pooled_empty_flag_start);
      }
    }
  }
}
// Explicit instantiations for the two data types dispatched by the host
// wrapper KernelRoiPointPool3dLargeBoxesNumForward (CNRT_FLOAT32 / CNRT_FLOAT16).
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float>(
    const int batch_size,
    const int pts_num,
    const int boxes_num,
    const int feature_in_len,
    const int sampled_pts_num,
    const char *points_xyz_gdram,
    const char *point_features_gdram,
    const char *boxes3d_gdram,
    char *pooled_features_gdram,
    char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half>(
    const int batch_size,
    const int pts_num,
    const int boxes_num,
    const int feature_in_len,
    const int sampled_pts_num,
    const char *points_xyz_gdram,
    const char *point_features_gdram,
    const char *boxes3d_gdram,
    char *pooled_features_gdram,
    char *pooled_empty_flag_gdram);
/**
 * Host-side entry point: launches the large-boxes_num RoIPointPool3d forward
 * kernel instantiation matching the runtime data type. Unsupported data
 * types are silently ignored, matching the original dispatch behavior.
 */
void KernelRoiPointPool3dLargeBoxesNumForward(cnrtDim3_t k_dim,
                                              cnrtFunctionType_t k_type,
                                              cnrtQueue_t queue,
                                              const cnrtDataType_t d_type,
                                              const int batch_size,
                                              const int pts_num,
                                              const int boxes_num,
                                              const int feature_in_len,
                                              const int sampled_pts_num,
                                              const void *points_xyz,
                                              const void *boxes3d,
                                              const void *point_features,
                                              void *pooled_features,
                                              int *pooled_empty_flag) {
  if (d_type == CNRT_FLOAT32) {
    MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<float><<<k_dim, k_type, queue>>>(
        batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
        (char *)points_xyz, (char *)point_features, (char *)boxes3d,
        (char *)pooled_features, (char *)pooled_empty_flag);
  } else if (d_type == CNRT_FLOAT16) {
    MLUUnion1KernelRoiPointPool3dLargeBoxesNumForward<half><<<k_dim, k_type, queue>>>(
        batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
        (char *)points_xyz, (char *)point_features, (char *)boxes3d,
        (char *)pooled_features, (char *)pooled_empty_flag);
  }
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
/**************************************************************************************
*
* NRAM partition:
* | boxes3d | cnt |
* | boxes_num * 7 * sizeof(T) | boxes_num * sizeof(int) |
*
* | ping points | pong points | aux_a ~ aux_f |
* | 3 * deal_num * sizeof(T) | 3 * deal_num * sizeof(T) | 6 * deal_num * sizeof(T) |
*
***************************************************************************************/
#define TWELVE_SPLIT 12
__nram__ char nram_buffer[MAX_NRAM_SIZE];
/**
 * Marks, for each of deal_num points, whether it lies inside the rotated 3D
 * box described by boxes3d. Writes pts_assign[i] = 1 (inside) or 0 (outside).
 *
 * The point is rotated into the box's local frame around (cx, cy); membership
 * is |local_x| < dx/2 && |local_y| < dy/2 && |z - cz| <= dz/2.
 * auxiliary_a..auxiliary_f are caller-provided scratch buffers, each holding
 * at least deal_num elements; auxiliary_a/b keep (x-cx)/(y-cy) on return.
 */
template <typename T>
__mlu_func__ void checkPointsInBox3d(const T *boxes3d,
                                     const size_t deal_num,
                                     T *x,
                                     T *y,
                                     T *z,
                                     T *auxiliary_a,
                                     T *auxiliary_b,
                                     T *auxiliary_c,
                                     T *auxiliary_d,
                                     T *auxiliary_e,
                                     T *auxiliary_f,
                                     T *pts_assign) {
  // param box3d: (cx, cy, cz, dx, dy, dz, rz) in LiDAR coordinate
  T cx = boxes3d[0];
  T cy = boxes3d[1];
  T cz = boxes3d[2];
  T dx = boxes3d[3];
  T dy = boxes3d[4];
  T dz = boxes3d[5];
  T rz = boxes3d[6];
  // shift to the center since cz in box3d is the bottom center
  cz += 0.5 * dz;
  T cosa = (T)std::cos(-rz);
  T sina = (T)std::sin(-rz);
  // x - cx
  __bang_sub_scalar((T *)auxiliary_a, (T *)x, (T)cx, deal_num);
  // y - cy
  __bang_sub_scalar((T *)auxiliary_b, (T *)y, (T)cy, deal_num);
  // z - cz
  __bang_sub_scalar((T *)auxiliary_c, (T *)z, (T)cz, deal_num);
  // |z - cz|
  __bang_active_abs((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
  // |z - cz| > dz / 2.0
#if __BANG_ARCH__ >= 322
  __bang_gt_scalar((T *)auxiliary_c, (T *)auxiliary_c, (T)(0.5 * dz), deal_num);
#else
  // Older archs lack __bang_gt_scalar: broadcast dz/2 and use elementwise lt
  // with the operands swapped (d < c  <=>  c > d).
  __bang_write_value((T *)auxiliary_d, deal_num, (T)(0.5 * dz));
  __bang_lt((T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_c, deal_num);
#endif
  // !(|z - cz| > dz / 2.0)  ==  |z - cz| <= dz / 2.0
  __bang_not((T *)auxiliary_c, (T *)auxiliary_c, deal_num);
  // (x - cx) * cos(-rz)
  __bang_mul_scalar((T *)auxiliary_d, (T *)auxiliary_a, (T)cosa, deal_num);
  // (y - cy) * sin(-rz)
  __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_b, (T)sina, deal_num);
  // local_x = (x - cx) * cos(-rz) + (y - cy) * -sin(-rz)
  __bang_sub((T *)auxiliary_d, (T *)auxiliary_d, (T *)auxiliary_e, deal_num);
  // |local_x|
  __bang_active_abs((T *)auxiliary_d, (T *)auxiliary_d, deal_num);
  // |local_x| < dx / 2.0
#if __BANG_ARCH__ >= 322
  __bang_lt_scalar(auxiliary_d, auxiliary_d, (T)(0.5 * dx), deal_num);
#else
  __bang_write_value((T *)auxiliary_e, deal_num, (T)(0.5 * dx));
  __bang_gt((T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_d, deal_num);
#endif
  // (x - cx) * sin(-rz)
  __bang_mul_scalar((T *)auxiliary_e, (T *)auxiliary_a, (T)sina, deal_num);
  // (y - cy) * cos(-rz)
  __bang_mul_scalar((T *)auxiliary_f, (T *)auxiliary_b, (T)cosa, deal_num);
  // local_y = (x - cx) * sin(-rz) + (y - cy) * cos(-rz)
  __bang_add((T *)auxiliary_e, (T *)auxiliary_e, (T *)auxiliary_f, deal_num);
  // |local_y|
  __bang_active_abs((T *)auxiliary_e, (T *)auxiliary_e, deal_num);
  // |local_y| < dy / 2.0
#if __BANG_ARCH__ >= 322
  __bang_lt_scalar(auxiliary_e, auxiliary_e, (T)(0.5 * dy), deal_num);
#else
  __bang_write_value((T *)auxiliary_f, deal_num, (T)(0.5 * dy));
  __bang_gt((T *)auxiliary_e, (T *)auxiliary_f, (T *)auxiliary_e, deal_num);
#endif
  // pts_assign = (|local_x| < dx / 2.0) && (|local_y| < dy / 2.0) && (|z - cz| <= dz / 2.0)
  // (logical AND of 0/1 masks realized as elementwise multiply)
  __bang_mul((T *)pts_assign, (T *)auxiliary_c, (T *)auxiliary_d, deal_num);
  __bang_mul((T *)pts_assign, (T *)pts_assign, (T *)auxiliary_e, deal_num);
}
/**
 * Pools one span of points for box `box_idx`: selects the span's points that
 * fall inside the box and appends their (x, y, z, features...) rows to
 * pooled_features_gdram, until the box holds sampled_pts_num points.
 *
 * cnt[box_idx] is the number of points already pooled for this box; it is
 * advanced by select_num at the end. NOTE(review): when select_num exceeds
 * the remaining capacity, cnt deliberately overshoots sampled_pts_num so
 * later spans are skipped by the guard below (only the first
 * sampled_pts_num_rem rows are actually written, via segnum).
 */
template <typename T>
__mlu_func__ void computeStoreRoipointPool3d(char *boxes3d,
                                             int *cnt,
                                             char *points_x,
                                             char *points_y,
                                             char *points_z,
                                             const char *point_features,
                                             char *auxiliary_a,
                                             char *auxiliary_b,
                                             char *auxiliary_c,
                                             char *auxiliary_d,
                                             char *auxiliary_e,
                                             char *auxiliary_f,
                                             const int box_idx,
                                             const int pts_num,
                                             const int feature_in_len,
                                             const int sampled_pts_num,
                                             const size_t span_num_deal,
                                             char *pooled_features_gdram,
                                             char *pooled_empty_flag_gdram) {
  // pts_assign aliases auxiliary_a: checkPointsInBox3d writes the mask there.
  char *pts_assign = auxiliary_a;
  // Box already full: skip this span entirely.
  if (cnt[box_idx] >= sampled_pts_num) {
    return;
  }
  // pts_assign[i] = 1 if point i of the span lies inside the box, else 0.
  checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
                     (T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
                     (T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
                     (T *)pts_assign);
  // __bang_select returns selected elements vector and the number of selected elements
  __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
  uint32_t select_num = *((uint32_t *)auxiliary_b);
  if (select_num == 0) {
    return;
  }
  int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
  // Segment count minus one for the strided __memcpy below (one element per
  // output row of width (3 + feature_in_len)).
  int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
  // copy x to pooled_features_gdram
  // The result of __bang_select is composed of three parts:
  // The first 4-byte is the number of selected element, whose data type is unsigned int.
  // The next 124-byte is zero. The rest bytes are the selected elements.
  int select_num_size = 128;
  __memcpy(pooled_features_gdram +
           (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
           (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
           (3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
  // copy y to pooled_features_gdram
  __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
  __memcpy(pooled_features_gdram +
           (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
           1 * sizeof(T),
           (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
           segnum);
  // copy z to pooled_features_gdram
  __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
  __memcpy(pooled_features_gdram +
           (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
           2 * sizeof(T),
           (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
           segnum);
  // copy features to pooled_features_gdram: one channel at a time, loading
  // the channel's span from GDRAM, compacting it with the same mask, then
  // scattering into column (3 + c_idx) of the output rows.
  for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
    __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
             GDRAM2NRAM);
    __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
    __memcpy(pooled_features_gdram +
             (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
             (3 + c_idx) * sizeof(T),
             auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
             segnum);
  }
  cnt[box_idx] += select_num;
}
/**
 * Same pooling as computeStoreRoipointPool3d, but for the final span of a
 * box: besides collecting this span's in-box points it finalizes the box's
 * outputs —
 *   - writes pooled_empty_flag (1 if no point at all fell in the box, else 0),
 *   - zero-fills the box's pooled_features rows when empty,
 *   - duplicates the already-pooled points when fewer than sampled_pts_num
 *     were found.
 *
 * auxiliary_num_deal is the scratch-span capacity (elements per auxiliary
 * buffer); it may differ from span_num_deal when the last span is shorter.
 */
template <typename T>
__mlu_func__ void computeStoreLastBlockRoipointPool3d(char *boxes3d,
                                                      int *cnt,
                                                      char *points_x,
                                                      char *points_y,
                                                      char *points_z,
                                                      const char *point_features,
                                                      char *auxiliary_a,
                                                      char *auxiliary_b,
                                                      char *auxiliary_c,
                                                      char *auxiliary_d,
                                                      char *auxiliary_e,
                                                      char *auxiliary_f,
                                                      const int box_idx,
                                                      const int pts_num,
                                                      const int feature_in_len,
                                                      const int sampled_pts_num,
                                                      const size_t span_num_deal,
                                                      const size_t auxiliary_num_deal,
                                                      char *pooled_features_gdram,
                                                      char *pooled_empty_flag_gdram) {
  // pts_assign aliases auxiliary_a: checkPointsInBox3d writes the mask there.
  char *pts_assign = auxiliary_a;
  if (cnt[box_idx] >= sampled_pts_num) {
    // Box filled by earlier spans: just finalize the flag.
    // pooled_empty_flag_gdram set 0
    *((int *)auxiliary_a) = 0;
    __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
    return;
  }
  checkPointsInBox3d((T *)(boxes3d + box_idx * 7 * sizeof(T)), span_num_deal, (T *)points_x,
                     (T *)points_y, (T *)points_z, (T *)auxiliary_a, (T *)auxiliary_b,
                     (T *)auxiliary_c, (T *)auxiliary_d, (T *)auxiliary_e, (T *)auxiliary_f,
                     (T *)pts_assign);
  // __bang_select returns selected elements vector and the number of selected elements
  __bang_select((T *)auxiliary_b, (T *)points_x, (T *)pts_assign, span_num_deal);
  uint32_t select_num = *((uint32_t *)auxiliary_b);
  if (cnt[box_idx] + select_num == 0) {
    // No point ever fell inside this box: flag it empty and zero its output.
    // pooled_empty_flag_gdram set 1
    *((int *)auxiliary_a) = 1;
    __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
    // pooled_features_gdram set 0, written out in chunks of the combined
    // scratch area (6 auxiliary buffers used as one zero block).
    int repeat = (sampled_pts_num * (3 + feature_in_len)) / (auxiliary_num_deal * 6);
    int rem = (sampled_pts_num * (3 + feature_in_len)) % (auxiliary_num_deal * 6);
    // use auxiliary_a to auxiliary_f
    __bang_write_zero((T *)auxiliary_a, PAD_UP(auxiliary_num_deal * 6, NFU_ALIGN_SIZE));
    if (repeat > 0) {
      __memcpy(pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
               auxiliary_a, auxiliary_num_deal * 6 * sizeof(T), NRAM2GDRAM,
               auxiliary_num_deal * 6 * sizeof(T), 0, repeat - 1);
    }
    if (rem > 0) {
      __memcpy(pooled_features_gdram +
               box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T) +
               repeat * auxiliary_num_deal * 6 * sizeof(T),
               auxiliary_a, rem * sizeof(T), NRAM2GDRAM);
    }
    return;
  }
  if (select_num > 0) {
    int sampled_pts_num_rem = sampled_pts_num - cnt[box_idx];
    // Segment count minus one for the strided __memcpy below.
    int segnum = min((int)select_num, sampled_pts_num_rem) - 1;
    // copy x to pooled_features_gdram
    // The result of __bang_select is composed of three parts:
    // The first 4-byte is the number of selected element, whose data type is unsigned int.
    // The next 124-byte is zero. The rest bytes are the selected elements.
    int select_num_size = 128;
    __memcpy(pooled_features_gdram +
             (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
             (T *)((int8_t *)auxiliary_b + select_num_size), sizeof(T), NRAM2GDRAM,
             (3 + feature_in_len) * sizeof(T), sizeof(T), segnum);
    // copy y to pooled_features_gdram
    __bang_collect((T *)auxiliary_d, (T *)points_y, (T *)pts_assign, span_num_deal);
    __memcpy(pooled_features_gdram +
             (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
             1 * sizeof(T),
             (T *)auxiliary_d, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
             segnum);
    // copy z to pooled_features_gdram
    __bang_collect((T *)auxiliary_e, (T *)points_z, (T *)pts_assign, span_num_deal);
    __memcpy(pooled_features_gdram +
             (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
             2 * sizeof(T),
             (T *)auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
             segnum);
    // copy features to pooled_features_gdram, one channel per iteration.
    for (int c_idx = 0; c_idx < feature_in_len; c_idx++) {
      __memcpy(auxiliary_d, point_features + c_idx * pts_num * sizeof(T), span_num_deal * sizeof(T),
               GDRAM2NRAM);
      __bang_collect((T *)auxiliary_e, (T *)auxiliary_d, (T *)pts_assign, span_num_deal);
      __memcpy(pooled_features_gdram +
               (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T) +
               (3 + c_idx) * sizeof(T),
               auxiliary_e, sizeof(T), NRAM2GDRAM, (3 + feature_in_len) * sizeof(T), sizeof(T),
               segnum);
    }
  }
  // pooled_empty_flag_gdram set 0
  *((int *)auxiliary_a) = 0;
  __memcpy(pooled_empty_flag_gdram + box_idx * sizeof(int), auxiliary_a, sizeof(int), NRAM2GDRAM);
  cnt[box_idx] += select_num;
  if (cnt[box_idx] < sampled_pts_num) {
    // duplicate same points for sampling: tile the cnt pooled rows until
    // sampled_pts_num rows are filled (repeat whole copies plus a remainder).
    int repeat = sampled_pts_num / cnt[box_idx] - 1;
    int rem = sampled_pts_num % cnt[box_idx];
    if (repeat > 0) {
      __memcpy(pooled_features_gdram +
               (box_idx * sampled_pts_num + cnt[box_idx]) * (3 + feature_in_len) * sizeof(T),
               pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
               cnt[box_idx] * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM,
               cnt[box_idx] * (3 + feature_in_len) * sizeof(T), 0, repeat - 1);
    }
    if (rem > 0) {
      __memcpy(pooled_features_gdram + (box_idx * sampled_pts_num + (repeat + 1) * cnt[box_idx]) *
               (3 + feature_in_len) * sizeof(T),
               pooled_features_gdram + box_idx * sampled_pts_num * (3 + feature_in_len) * sizeof(T),
               rem * (3 + feature_in_len) * sizeof(T), GDRAM2GDRAM);
    }
  }
}
/**
 * RoIPointPool3d forward kernel (regular boxes_num): all boxes of a batch
 * element are resident in NRAM at once (boxes3d + per-box cnt array), and
 * each point span is checked against every box of the core's slice while the
 * next span is prefetched through a ping-pong pipeline.
 *
 * Layouts match the large-boxes_num variant:
 *   points_xyz_gdram      : [3, B, N]
 *   point_features_gdram  : [B, feature_in_len, N]
 *   boxes3d_gdram         : [B, boxes_num, 7]
 *   pooled_features_gdram : [B, boxes_num, sampled_pts_num, 3 + feature_in_len]
 *   pooled_empty_flag_gdram : [B, boxes_num] (int)
 *
 * Fix vs. previous revision: the float/half branches that padded the
 * remainder span were byte-identical, so the dead `sizeof(T)` split has been
 * collapsed into a single sequence (no behavior change).
 */
template <typename T>
__mlu_global__ void MLUUnion1KernelRoiPointPool3dForward(
    const int batch_size,
    const int pts_num,
    const int boxes_num,
    const int feature_in_len,
    const int sampled_pts_num,
    const char *points_xyz_gdram,
    const char *point_features_gdram,
    const char *boxes3d_gdram,
    char *pooled_features_gdram,
    char *pooled_empty_flag_gdram) {
  // The MPU core (coreId 0x80) takes no part in the computation.
  if (coreId == 0x80) {
    return;
  }
  size_t boxes_per_core = (batch_size * boxes_num) / taskDim;
  size_t boxes_rem = (batch_size * boxes_num) % taskDim;
  // calc batch_start, batch_end, first_batch_box_start, last batch_box_end for each core
  int32_t batch_start = taskId < (boxes_rem + 1) ?
                        (taskId * (boxes_per_core + 1)) / boxes_num :
                        (taskId * boxes_per_core + boxes_rem) / boxes_num;
  int32_t batch_end = taskId < boxes_rem ?
                      ((taskId + 1) * (boxes_per_core + 1) - 1) / boxes_num :
                      ((taskId + 1) * boxes_per_core + boxes_rem - 1) / boxes_num;
  size_t first_batch_box_start = taskId < (boxes_rem + 1) ?
                                 (taskId * (boxes_per_core + 1)) - batch_start * boxes_num :
                                 taskId * boxes_per_core + boxes_rem - batch_start * boxes_num;
  size_t last_batch_box_end = taskId < boxes_rem ?
                              (taskId + 1) * (boxes_per_core + 1) - batch_end * boxes_num :
                              ((taskId + 1) * boxes_per_core + boxes_rem) - batch_end * boxes_num;
  // points_xyz : [3, B, N]
  const char *points_x_gdram = points_xyz_gdram;
  const char *points_y_gdram = points_xyz_gdram + (1 * batch_size * pts_num) * sizeof(T);
  const char *points_z_gdram = points_xyz_gdram + (2 * batch_size * pts_num) * sizeof(T);
  // NRAM partition: boxes3d and the per-box cnt array are carved off first,
  // the rest is split into 12 spans (2 ping/pong * 3 coords + 6 scratch).
  size_t boxes3d_size = PAD_UP(boxes_num * 7, NFU_ALIGN_SIZE) * sizeof(T);
  size_t cnt_size = PAD_UP(boxes_num, NFU_ALIGN_SIZE) * sizeof(int);
  size_t span_num_deal = PAD_DOWN(
      (MAX_NRAM_SIZE - boxes3d_size - cnt_size) / TWELVE_SPLIT / sizeof(T), NFU_ALIGN_SIZE);
  size_t align_num = NFU_ALIGN_SIZE;
  int32_t repeat = pts_num / span_num_deal;
  size_t rem = pts_num % span_num_deal;
  size_t align_rem = CEIL_ALIGN(rem, align_num);
  char *boxes3d = nram_buffer;
  char *cnt = nram_buffer + boxes3d_size;
  char *ping_points_x = cnt + cnt_size;
  char *ping_points_y = ping_points_x + span_num_deal * sizeof(T);
  char *ping_points_z = ping_points_y + span_num_deal * sizeof(T);
  size_t ping_pong_gap = 3 * span_num_deal * sizeof(T);
  char *auxiliary_a = ping_points_x + 2 * ping_pong_gap;
  char *auxiliary_b = auxiliary_a + span_num_deal * sizeof(T);
  char *auxiliary_c = auxiliary_b + span_num_deal * sizeof(T);
  char *auxiliary_d = auxiliary_c + span_num_deal * sizeof(T);
  char *auxiliary_e = auxiliary_d + span_num_deal * sizeof(T);
  char *auxiliary_f = auxiliary_e + span_num_deal * sizeof(T);
  size_t span_load_input1_size = span_num_deal * sizeof(T);
  size_t span_load_input2_size = span_num_deal * sizeof(T);
  size_t span_load_input3_size = span_num_deal * sizeof(T);
  size_t span_load_input4_size = span_num_deal * sizeof(T);
  for (int bs_idx = batch_start; bs_idx <= batch_end; bs_idx++) {
    // Load all boxes of this batch element and reset the per-box counters.
    __memcpy_async(boxes3d, boxes3d_gdram + bs_idx * boxes_num * 7 * sizeof(T),
                   boxes_num * 7 * sizeof(T), GDRAM2NRAM);
    __bang_write_zero((int *)cnt, PAD_UP(boxes_num, NFU_ALIGN_SIZE));
    const char *points_x_start = points_x_gdram + bs_idx * pts_num * sizeof(T);
    const char *points_y_start = points_y_gdram + bs_idx * pts_num * sizeof(T);
    const char *points_z_start = points_z_gdram + bs_idx * pts_num * sizeof(T);
    const char *point_features_start =
        point_features_gdram + bs_idx * feature_in_len * pts_num * sizeof(T);
    char *pooled_features_start =
        pooled_features_gdram +
        (bs_idx * boxes_num * sampled_pts_num * (3 + feature_in_len)) * sizeof(T);
    char *pooled_empty_flag_start = pooled_empty_flag_gdram + bs_idx * boxes_num * sizeof(int);
    size_t box_start = bs_idx == batch_start ? first_batch_box_start : 0;
    size_t box_end = bs_idx == batch_end ? last_batch_box_end : boxes_num;
    if (repeat > 0) {
      __memcpy_async(ping_points_x, points_x_start, span_load_input1_size, GDRAM2NRAM);
      __memcpy_async(ping_points_y, points_y_start, span_load_input2_size, GDRAM2NRAM);
      __memcpy_async(ping_points_z, points_z_start, span_load_input3_size, GDRAM2NRAM);
      __asm__ volatile("sync;");
    }
    // Software pipeline: prefetch span i+1, process span i against all boxes.
    for (int i = 0; i < repeat - 1; i++) {
      __memcpy_async(ping_points_x + ((i + 1) % 2) * ping_pong_gap,
                     points_x_start + (i + 1) * span_load_input1_size, span_load_input1_size,
                     GDRAM2NRAM);
      __memcpy_async(ping_points_y + ((i + 1) % 2) * ping_pong_gap,
                     points_y_start + (i + 1) * span_load_input2_size, span_load_input2_size,
                     GDRAM2NRAM);
      __memcpy_async(ping_points_z + ((i + 1) % 2) * ping_pong_gap,
                     points_z_start + (i + 1) * span_load_input3_size, span_load_input3_size,
                     GDRAM2NRAM);
      for (int box_idx = box_start; box_idx < box_end; box_idx++) {
        computeStoreRoipointPool3d<T>(
            boxes3d, (int *)cnt, ping_points_x + (i % 2) * ping_pong_gap,
            ping_points_y + (i % 2) * ping_pong_gap, ping_points_z + (i % 2) * ping_pong_gap,
            point_features_start + i * span_load_input4_size, auxiliary_a, auxiliary_b, auxiliary_c,
            auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
      }
      __asm__ volatile("sync;");
    }
    if (rem > 0) {
      // Pad the aligned tail of the remainder span with NAN so the padded
      // lanes never satisfy the inside-box comparisons in checkPointsInBox3d.
      // (The former float/half split here had identical branches and was
      // removed.)
      __bang_write_value((T *)(ping_points_x + (repeat % 2) * ping_pong_gap +
                               PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                         NFU_ALIGN_SIZE, (T)NAN);
      __bang_write_value((T *)(ping_points_y + (repeat % 2) * ping_pong_gap +
                               PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                         NFU_ALIGN_SIZE, (T)NAN);
      __bang_write_value((T *)(ping_points_z + (repeat % 2) * ping_pong_gap +
                               PAD_DOWN(rem, NFU_ALIGN_SIZE) * sizeof(T)),
                         NFU_ALIGN_SIZE, (T)NAN);
      __memcpy_async(ping_points_x + (repeat % 2) * ping_pong_gap,
                     points_x_start + repeat * span_load_input1_size, rem * sizeof(T), GDRAM2NRAM);
      __memcpy_async(ping_points_y + (repeat % 2) * ping_pong_gap,
                     points_y_start + repeat * span_load_input2_size, rem * sizeof(T), GDRAM2NRAM);
      __memcpy_async(ping_points_z + (repeat % 2) * ping_pong_gap,
                     points_z_start + repeat * span_load_input3_size, rem * sizeof(T), GDRAM2NRAM);
    }
    // Last full span: plain compute if a remainder follows, otherwise the
    // finalizing variant (writes empty flags / duplicates samples).
    if (repeat > 0 && rem > 0) {
      for (int box_idx = box_start; box_idx < box_end; box_idx++) {
        computeStoreRoipointPool3d<T>(
            boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
            point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
            auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, span_num_deal, pooled_features_start, pooled_empty_flag_start);
      }
    } else if (repeat > 0 && rem == 0) {
      for (int box_idx = box_start; box_idx < box_end; box_idx++) {
        computeStoreLastBlockRoipointPool3d<T>(
            boxes3d, (int *)cnt, ping_points_x + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_y + ((repeat - 1) % 2) * ping_pong_gap,
            ping_points_z + ((repeat - 1) % 2) * ping_pong_gap,
            point_features_start + (repeat - 1) * span_load_input4_size, auxiliary_a, auxiliary_b,
            auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, span_num_deal, span_num_deal, pooled_features_start,
            pooled_empty_flag_start);
      }
    }
    if (rem > 0) {
      __asm__ volatile("sync;");
      for (int box_idx = box_start; box_idx < box_end; box_idx++) {
        computeStoreLastBlockRoipointPool3d<T>(
            boxes3d, (int *)cnt, ping_points_x + (repeat % 2) * ping_pong_gap,
            ping_points_y + (repeat % 2) * ping_pong_gap,
            ping_points_z + (repeat % 2) * ping_pong_gap,
            point_features_start + repeat * span_load_input4_size, auxiliary_a, auxiliary_b,
            auxiliary_c, auxiliary_d, auxiliary_e, auxiliary_f, box_idx, pts_num, feature_in_len,
            sampled_pts_num, align_rem, span_num_deal, pooled_features_start,
            pooled_empty_flag_start);
      }
    }
  }
}
// Explicit instantiations for the two data types dispatched by the host
// wrapper KernelRoiPointPool3dForward (CNRT_FLOAT32 / CNRT_FLOAT16).
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<float>(
    const int batch_size,
    const int pts_num,
    const int boxes_num,
    const int feature_in_len,
    const int sampled_pts_num,
    const char *points_xyz_gdram,
    const char *point_features_gdram,
    const char *boxes3d_gdram,
    char *pooled_features_gdram,
    char *pooled_empty_flag_gdram);
template __mlu_global__ void MLUUnion1KernelRoiPointPool3dForward<half>(
    const int batch_size,
    const int pts_num,
    const int boxes_num,
    const int feature_in_len,
    const int sampled_pts_num,
    const char *points_xyz_gdram,
    const char *point_features_gdram,
    const char *boxes3d_gdram,
    char *pooled_features_gdram,
    char *pooled_empty_flag_gdram);
/**
 * Host-side entry point: launches the RoIPointPool3d forward kernel
 * instantiation matching the runtime data type. Unsupported data types are
 * silently ignored, matching the original dispatch behavior.
 */
void KernelRoiPointPool3dForward(cnrtDim3_t k_dim,
                                 cnrtFunctionType_t k_type,
                                 cnrtQueue_t queue,
                                 const cnrtDataType_t d_type,
                                 const int batch_size,
                                 const int pts_num,
                                 const int boxes_num,
                                 const int feature_in_len,
                                 const int sampled_pts_num,
                                 const void *points_xyz,
                                 const void *boxes3d,
                                 const void *point_features,
                                 void *pooled_features,
                                 int *pooled_empty_flag) {
  if (d_type == CNRT_FLOAT32) {
    MLUUnion1KernelRoiPointPool3dForward<float><<<k_dim, k_type, queue>>>(
        batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
        (char *)points_xyz, (char *)point_features, (char *)boxes3d,
        (char *)pooled_features, (char *)pooled_empty_flag);
  } else if (d_type == CNRT_FLOAT16) {
    MLUUnion1KernelRoiPointPool3dForward<half><<<k_dim, k_type, queue>>>(
        batch_size, pts_num, boxes_num, feature_in_len, sampled_pts_num,
        (char *)points_xyz, (char *)point_features, (char *)boxes3d,
        (char *)pooled_features, (char *)pooled_empty_flag);
  }
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
#include <algorithm>
// Per-core NRAM scratch buffer used by the ThreeNN kernels in this section.
__nram__ char nram_buffer[MAX_NRAM_SIZE];
#if __BANG_ARCH__ >= 322
/**
 * Returns the index that __bang_min/__bang_argmin stores alongside the
 * minimum for half-precision data: ret[0] holds the minimum value and the
 * following 4 bytes hold its 32-bit index.
 */
__mlu_func__ uint32_t getIndice(half *ret) {
  uint32_t indice = *((uint32_t *)((uint16_t *)ret + 1));
  return indice;
}
/**
 * Returns the index that __bang_min/__bang_argmin stores alongside the
 * minimum for float data: ret[0] holds the minimum value and ret[1] holds
 * its 32-bit index (reinterpreted as uint32_t).
 */
__mlu_func__ uint32_t getIndice(float *ret) {
  uint32_t indice = ((uint32_t *)ret)[1];
  return indice;
}
#endif
template <typename T>
// Fallback argmin for architectures without __bang_argmin (< 322):
// finds the minimum of nram_src[0..num_deal) and the first index where it
// occurs. nram_dst is scratch; results come back through *value / *index.
__mlu_func__ void auxArgmin(T *nram_dst, T *nram_src, const int num_deal,
                            T *value, int *index) {
  __bang_min(nram_dst, nram_src, num_deal);  // nram_dst[0] = minimum
  *value = nram_dst[0];
  __bang_write_value(nram_dst, num_deal, *value);  // broadcast the minimum
  __bang_eq(nram_dst, nram_src, nram_dst, num_deal);  // 1 where src == min
  __bang_findfirst1((uint32_t *)nram_dst, nram_dst, num_deal);  // first hit
  *index = *((int *)nram_dst);
}
template <typename T>
// Selects the 3 smallest values in nram_aux_a[0..auxa_offset) together with
// their companion entries in nram_aux_b, and compacts them into the first 3
// slots of each buffer; remaining slots become +inf / 0. nram_dest is
// argmin scratch. deal_offset is unused in this helper.
__mlu_func__ void auxFuncFind3Min(T *nram_aux_a, const int auxa_offset,
                                  int *nram_aux_b, const int auxb_offset,
                                  T *nram_dest, T *nram_aux_sort_a,
                                  int *nram_aux_sort_b, const int deal_offset) {
  __bang_write_value(nram_aux_sort_a, auxa_offset, (T)(INFINITY));
  __bang_write_value(nram_aux_sort_b, auxb_offset, (int)0);
  int index = 0;
  for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
    __bang_argmin(nram_dest, nram_aux_a, auxa_offset);
    nram_aux_sort_a[i] = nram_dest[0];
    index = getIndice(nram_dest);
#else
    T value = 0;
    auxArgmin(nram_dest, nram_aux_a, auxa_offset, &value, &index);
    nram_aux_sort_a[i] = value;
#endif
    nram_aux_sort_b[i] = nram_aux_b[index];
    // Knock out the winner so the next iteration finds the next-smallest.
    __memset_nram(nram_aux_a + index, 1, (T)(INFINITY));
  }
  // Copy the sorted front (plus +inf / 0 filler) back over the aux buffers.
  __memcpy((char *)nram_aux_a, (char *)nram_aux_sort_a, auxa_offset * sizeof(T),
           NRAM2NRAM);
  __memcpy((char *)nram_aux_b, (char *)nram_aux_sort_b,
           auxb_offset * sizeof(int), NRAM2NRAM);
}
template <typename T>
// Runs the 3-minimum selection (auxFuncFind3Min) independently for each of
// the num_deal unknown points held in the aux buffers.
__mlu_func__ void auxFuncSort(T *nram_aux_a, const int auxa_offset,
                              int *nram_aux_b, const int auxb_offset,
                              T *nram_dest, T *nram_help_value,
                              int *nram_help_idx, const int num_deal,
                              const int deal_offset) {
  for (int point = 0; point < num_deal; point++) {
    T *cur_aux_a = nram_aux_a + point * auxa_offset;
    int *cur_aux_b = nram_aux_b + point * auxb_offset;
    auxFuncFind3Min(cur_aux_a, auxa_offset, cur_aux_b, auxb_offset, nram_dest,
                    nram_help_value, nram_help_idx, deal_offset);
  }
}
template <typename T>
// Computes the NRAM partition (byte offsets from the start of nram_buffer)
// and the per-segment deal sizes for the ThreeNN kernel. All results are
// written through the pointer parameters; see the layout diagram below.
__mlu_func__ void auxFuncNN(
    size_t *output_aux_sort_a_gap, size_t *output_aux_sort_b_gap,
    size_t *output_aux_dest_gap, size_t *output_unknown_gap,
    size_t *output_known_gap, size_t *output_dist_gap, size_t *auxillary_a_gap,
    size_t *auxillary_b_gap, size_t *known_num_deal, size_t *unknown_num_deal,
    size_t *align_num, size_t *auxa_offset, size_t *auxb_offset) {
  /*
   * nram partition:
   * |-NFU_ALIGN_SIZE-|-2*NFU_ALIGN_SIZE-|-X*3*sizeof(T)-|
   * space:   | aux_sort_a |     aux_sort_b   | nram_unknown |
   *
   * | ------ (Y * 7 *sizeof(T)) ---------------- |
   * | nram_known | nram_dist |   nram_dest  |
   *
   * | -X * NFU_ALIGN_SIZE ---|---X * 2 * NFU_ALIGN_SIZE-|
   * |    output_dist(aux_a)  |     output_dist(aux_b)   |
   * 200 series
   * X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (2/3) / (3 * sizeof(T) + 3 *
   * NFU_ALIGN_SIZE)
   * Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (1/3) / (7 * sizeof(T))
   * 300 series
   * X = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) * (4/5) / (3 *
   * sizeof(T) + 3 * NFU_ALIGN_SIZE)
   * Y = (MAX_NRAM - 3 * NFU_ALIGN_SIZE) *
   * (1/5) / (7 * sizeof(T))
   *
   */
  // Element counts per NFU-aligned unit.
  *align_num = NFU_ALIGN_SIZE / sizeof(T);
  *auxa_offset = NFU_ALIGN_SIZE / sizeof(T);
  *auxb_offset = 2 * NFU_ALIGN_SIZE / sizeof(int);
  // Segment sizes (element counts), rounded down to the alignment.
#if __BANG_ARCH__ >= 322
  *known_num_deal = PAD_DOWN(
      (MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 / (7 * sizeof(T)), *align_num);
  *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 5 * 4 /
                                   (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
                               *align_num);
#else
  *known_num_deal = PAD_DOWN(
      (MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 / (7 * sizeof(T)), *align_num);
  *unknown_num_deal = PAD_DOWN((MAX_NRAM_SIZE - 3 * NFU_ALIGN_SIZE) / 3 * 2 /
                                   (3 * sizeof(T) + 3 * NFU_ALIGN_SIZE),
                               *align_num);
#endif
  // Running byte offsets, laid out in the order of the diagram above.
  *output_aux_sort_a_gap = 0;
  *output_aux_sort_b_gap = *output_aux_sort_a_gap + NFU_ALIGN_SIZE;
  *output_aux_dest_gap = *output_aux_sort_b_gap + 2 * NFU_ALIGN_SIZE;
  *output_unknown_gap = *output_aux_dest_gap + *known_num_deal * sizeof(T);
  *output_known_gap = *output_unknown_gap + *unknown_num_deal * 3 * sizeof(T);
  *output_dist_gap = *output_known_gap + *known_num_deal * 3 * sizeof(T);
  *auxillary_a_gap = *output_dist_gap + *known_num_deal * 3 * sizeof(T);
  *auxillary_b_gap = *auxillary_a_gap + *unknown_num_deal * NFU_ALIGN_SIZE;
}
#if __BANG_ARCH__ >= 322
/**
 * True when any of the first three coordinates (x, y, z) of nram_unknown is
 * NaN or infinite.
 */
template <typename T>
__mlu_func__ bool containNanInf(T *nram_unknown) {
  for (int i = 0; i < 3; ++i) {
    if (std::isnan(nram_unknown[i]) || std::isinf(nram_unknown[i])) {
      return true;
    }
  }
  return false;
}
#endif
template <typename T>
// For one unknown point (nram_unknown[0..2] = x, y, z), computes squared
// Euclidean distances to the known_count known points of this segment and
// records the 3 nearest: squared distance into nram_aux_a and global known
// index into nram_aux_b, both starting at deal_offset. No sqrt is taken.
__mlu_func__ void computeThreeNN(T *nram_unknown, T *nram_known, T *nram_dist,
                                 T *nram_dest, T *nram_aux_a,
                                 T *nram_aux_sort_a, int *nram_aux_b,
                                 int *nram_aux_sort_b, const int known_num_deal,
                                 const int known_seg_num, const int deal_offset,
                                 const int known_count,
                                 const int known_count_align) {
  // Default every distance to +inf so padded lanes never win the argmin.
  __bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
  // If the unknown point contains NaN/Inf, leave all distances at +inf.
  if (!containNanInf(nram_unknown)) {
#endif
    // x1 - x2
    __bang_sub_scalar(nram_dist, nram_known, nram_unknown[0],
                      known_count_align);
    // y1 - y2
    __bang_sub_scalar(nram_dist + known_count_align,
                      nram_known + known_count_align, nram_unknown[1],
                      known_count_align);
    // z1 - z2
    __bang_sub_scalar(nram_dist + 2 * known_count_align,
                      nram_known + 2 * known_count_align, nram_unknown[2],
                      known_count_align);
    // dist = dx^2 + dy^2 + dz^2, accumulated into the first plane.
    __bang_square(nram_dist, nram_dist, 3 * known_count_align);
    __bang_add(nram_dist, nram_dist, nram_dist + known_count_align,
               known_count_align);
    __bang_add(nram_dist, nram_dist, nram_dist + 2 * known_count_align,
               known_count_align);
#if __BANG_ARCH__ >= 322
  }
#endif
  int index = 0;
  // Pick the 3 smallest distances; after each pick, overwrite the winning
  // slot with +inf so the next iteration finds the next-smallest.
  for (int i = 0; i < 3; i++) {
#if __BANG_ARCH__ >= 322
    __bang_argmin(nram_dest, nram_dist, known_count_align);
    nram_aux_a[i + deal_offset] = nram_dest[0];
    index = getIndice(nram_dest);
#else
    T value = 0;
    auxArgmin(nram_dest, nram_dist, known_count_align, &value, &index);
    nram_aux_a[i + deal_offset] = value;
#endif
    // Convert the segment-local index to the global known-point index.
    nram_aux_b[i + deal_offset] = index + known_seg_num * known_num_deal;
    __memset_nram(nram_dist + index, 1, (T)(INFINITY));
  }
}
template <typename T>
// Loads one segment of the known tensor (x, y, z planes of `count` elements
// each, source row stride m) from GDRAM into nram_known, leaving the
// alignment tail at +inf so padded lanes cannot be selected as a minimum.
__mlu_func__ void loadTransposedKnownTensor(
    char *nram_known, char *nram_dist, const char *known_gdram,
    const int known_num_deal, const int batch_id, const int m,
    const int known_seg_num, const int count, const int count_align_num) {
  __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
#if __BANG_ARCH__ >= 322
  // Stage through nram_dist (also pre-filled with +inf), then take the
  // element-wise min so copied values land while padding stays +inf.
  __bang_write_value(nram_dist, 3 * known_num_deal, (T)(INFINITY));
  __memcpy(nram_dist,
           known_gdram +
               (batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
           count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
           m * sizeof(T), 2);
  __bang_minequal((T *)nram_known, (T *)nram_known, (T *)nram_dist,
                  3 * count_align_num);
#else
  __memcpy(nram_known,
           known_gdram +
               (batch_id * m * 3 + known_seg_num * known_num_deal) * sizeof(T),
           count * sizeof(T), GDRAM2NRAM, count_align_num * sizeof(T),
           m * sizeof(T), 2);
#endif
}
template <typename T>
// Copies `count` interleaved (x, y, z) unknown points of segment
// unknown_seg_num from GDRAM into nram_unknown.
__mlu_func__ void loadUnknownTensor(char *nram_unknown,
                                    const char *unknown_gdram,
                                    const int unknown_num_deal,
                                    const int unknown_seg_num, const int count,
                                    const int count_align_num) {
  const size_t seg_byte_offset =
      unknown_seg_num * unknown_num_deal * 3 * sizeof(T);
  __memcpy(nram_unknown, unknown_gdram + seg_byte_offset,
           count * 3 * sizeof(T), GDRAM2NRAM);
}
template <typename T>
// Processes one (unknown segment, known segment) pair: loads the known data
// for the batch of the current unknown points (reloading whenever the batch
// id changes mid-segment) and accumulates 3-NN candidates for each unknown
// point into nram_aux_a / nram_aux_b at *deal_offset.
__mlu_func__ void auxProcessSegment(
    const int m, const int n, T *nram_unknown, T *nram_known, T *nram_dist,
    T *nram_dest, T *known_gdram, T *nram_aux_a, const int auxa_offset,
    int *nram_aux_b, const int auxb_offset, T *nram_aux_sort_a,
    int *nram_aux_sort_b, const int unknown_num_deal, const int known_num_deal,
    const int known_seg_num, const int unknown_seg_num, const int unknown_count,
    const int known_count, const int known_count_align, const int start_idx,
    int *deal_offset) {
  int pre_batch_id = -1;
  int cur_batch_id = -1;
  // n unknown points per batch, so batch id = flat index / n.
  pre_batch_id = start_idx / n;
  // if aux_a space is not enough, get the first 3 min among aux_a and clear.
  if (*deal_offset >= PAD_DOWN(auxa_offset, 3)) {
    auxFuncSort(nram_aux_a, auxa_offset, nram_aux_b, auxb_offset, nram_dest,
                nram_aux_sort_a, nram_aux_sort_b, unknown_count, *deal_offset);
    *deal_offset = 3;
  }
  // load i'th segment of known batch data.
  loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
                               (char *)known_gdram, known_num_deal,
                               pre_batch_id, m, known_seg_num, known_count,
                               known_count_align);
  for (int k = 0; k < unknown_count; ++k) {
    cur_batch_id = (start_idx + k) / n;
    if (cur_batch_id != pre_batch_id) {  // if batch id of unknown data changed,
                                         // load corresponding known batch data
      pre_batch_id = cur_batch_id;
      loadTransposedKnownTensor<T>((char *)nram_known, (char *)nram_dist,
                                   (char *)known_gdram, known_num_deal,
                                   pre_batch_id, m, known_seg_num, known_count,
                                   known_count_align);
    }
    // Accumulate this unknown point's 3 nearest candidates from the segment.
    computeThreeNN(nram_unknown + 3 * k, nram_known, nram_dist, nram_dest,
                   nram_aux_a + k * auxa_offset, nram_aux_sort_a,
                   nram_aux_b + k * auxb_offset, nram_aux_sort_b,
                   known_num_deal, known_seg_num, *deal_offset, known_count,
                   known_count_align);
  }
}
template <typename T>
// ThreeNN kernel: for each of the b*n "unknown" points, finds the 3 nearest
// of the m "known" points in the same batch, writing squared distances to
// dist2_gdram and indices to idx_gdram (3 values per unknown point). Work is
// split across tasks by unknown point; both unknown and known data are
// processed in NRAM-sized segments.
__mlu_global__ void MLUUnion1KernelThreeNN(const int b, const int n,
                                           const int m, char *unknown_gdram,
                                           char *known_gdram, char *dist2_gdram,
                                           int *idx_gdram) {
  // Skip the memory core; only compute cores participate.
  if (coreId == 0x80) {
    return;
  }
  size_t output_aux_sort_a_gap = 0, output_aux_sort_b_gap = 0,
         output_dest_gap = 0, output_unknown_gap = 0, output_known_gap = 0,
         output_dist_gap = 0, auxillary_a_gap = 0, auxillary_b_gap = 0,
         known_num_deal = 0, unknown_num_deal = 0, align_num = 0,
         auxa_offset = 0, auxb_offset = 0;
  // Compute the NRAM partition and per-segment deal sizes.
  auxFuncNN<T>(&output_aux_sort_a_gap, &output_aux_sort_b_gap, &output_dest_gap,
               &output_unknown_gap, &output_known_gap, &output_dist_gap,
               &auxillary_a_gap, &auxillary_b_gap, &known_num_deal,
               &unknown_num_deal, &align_num, &auxa_offset, &auxb_offset);
  int num_per_core = b * n / taskDim;
  const int core_offset = num_per_core;
  char *unknown_gdram_start =
      unknown_gdram + taskId * 3 * core_offset * sizeof(T);
  char *known_gdram_start = known_gdram;
  char *output_dist_start = dist2_gdram + taskId * 3 * core_offset * sizeof(T);
  int *output_idx_start = idx_gdram + taskId * 3 * core_offset;
  const int rem = (b * n) % taskDim;
  // The last task absorbs the remainder points.
  if (taskId == taskDim - 1) {
    num_per_core += rem;
  }
  const int unknown_repeat =
      num_per_core / unknown_num_deal;  // if unknown number is big, process it
                                        // by unknown_repeat times.
  const int unknown_rem = num_per_core % unknown_num_deal;  // unknown reminder
  const int unknown_rem_align = PAD_UP(unknown_rem, align_num);
  const int known_repeat =
      m / known_num_deal;  // if known number is big, process it by
                           // unknown_repeat times.
  const int known_rem = m % known_num_deal;  // known reminder
  const int known_rem_align = PAD_UP(known_rem, align_num);
  // NRAM pointers derived from the partition computed by auxFuncNN.
  char *nram_aux_sort_a = nram_buffer;
  int *nram_aux_sort_b = (int *)(nram_buffer + output_aux_sort_b_gap);
  char *nram_dest = nram_buffer + output_dest_gap;
  char *nram_unknown = nram_buffer + output_unknown_gap;
  char *nram_known = nram_buffer + output_known_gap;
  char *nram_dist = nram_buffer + output_dist_gap;
  char *nram_aux_a = nram_buffer + auxillary_a_gap;
  int *nram_aux_b = (int *)(nram_buffer + auxillary_b_gap);
  int deal_offset = 0;
  int start_idx = -1;
  for (int j = 0; j < unknown_repeat;
       ++j) {  // process data within a unknown_repeat
    // if unknown need to be process segmentally, use a aux_a and aux_b
    // space to find first 3 minimum dist.
    __bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
                       (T)(INFINITY));
    __bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
    loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal, j,
                         unknown_num_deal, unknown_num_deal);
    deal_offset = 0;
    start_idx = taskId * core_offset + j * unknown_num_deal;
    for (int i = 0; i < known_repeat;
         ++i) {  // process known data in segmentally.
      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, i, j, unknown_num_deal,
          known_num_deal, known_num_deal, start_idx, &deal_offset);
      deal_offset += 3;
    }
    if (known_rem > 0) {  // process known rem
      __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, known_repeat, j, unknown_num_deal,
          known_rem, known_rem_align, start_idx, &deal_offset);
    }
    deal_offset += 3;
    // More than one known segment contributed candidates: reduce them to the
    // final 3 minima per unknown point.
    if (deal_offset > 3) {
      auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
                  (T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
                  unknown_num_deal, deal_offset);
      deal_offset = 0;
    }
    // Strided store: 3 distances / indices per unknown point, gathered from
    // the front of each aux row.
    __memcpy((char *)output_dist_start + j * unknown_num_deal * 3 * sizeof(T),
             (char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
             auxa_offset * sizeof(T), unknown_num_deal - 1);
    __memcpy((char *)output_idx_start + j * unknown_num_deal * 3 * sizeof(int),
             (char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
             auxb_offset * sizeof(int), unknown_num_deal - 1);
  }
  if (unknown_rem > 0) {  // process unknown rem
    deal_offset = 0;
    __bang_write_value(nram_aux_a, unknown_num_deal * auxa_offset,
                       (T)(INFINITY));
    __bang_write_value(nram_aux_b, unknown_num_deal * auxb_offset, (int)0);
    loadUnknownTensor<T>(nram_unknown, unknown_gdram_start, unknown_num_deal,
                         unknown_repeat, unknown_rem, unknown_rem_align);
    start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;
    for (int i = 0; i < known_repeat; ++i) {
      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, i, unknown_repeat, unknown_rem,
          known_num_deal, known_num_deal, start_idx, &deal_offset);
      deal_offset += 3;
    }
    if (known_rem > 0) {
      __bang_write_value(nram_known, 3 * known_num_deal, (T)(INFINITY));
      start_idx = taskId * core_offset + unknown_repeat * unknown_num_deal;
      auxProcessSegment<T>(
          m, n, (T *)nram_unknown, (T *)nram_known, (T *)nram_dist,
          (T *)nram_dest, (T *)known_gdram_start, (T *)nram_aux_a, auxa_offset,
          nram_aux_b, auxb_offset, (T *)nram_aux_sort_a, nram_aux_sort_b,
          unknown_num_deal, known_num_deal, known_repeat, unknown_repeat,
          unknown_rem, known_rem, known_rem_align, start_idx, &deal_offset);
      deal_offset += 3;
    }
    if (deal_offset > 3) {
      auxFuncSort((T *)nram_aux_a, auxa_offset, nram_aux_b, auxb_offset,
                  (T *)nram_dest, (T *)nram_aux_sort_a, nram_aux_sort_b,
                  unknown_rem, deal_offset);
      deal_offset = 0;
    }
    __memcpy((char *)output_dist_start +
                 unknown_repeat * unknown_num_deal * 3 * sizeof(T),
             (char *)nram_aux_a, 3 * sizeof(T), NRAM2GDRAM, 3 * sizeof(T),
             auxa_offset * sizeof(T), unknown_rem - 1);
    __memcpy((char *)output_idx_start +
                 unknown_repeat * unknown_num_deal * 3 * sizeof(int),
             (char *)nram_aux_b, 3 * sizeof(int), NRAM2GDRAM, 3 * sizeof(int),
             auxb_offset * sizeof(int), unknown_rem - 1);
  }
}
// Explicit instantiations of the ThreeNN kernel for the two data types
// dispatched by KernelThreeNNForward below.
template __mlu_global__ void MLUUnion1KernelThreeNN<float>(
    const int b, const int n, const int m, char *unknown_gdram,
    char *known_gdram, char *dist2_gdram, int *idx_gdram);
template __mlu_global__ void MLUUnion1KernelThreeNN<half>(
    const int b, const int n, const int m, char *unknown_gdram,
    char *known_gdram, char *dist2_gdram, int *idx_gdram);
// Host-side launcher for the ThreeNN kernel. Dispatches on the tensor data
// type; unsupported dtypes are a silent no-op (matching the original switch
// default).
void KernelThreeNNForward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          cnrtQueue_t queue, cnrtDataType_t data_type,
                          const void *unknown, const void *known, void *dist2,
                          int *idx, const int b, const int n, const int m) {
  if (data_type == CNRT_FLOAT16) {
    MLUUnion1KernelThreeNN<half><<<k_dim, k_type, queue>>>(
        b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
  } else if (data_type == CNRT_FLOAT32) {
    MLUUnion1KernelThreeNN<float><<<k_dim, k_type, queue>>>(
        b, n, m, (char *)unknown, (char *)known, (char *)dist2, idx);
  }
}
/*************************************************************************
* Copyright (C) 2022 Cambricon.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*************************************************************************/
#include "common_mlu_helper.hpp"
// Per-core NRAM scratch buffer used by the TIN-shift kernels below.
__nram__ char data_nram[MAX_NRAM_SIZE];
template <typename T>
// Temporal shift (TIN-shift) along the time axis, one (batch, channel) pair
// per loop iteration, pairs distributed round-robin over tasks. Each channel
// belongs to a group whose shift comes from `shifts`; time slots shifted in
// from outside the sequence are zero-filled (data_nram is zeroed first).
// Strides show the flattened layout is (batch, time, channel, hw).
__mlu_func__ void mluMultiKernelTinShift(
    const T *input, const int *shifts, T *output, const int batch_size,
    const int time_size, const int channel_size, const int hw_size,
    const int group_size, const int group_channel) {
  for (int cur_channel_index = taskId;
       cur_channel_index < batch_size * channel_size;
       cur_channel_index += taskDim) {
    int n_index = cur_channel_index / channel_size;
    int group_id = cur_channel_index % channel_size / group_channel;
    int t_shift = shifts[n_index * group_size + group_id];
    // Flat offset of (n_index, t = 0, this channel).
    int index = cur_channel_index % channel_size * hw_size +
                n_index * time_size * channel_size * hw_size;
    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
    __asm__ volatile("sync;");
    if (abs(t_shift) >= time_size) {
      // Shift moves the whole sequence out of range: output all zeros.
      __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
               channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
               time_size - 1);
    } else {
      if (t_shift > 0) {
        // Forward shift: the first t_shift time slots stay zero.
        __memcpy(data_nram + t_shift * hw_size * sizeof(T), input + index,
                 hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
                 channel_size * hw_size * sizeof(T), time_size - 1 - t_shift);
        __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
                 channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
                 time_size - 1);
      } else {
        // Backward shift: read from later time slots; the tail stays zero.
        __memcpy(data_nram, input + (index - t_shift * channel_size * hw_size),
                 hw_size * sizeof(T), GDRAM2NRAM, hw_size * sizeof(T),
                 channel_size * hw_size * sizeof(T), time_size - 1 + t_shift);
        __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
                 channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
                 time_size - 1);
      }
    }
    __asm__ volatile("sync;");
  }
}
template <typename T>
// Fallback for hw planes too large for NRAM: processes one time slot of one
// channel in chunks of max_length_per_core elements. data_nram is zeroed by
// the caller, so out-of-range source time slots emit zeros.
__mlu_func__ void mluHwSplit(const T *input, const int t_shift,
                             const int time_size, const int hw_size,
                             const int channel_size, const int index,
                             const int cur_sequence_index,
                             const int max_length_per_core, T *output) {
  for (int cur_index = index; cur_index < index + hw_size;
       cur_index += max_length_per_core) {
    int memcpy_size = max_length_per_core;
    if (cur_index + max_length_per_core > index + hw_size) {
      // Last, possibly partial, chunk.
      memcpy_size = index + hw_size - cur_index;
    }
    if (cur_sequence_index - t_shift < 0 ||
        cur_sequence_index - t_shift >= time_size) {
      // Source time slot out of range: write zeros from the cleared buffer.
      __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
               NRAM2GDRAM);
    } else {
      // Stage the shifted source chunk through NRAM, then store it.
      __memcpy(data_nram, input + cur_index - t_shift * channel_size * hw_size,
               memcpy_size * sizeof(T), GDRAM2NRAM);
      __memcpy(output + cur_index, data_nram, memcpy_size * sizeof(T),
               NRAM2GDRAM);
    }
    __asm__ volatile("sync;");
  }
}
template <typename T>
// TIN-shift variant used when a whole channel does not fit on one core: the
// time dimension is cut into segments of max_number_hw_per_core slots and
// each (batch, channel, segment) triple is one unit of work, distributed
// round-robin over tasks. When max_number_hw_per_core == 0 even one hw plane
// does not fit and mluHwSplit is used instead. Out-of-range time sources are
// zero-filled (data_nram is cleared before each segment).
__mlu_func__ void mluMultiKernelTinShiftSplitSequence(
    const T *input, const int *shifts, T *output, const int batch_size,
    const int time_size, const int channel_size, const int hw_size,
    const int group_size, const int group_channel,
    const int max_number_hw_per_core, const int max_length_per_core) {
  // Guard against 0 so the segment size / loop count math stays valid.
  const int tmp_max_number_hw_per_core =
      max_number_hw_per_core > 0 ? max_number_hw_per_core : 1;
  const int loop_time = time_size / tmp_max_number_hw_per_core +
                        ((time_size % tmp_max_number_hw_per_core) > 0 ? 1 : 0);
  int segmentime_size = tmp_max_number_hw_per_core;
  int res_segment = time_size % tmp_max_number_hw_per_core;
  for (int cur_segment_index = taskId;
       cur_segment_index < loop_time * batch_size * channel_size;
       cur_segment_index += taskDim) {
    // Decode (batch, channel, segment) from the flat work index.
    int n_index = cur_segment_index / loop_time / channel_size;
    int group_id = cur_segment_index / loop_time % channel_size / group_channel;
    int t_shift = shifts[n_index * group_size + group_id];
    int index = n_index * time_size * channel_size * hw_size +
                (cur_segment_index / loop_time % channel_size) * hw_size +
                cur_segment_index % loop_time * segmentime_size * hw_size *
                    channel_size;
    char *dst_gdram2nram = data_nram;
    const T *src_gdram2nram = input + index;
    int count_gdram2nram = -1;
    int count_nram2gdram = -1;
    // Time-slot range [cur_sequence_index, next_sequence_index) covered by
    // this segment.
    int next_sequence_index =
        index / hw_size / channel_size % time_size + segmentime_size;
    int cur_sequence_index = index / hw_size / channel_size % time_size;
    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
    __asm__ volatile("sync;");
    if (max_number_hw_per_core == 0) {
      // hw plane itself does not fit: chunk it element-wise instead.
      mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,
                 cur_sequence_index, max_length_per_core, output);
      continue;
    }
    if (abs(t_shift) >= time_size) {
      // Whole sequence shifted out of range: output zeros for this segment.
      if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
        __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
                 channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
                 res_segment - 1);
      } else {
        __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
                 channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
                 segmentime_size - 1);
      }
      continue;
    }
    if (t_shift == 0) {
      // No shift: straight copy of this segment (shorter when it is the
      // trailing remainder segment).
      if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
        dst_gdram2nram = data_nram;
        src_gdram2nram = input + index;
        count_gdram2nram = res_segment - 1;
        count_nram2gdram = res_segment - 1;
      } else {
        dst_gdram2nram = data_nram;
        src_gdram2nram = input + index;
        count_gdram2nram = segmentime_size - 1;
        count_nram2gdram = segmentime_size - 1;
      }
    } else if (t_shift > 0) {
      // Forward shift: sources come from earlier time slots; clamp at the
      // start of the channel and leave leading slots zero.
      int first_index_cur_channel =
          n_index * time_size * channel_size * hw_size +
          (cur_segment_index / loop_time % channel_size) * hw_size;
      if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
        dst_gdram2nram = data_nram;
        src_gdram2nram =
            input +
            (index - t_shift * channel_size * hw_size < first_index_cur_channel
                 ? first_index_cur_channel
                 : index - t_shift * channel_size * hw_size);
        count_gdram2nram = res_segment - 1;
        count_nram2gdram = res_segment - 1;
        if (cur_sequence_index < t_shift && t_shift < next_sequence_index) {
          // The shift boundary falls inside this segment: offset the NRAM
          // destination so the pre-boundary slots stay zero.
          dst_gdram2nram =
              data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
          count_gdram2nram = res_segment - (t_shift - cur_sequence_index) - 1;
        }
      } else {
        if (t_shift >= next_sequence_index) {
          // Entire segment precedes the shift boundary: all zeros.
          __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
                   channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
                   segmentime_size - 1);
          continue;
        } else if (cur_sequence_index < t_shift &&
                   t_shift < next_sequence_index) {
          // Boundary inside the segment: copy from the channel start into a
          // zero-offset region of NRAM.
          dst_gdram2nram =
              data_nram + t_shift % segmentime_size * hw_size * sizeof(T);
          src_gdram2nram = input + first_index_cur_channel;
          count_gdram2nram = segmentime_size - (t_shift % segmentime_size) - 1;
          count_nram2gdram = segmentime_size - 1;
        } else {
          // Segment fully past the boundary: plain shifted copy.
          dst_gdram2nram = data_nram;
          src_gdram2nram = input + index - t_shift * channel_size * hw_size;
          count_gdram2nram = segmentime_size - 1;
          count_nram2gdram = segmentime_size - 1;
        }
      }
    } else {
      // Backward shift (t_shift < 0): sources come from later time slots;
      // slots at/after offset_index have no source and stay zero.
      int offset_index = time_size + t_shift;
      if (cur_sequence_index >= offset_index) {
        if ((cur_segment_index + 1) % loop_time == 0 && res_segment != 0) {
          __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
                   channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
                   res_segment - 1);
          continue;
        } else {
          __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
                   channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
                   segmentime_size - 1);
          continue;
        }
      } else {
        dst_gdram2nram = data_nram;
        src_gdram2nram = input + index - t_shift * channel_size * hw_size;
        if (cur_sequence_index - t_shift + segmentime_size < time_size) {
          count_gdram2nram = segmentime_size - 1;
          count_nram2gdram = segmentime_size - 1;
        } else {
          // Source window runs off the end of the sequence: shorten the load
          // and clamp the store to the remaining valid slots.
          count_gdram2nram = time_size - (cur_sequence_index - t_shift) - 1;
          count_nram2gdram =
              (segmentime_size - 1) < (time_size - cur_sequence_index - 1)
                  ? (segmentime_size - 1)
                  : (time_size - cur_sequence_index - 1);
        }
      }
    }
    __memcpy(dst_gdram2nram, src_gdram2nram, hw_size * sizeof(T), GDRAM2NRAM,
             hw_size * sizeof(T), channel_size * hw_size * sizeof(T),
             count_gdram2nram);
    __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
             channel_size * hw_size * sizeof(T), hw_size * sizeof(T),
             count_nram2gdram);
    __asm__ volatile("sync;");
  }
}
// Device entry point for the whole-channel TIN-shift kernel: dispatches on
// the runtime data type. Unsupported dtypes return without work, matching
// the original switch default.
__mlu_entry__ void MLUUnion1KernelTinShift(
    const void *input, const void *shifts, void *output, const int batch_size,
    const int time_size, const int channel_size, const int hw_size,
    const int group_size, const int group_channel,
    const cnrtDataType_t data_dtype) {
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }
  if (data_dtype == CNRT_FLOAT16) {
    mluMultiKernelTinShift((half *)input, (const int *)shifts, (half *)output,
                           batch_size, time_size, channel_size, hw_size,
                           group_size, group_channel);
  } else if (data_dtype == CNRT_FLOAT32) {
    mluMultiKernelTinShift((float *)input, (const int *)shifts,
                           (float *)output, batch_size, time_size,
                           channel_size, hw_size, group_size, group_channel);
  }
}
// Device entry point for the sequence-splitting TIN-shift kernel: dispatches
// on the runtime data type. Unsupported dtypes return without work, matching
// the original switch default.
__mlu_entry__ void MLUUnion1KernelTinShiftSplitSequence(
    const void *input, const void *shifts, void *output, const int batch_size,
    const int time_size, const int channel_size, const int hw_size,
    const int group_size, const int group_channel,
    const int max_number_hw_per_core, const int max_length_per_core,
    const cnrtDataType_t data_dtype) {
  // make sure that memcore is not used
  if (coreId == 0x80) {
    return;
  }
  if (data_dtype == CNRT_FLOAT16) {
    mluMultiKernelTinShiftSplitSequence(
        (half *)input, (const int *)shifts, (half *)output, batch_size,
        time_size, channel_size, hw_size, group_size, group_channel,
        max_number_hw_per_core, max_length_per_core);
  } else if (data_dtype == CNRT_FLOAT32) {
    mluMultiKernelTinShiftSplitSequence(
        (float *)input, (const int *)shifts, (float *)output, batch_size,
        time_size, channel_size, hw_size, group_size, group_channel,
        max_number_hw_per_core, max_length_per_core);
  }
}
// Host-side launcher for TIN-shift forward. Chooses the whole-channel kernel
// when at least one channel fits per core, otherwise the sequence-splitting
// variant.
void KernelTinShiftForward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const void *input, const void *shifts, void *output, const int batch_size,
    const int time_size, const int channel_size, const int hw_size,
    const int group_size, const int group_channel,
    const cnrtDataType_t data_dtype, const int channel_per_core,
    const int max_number_hw_per_core, const int max_length_per_core) {
  if (channel_per_core < 1) {
    MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
        input, shifts, output, batch_size, time_size, channel_size, hw_size,
        group_size, group_channel, max_number_hw_per_core, max_length_per_core,
        data_dtype);
    return;
  }
  MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
      input, shifts, output, batch_size, time_size, channel_size, hw_size,
      group_size, group_channel, data_dtype);
}
// Host-side launcher for TIN-shift backward: reuses the forward kernels on
// the gradient tensors, with the same kernel-selection rule as the forward
// launcher.
void KernelTinShiftBackward(
    cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    const void *grad_output, const void *shifts, void *grad_input,
    const int batch_size, const int time_size, const int channel_size,
    const int hw_size, const int group_size, const int group_channel,
    const cnrtDataType_t data_dtype, const int channel_per_core,
    const int max_number_hw_per_core, const int max_length_per_core) {
  if (channel_per_core < 1) {
    MLUUnion1KernelTinShiftSplitSequence<<<k_dim, k_type, queue>>>(
        grad_output, shifts, grad_input, batch_size, time_size, channel_size,
        hw_size, group_size, group_channel, max_number_hw_per_core,
        max_length_per_core, data_dtype);
    return;
  }
  MLUUnion1KernelTinShift<<<k_dim, k_type, queue>>>(
      grad_output, shifts, grad_input, batch_size, time_size, channel_size,
      hw_size, group_size, group_channel, data_dtype);
}
// Copyright © 2022 Apple Inc.
// This file is modify from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSDevice.h
#pragma once
#include <ATen/ATen.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLDevice> MTLDevice_t;
#else
typedef void* MTLDevice;
typedef void* MTLDevice_t;
#endif
using namespace std;
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSDevice
//
// MPSDevice is a singleton class that returns the default device
//-----------------------------------------------------------------
class TORCH_API MPSDevice {
 public:
  /**
   * MPSDevice should not be cloneable.
   */
  MPSDevice(MPSDevice& other) = delete;
  /**
   * MPSDevice should not be assignable.
   */
  void operator=(const MPSDevice&) = delete;
  /**
   * Gets single instance of the Device.
   */
  static MPSDevice* getInstance();
  /**
   * Returns the single device.
   */
  MTLDevice_t device() { return _mtl_device; }
  ~MPSDevice();

 private:
  // Singleton storage managed by getInstance().
  static MPSDevice* _device;
  // Underlying Metal device handle (id<MTLDevice> under __OBJC__,
  // opaque void* otherwise).
  MTLDevice_t _mtl_device;
  // Private: construct only through getInstance().
  MPSDevice();
};
/// True when an MPS device can be used on this system.
TORCH_API bool is_available();
/// Returns the MPS allocator. NOTE(review): useSharedAllocator presumably
/// selects a shared (CPU/GPU) storage-mode allocator — confirm in the
/// allocator implementation.
TORCH_API at::Allocator* GetMPSAllocator(bool useSharedAllocator = false);
} // namespace mps
} // namespace at
#ifndef _MPS_LIBRARY_H_
#define _MPS_LIBRARY_H_
#include <string>
#include <unordered_map>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLComputePipelineState> MTLComputePipelineState_t;
typedef id<MTLLibrary> MTLLibrary_t;
#else
typedef void* MTLComputePipelineState;
typedef void* MTLComputePipelineState_t;
typedef void* MTLLibrary;
typedef void* MTLLibrary_t;
#endif
/// Wraps a compiled Metal library and a per-function cache of compute
/// pipeline states (see _pso_map).
class MPSLibrary {
 public:
  // Instances are created only through the static factory helpers below.
  /// Loads a pre-compiled .metallib from a file URL; returns a new instance.
  static MPSLibrary* createFromUrl(const std::string& library_url);
  /// Compiles Metal source code at runtime; returns a new instance.
  static MPSLibrary* createFromSource(const std::string& source);
  ~MPSLibrary();
  /// Underlying id<MTLLibrary> (opaque void* when compiled as plain C++).
  MTLLibrary_t library() { return _library; }
  /// Returns the pipeline state for the named kernel function
  /// (presumably cached in _pso_map — confirm in the implementation).
  MTLComputePipelineState_t getComputePipelineState(
      const std::string& function_name);

 private:
  MTLLibrary_t _library;
  // function name -> pipeline state
  std::unordered_map<std::string, MTLComputePipelineState_t> _pso_map;
};
/// Process-wide registry mapping library names / URLs to owned MPSLibrary
/// instances. Singleton: use getInstance().
class MPSLibraryManager {
 public:
  // Non-copyable, non-movable singleton.
  MPSLibraryManager(const MPSLibraryManager&) = delete;
  MPSLibraryManager& operator=(const MPSLibraryManager&) = delete;
  MPSLibraryManager(MPSLibraryManager&&) = delete;
  MPSLibraryManager& operator=(MPSLibraryManager&&) = delete;
  /// Returns the lazily-created singleton instance.
  static MPSLibraryManager* getInstance();
  /// True when a library is already registered under `name`.
  bool hasLibrary(const std::string& name);
  /// Returns the library for `library_url`, loading it on first access.
  MPSLibrary* getLibrary(const std::string& library_url);
  /// Compiles `sources` and registers the result under `name`.
  /// NOTE(review): "Souce" is a typo for "Source"; kept because callers
  /// depend on this spelling — renaming would be an interface break.
  MPSLibrary* createLibraryFromSouce(const std::string& name,
                                     const std::string& sources);
  ~MPSLibraryManager();

 private:
  // Private: construct only through getInstance().
  MPSLibraryManager();
  // name / URL -> owned library instance
  std::unordered_map<std::string, std::unique_ptr<MPSLibrary>> _library_map;
};
#endif
#include "MPSLibrary.h"
#include "MPSDevice.h"
static std::unique_ptr<MPSLibraryManager> mps_library_manager=nullptr;
// Returns the process-wide MPSLibraryManager singleton.
//
// The previous implementation lazily assigned a file-scope unique_ptr with no
// synchronization, so two threads racing into the first call could construct
// two managers. A C++11 function-local static has guaranteed thread-safe
// one-time initialization and preserves the interface exactly.
MPSLibraryManager* MPSLibraryManager::getInstance() {
  static MPSLibraryManager instance;  // thread-safe magic-static init (C++11)
  return &instance;
}
// Defaulted-out-of-line destructor; the _library_map unique_ptrs release the
// owned MPSLibrary objects.
MPSLibraryManager::~MPSLibraryManager() {}
// Private constructor: instances are only created via getInstance().
MPSLibraryManager::MPSLibraryManager() {}
// Reports whether a library has already been registered under `name`.
bool MPSLibraryManager::hasLibrary(const std::string& name) {
  return _library_map.count(name) != 0;
}
// Returns the library cached under `library_url`, loading the compiled
// .metallib from disk on first use.
//
// The previous version hashed `library_url` up to three times (find, then
// operator[] twice); reusing the iterators does each lookup once.
MPSLibrary* MPSLibraryManager::getLibrary(const std::string& library_url) {
  auto it = _library_map.find(library_url);
  if (it != _library_map.end()) {
    return it->second.get();
  }
  // Not cached yet: load and take ownership. emplace returns the new entry's
  // iterator, so no second lookup is needed.
  auto inserted = _library_map.emplace(
      library_url, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromUrl(library_url)));
  return inserted.first->second.get();
}
// Compiles `source` into a new Metal library and registers it under `name`.
// Returns nullptr (and logs) if `name` is already registered.
//
// Fixes: stringWithCString: without an encoding is deprecated (it assumes the
// default C-string encoding) — use the explicit UTF-8 constructor; the
// NSString is now only built on the error path; the emplace iterator avoids a
// redundant second map lookup.
MPSLibrary* MPSLibraryManager::createLibraryFromSouce(const std::string& name,
                                                      const std::string& source) {
  if (_library_map.find(name) != _library_map.end()) {
    NSString* ns_name = [NSString stringWithUTF8String:name.c_str()];
    NSLog(@"Library %@ already exist.", ns_name);
    return nullptr;
  }
  auto inserted = _library_map.emplace(
      name, std::unique_ptr<MPSLibrary>(MPSLibrary::createFromSource(source)));
  return inserted.first->second.get();
}
// Loads a precompiled .metallib from the given file path and wraps it in a
// new MPSLibrary. Exits the process (matching this file's error style) if the
// library cannot be loaded. The caller owns the returned object.
MPSLibrary* MPSLibrary::createFromUrl(const std::string& library_url) {
  MPSLibrary* library = new MPSLibrary();
  @autoreleasepool {
    NSError* error = nil;
    // stringWithCString: without an encoding is deprecated and assumes the
    // default C-string encoding; use the explicit UTF-8 constructor instead.
    NSString* url_str = [NSString stringWithUTF8String:library_url.c_str()];
    NSURL* metal_url = [NSURL fileURLWithPath:url_str];
    library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithURL:metal_url
                                                                                 error:&error];
    if (library->_library == nil) {
      NSLog(@"Failed to find library, error %@.", error);
      exit(1);
    }
  }
  return library;
}
// Compiles Metal source code at runtime into a new MPSLibrary. Exits the
// process (matching this file's error style) if compilation fails. The caller
// owns the returned object.
MPSLibrary* MPSLibrary::createFromSource(const std::string& sources) {
  MPSLibrary* library = new MPSLibrary();
  @autoreleasepool {
    NSError* error = nil;
    // stringWithCString: without an encoding is deprecated and assumes the
    // default C-string encoding; use the explicit UTF-8 constructor instead.
    NSString* code_str = [NSString stringWithUTF8String:sources.c_str()];
    // options:nil compiles with default compile options.
    library->_library = [at::mps::MPSDevice::getInstance()->device() newLibraryWithSource:code_str
                                                                                  options:nil
                                                                                    error:&error];
    if (library->_library == nil) {
      NSLog(@"Failed to find library, error %@.", error);
      exit(1);
    }
  }
  return library;
}
// Releases the Metal library handle obtained from the +1 newLibrary* APIs.
// NOTE(review): the explicit -release implies this file is compiled without
// ARC (MRC); under ARC this line would not compile — confirm build flags.
MPSLibrary::~MPSLibrary() {
  [_library release];
  _library = nil;
}
// Returns the compute pipeline state for the named kernel function, creating
// and caching it on first use. Exits the process (matching this file's error
// style) if the function or pipeline cannot be created.
//
// Fixes: the MTLFunction from newFunctionWithName: is a +1 object under MRC
// and was previously leaked; a nil pipeline state was previously cached
// silently; stringWithCString: (deprecated, no encoding) replaced with the
// UTF-8 constructor; the error message on the function path actually named
// the wrong failure; the redundant second map lookup is gone.
MTLComputePipelineState_t MPSLibrary::getComputePipelineState(const std::string& function_name) {
  auto it = _pso_map.find(function_name);
  if (it != _pso_map.end()) {
    return it->second;
  }
  MTLComputePipelineState_t pso = nil;
  @autoreleasepool {
    NSError* error = nil;
    // Look up the kernel function inside the library.
    NSString* function_name_str = [NSString stringWithUTF8String:function_name.c_str()];
    id<MTLFunction> func = [_library newFunctionWithName:function_name_str];
    if (func == nil) {
      NSLog(@"Failed to create function %@, error %@.", function_name_str, error);
      exit(1);
    }
    // Create the pipeline state object from the function.
    pso = [at::mps::MPSDevice::getInstance()->device() newComputePipelineStateWithFunction:func
                                                                                     error:&error];
    [func release];  // newFunctionWithName: returns +1 (MRC); was leaked before
    if (pso == nil) {
      NSLog(@"Failed to create pipeline state object, error %@.", error);
      exit(1);
    }
    _pso_map.emplace(function_name, pso);
  }
  return pso;
}
// Copyright © 2022 Apple Inc.
// This file is modify from:
// https://github.com/pytorch/pytorch/blob/a85d1f0bcdd02cf18d3b0517337458cb51a18cdb/aten/src/ATen/mps/MPSStream.h
#pragma once
#include <cstdint>
#include <utility>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/util/Exception.h>
#include "MPSDevice.h"
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
typedef id<MTLCommandQueue> MTLCommandQueue_t;
typedef id<MTLCommandBuffer> MTLCommandBuffer_t;
typedef id<MTLSharedEvent> MTLSharedEvent_t;
typedef id<MTLDevice> MTLDevice_t;
#else
// Opaque stand-ins so the header parses from pure C++ translation units.
typedef void* MTLCommandQueue_t;
typedef void* MTLCommandQueue;
typedef void* MTLCommandBuffer_t;
typedef void* MTLCommandBuffer;
typedef void* MTLSharedEvent_t;
typedef void* dispatch_queue_t;
typedef void* MTLDevice_t;
// FIX: the macro previously ended in a semicolon (`#define nil NULL;`), which
// injected a stray `;` into every expression use of `nil` (e.g. `x == nil`)
// and broke compilation in non-ObjC translation units.
#define nil NULL
#endif
namespace at {
namespace mps {
//-----------------------------------------------------------------
// MPSStream
//-----------------------------------------------------------------
class TORCH_API MPSStream {
 public:
  enum Unchecked { UNCHECKED };
  /// Construct a MPSStream from a Stream. This construction is checked,
  /// and will raise an error if the Stream is not, in fact, a MPS stream.
  explicit MPSStream(Stream stream);
  ~MPSStream();
  /// Metal command queue backing this stream.
  MTLCommandQueue_t commandQueue() const { return _commandQueue; };
  /// Serial dispatch queue used to order work on this stream.
  dispatch_queue_t queue() const { return _serialQueue; }
  /// Current command buffer (created lazily; see the .mm for details).
  MTLCommandBuffer_t commandBuffer();
  void commit(bool flush);
  void commitAndWait();
  void synchronize();
  void flush();
  /// Get the MPS device index that this stream is associated with.
  c10::DeviceIndex device_index() const { return _stream.device_index(); }
  /// Alias for commandQueue(); returns the same underlying queue.
  MTLCommandQueue_t stream() const { return _commandQueue; };
  /// Device that owns the command queue.
  MTLDevice_t device() const { return [_commandQueue device]; }
  /// Explicit conversion to Stream.
  Stream unwrap() const { return _stream; }
 private:
  Stream _stream;
  MTLCommandQueue_t _commandQueue = nil;
  MTLCommandBuffer_t _commandBuffer = nil;
  void _flush(bool commitAndWait) const;
  dispatch_queue_t _serialQueue = nullptr;
};
/**
* Get the current MPS stream
*/
TORCH_API MPSStream* getCurrentMPSStream();
/**
* Get the default MPS stream
*/
TORCH_API MPSStream* getDefaultMPSStream();
//-----------------------------------------------------------------
// MPSStreamImpl
//-----------------------------------------------------------------
class TORCH_API MPSStreamImpl {
 public:
  /**
   * Gets single instance of the MPSStream.
   */
  static MPSStream* getInstance();
 private:
  // Singleton storage; presumably created lazily by getInstance() — the
  // definition is not visible here, so confirm in the .mm file.
  static MPSStream* _stream;
  // Private: instances are only created through getInstance().
  MPSStreamImpl();
};
//-----------------------------------------------------------------
// MPSEvent
//-----------------------------------------------------------------
// Wraps a Metal shared event used for cross-stream / CPU synchronization.
struct TORCH_API MPSEvent {
  MPSEvent();
  // MPSEvent(id<MTLDevice> device);
  ~MPSEvent();
  // Underlying Metal shared event handle.
  MTLSharedEvent_t event() const { return _event; }
  void recordEvent(MPSStream* stream);
  void waitForEvent(MPSStream* queue); // waits on the cpu
  bool queryEvent();
  // Signal counter value accessors; semantics of the counter are defined by
  // the .mm implementation, which is not visible here.
  uint64_t getCurrentValue() { return _currentValue; }
  void setCurrentValue(uint64_t currValue) { _currentValue = currValue; }
 private:
  bool _isRecorded = false;
  uint64_t _currentValue = 0;
  MTLSharedEvent_t _event;
};
typedef MPSEvent* mpsEvent_t;
} // namespace mps
} // namespace at
#ifndef _MPS_UTILS_H_
#define _MPS_UTILS_H_
#include <torch/extension.h>
#ifdef __OBJC__
#include <Foundation/Foundation.h>
#include <Metal/Metal.h>
#include <MetalPerformanceShaders/MetalPerformanceShaders.h>
typedef id<MTLBuffer> MTLBuffer_t;
typedef id<MTLComputeCommandEncoder> MTLComputeCommandEncoder_t;
#else
typedef void* MTLBuffer;
typedef void* MTLBuffer_t;
typedef void* MTLComputeCommandEncoder;
typedef void* MTLComputeCommandEncoder_t;
#endif
// utils
// Reinterprets the tensor's storage pointer as the Metal buffer backing it.
// NOTE(review): __builtin_bit_cast is used (rather than a cast) to move the
// raw pointer into an ObjC object type without touching ARC ownership;
// assumes the tensor lives on the MPS device so its storage really is an
// MTLBuffer — not checked here.
static inline MTLBuffer_t getMTLBufferStorage(const at::Tensor& tensor) {
  return __builtin_bit_cast(MTLBuffer_t, tensor.storage().data());
}
// setMTLArg binds one kernel argument at `index` on the encoder. Two SFINAE
// overloads: at::Tensor arguments are bound as Metal buffers; everything else
// is passed by value with setBytes.
template <typename T,
          std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t);
// Tensor overload: bind the tensor's backing MTLBuffer at offset 0.
template <typename T,
          std::enable_if_t<std::is_same<std::decay_t<T>, at::Tensor>::value, bool> = true>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
  [encoder setBuffer:getMTLBufferStorage(t) offset:0 atIndex:index];
}
// Non-tensor overload: copy the value's bytes inline into the command stream.
template <typename T, std::enable_if_t<!std::is_same<std::decay_t<T>, at::Tensor>::value, bool>>
void setMTLArg(MTLComputeCommandEncoder_t encoder, int index, T&& t) {
  [encoder setBytes:&t length:sizeof(t) atIndex:index];
}
// Recursion base case: no arguments left to bind.
inline void setMTLArgsImpl(MTLComputeCommandEncoder_t, int) {}
// Binds `t` at `index`, then recurses to bind the remaining args at
// consecutive indices.
template <typename T, typename... Args>
void setMTLArgsImpl(MTLComputeCommandEncoder_t encoder, int index, T&& t, Args&&... args) {
  setMTLArg(encoder, index, std::forward<T>(t));
  setMTLArgsImpl(encoder, index + 1, std::forward<Args>(args)...);
}
// Convenience entry point: set the pipeline state on the encoder and bind all
// `args` as kernel arguments at indices 0, 1, 2, ...
template <typename... Args>
void setMTLArgs(MTLComputeCommandEncoder_t encoder, MTLComputePipelineState_t pso, Args&&... args) {
  [encoder setComputePipelineState:pso];
  setMTLArgsImpl(encoder, 0, std::forward<Args>(args)...);
}
#endif
#ifndef PYTORCH_CPP_HELPER
#define PYTORCH_CPP_HELPER
#include <torch/types.h>
#include <torch/extension.h>
#include <vector>
using namespace at;
// Ceiling division: number of size-n chunks needed to cover m.
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
// Device checks: TORCH_CHECK throws c10::Error with the given message when
// the condition fails.
#define CHECK_CUDA(x) \
  TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_MLU(x) \
  TORCH_CHECK(x.device().type() == at::kMLU, #x " must be a MLU tensor")
// FIX: a stray duplicate `TORCH_CHECK(!x.device().is_cuda(), ...)` statement
// (merge residue, not attached to any macro) followed CHECK_CPU at file scope
// and could not compile; it has been removed.
#define CHECK_CPU(x) \
  TORCH_CHECK(x.device().type() == at::kCPU, #x " must be a CPU tensor")
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
// Combined device + contiguity input validators.
#define CHECK_CUDA_INPUT(x) \
  CHECK_CUDA(x);            \
  CHECK_CONTIGUOUS(x)
#define CHECK_MLU_INPUT(x) \
  CHECK_MLU(x);            \
  CHECK_CONTIGUOUS(x)
#define CHECK_CPU_INPUT(x) \
  CHECK_CPU(x);            \
  CHECK_CONTIGUOUS(x)
......
......@@ -15,6 +15,5 @@ using at::Tensor;
using phalf = at::Half;
#define __PHALF(x) (x)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#endif // PYTORCH_CUDA_HELPER
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment