Sync main with mmcv1.x branch (#2800)

0c23eb02 · bdf · GitHub · 59c1418e · 0c23eb02 · 0c23eb02
Unverified Commit 0c23eb02 authored May 19, 2023 by bdf Committed by GitHub May 19, 2023
20 changed files
--- a/docs/en/understand_mmcv/ops.md
+++ b/docs/en/understand_mmcv/ops.md
@@ -9,7 +9,7 @@ We implement common ops used in detection, segmentation, etc.
 | BallQuery                    |     | √    | √   |     |        |
 | BBoxOverlaps                 |     | √    | √   | √   | √      |
 | BorderAlign                  |     | √    |     |     |        |
-| BoxIouRotated                | √   | √    |     |     |        |
+| BoxIouRotated                | √   | √    | √   |     |        |
 | BoxIouQuadri                 | √   | √    |     |     |        |
 | CARAFE                       |     | √    | √   |     |        |
 | ChamferDistance              |     | √    |     |     |        |

--- a/docs/zh_cn/understand_mmcv/ops.md
+++ b/docs/zh_cn/understand_mmcv/ops.md
@@ -9,7 +9,7 @@ MMCV 提供了检测、分割等任务中常用的算子
 | BallQuery                    |     | √    | √   |     |        |
 | BBoxOverlaps                 |     | √    | √   | √   | √      |
 | BorderAlign                  |     | √    |     |     |        |
-| BoxIouRotated                | √   | √    |     |     |        |
+| BoxIouRotated                | √   | √    | √   |     |        |
 | BoxIouQuadri                 | √   | √    |     |     |        |
 | CARAFE                       |     | √    | √   |     |        |
 | ChamferDistance              |     | √    |     |     |        |

--- a/mmcv/ops/box_iou_rotated.py
+++ b/mmcv/ops/box_iou_rotated.py
@@ -133,7 +133,10 @@ def box_iou_rotated(bboxes1: torch.Tensor,
    if aligned:
        ious = bboxes1.new_zeros(rows)
    else:
-        ious = bboxes1.new_zeros(rows * cols)
+        if bboxes1.device.type == 'mlu':
+            ious = bboxes1.new_zeros([rows, cols])
+        else:
+            ious = bboxes1.new_zeros(rows * cols)
    if not clockwise:
        flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
        flip_mat[-1] = -1

--- a/mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/iou3d_mlu_kernel.mlu
-/*************************************************************************
- * Copyright (C) 2022 Cambricon.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *************************************************************************/
-#include "common_mlu_helper.hpp"
-#include "iou3d_utils.hpp"
-#define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
-/* NRAM buffer
- * Suppose deal N boxes once time.
----------------------------------------------------------------
-| Basic |score (1N)+       |intersect_pts(48N)|                |
-|       |valid_box(1N)     |+ ordered_pts(48N)| temp_long(72N) |
-|       |+ temp_buffer(10N)|                  |                |
-|--------------------------|------------------|----------------|
-| Reuse |     null         |     null         |rotated_pts(16N)|
-|-------|------------------|------------------|----------------|
---------------------------------------------------------------------------
-| Basic |  dist_ram(24N)   | valid_pts(24N)  |box1(5N)  |box1_buffer(5KB) |
-|       |                  |+ nums_in_ram(1N)|+ box2(5N)|+nram_save(5KB)  |
-|--------------------------|-----------------|----------|-----------------|
-| Reuse |  vec_buffer(5N)  |    null         |   null   |      null       |
-|-------|------------------|-----------------|----------|-----------------|
-Total Basic Memory Size = 239N * sizeof(float) + 10KB
-*/
-__nram__ char nram_buffer[MAX_NRAM_SIZE];
-__mlu_shared__ char sram_buffer[SIZE_SRAM_BUF];
-template <typename T>
-__mlu_func__ void iou3D_detection(int32_t &result_box_num, int32_t *output_data,
-                                  const T *boxes_data, float *scores_data,
-                                  const int core_limit, const int input_box_num,
-                                  const float iou_threshold,
-                                  mluMemcpyDirection_t scores_load_dir,
-                                  mluMemcpyDirection_t scores_store_dir,
-                                  mluMemcpyDirection_t boxes_load_dir) {
-  // NRAM divide by (2+4*COMPUTE_COUNT_ALIGN) copies of NRAM, counted by bytes
-  const int nram_save_limit_count = 256;
-  int box_read_limit_count = 256;
-  float div_thresh_iou = 1.0 / iou_threshold;
-  // every box require 239 * sizeof(float) space in nram;
-  const int32_t copies_of_nram = 239 * sizeof(float);
-  const int32_t limit = (MAX_NRAM_SIZE - 5 * box_read_limit_count * sizeof(T) -
-                         nram_save_limit_count * sizeof(int32_t)) /
-                        copies_of_nram;
-  // x,y,z,dx,dy,dz,angle
-  const T *input_x_ptr = boxes_data;
-  const T *input_y_ptr = input_x_ptr + input_box_num;
-  const T *input_dx_ptr = input_y_ptr + 2 * input_box_num;
-  const T *input_dy_ptr = input_dx_ptr + input_box_num;
-  const T *input_angle_ptr = input_dy_ptr + 2 * input_box_num;
-  float *input_score_ptr = scores_data;
-  // data split
-  int avg_cluster = 0;
-  int rem_cluster = 0;
-  int len_cluster = 0;
-  int cluster_offset = 0;
-  if (clusterDim > 0) {
-    // union
-    avg_cluster = input_box_num / clusterDim;
-    rem_cluster = input_box_num % clusterDim;
-    len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
-    cluster_offset = avg_cluster * clusterId +
-                     (clusterId <= rem_cluster ? clusterId : rem_cluster);
-  } else {
-    // block
-    len_cluster = input_box_num;
-    cluster_offset = 0;
-  }
-  int len_core = input_box_num;
-  int input_offset = 0;
-  if (core_limit > 1) {
-    int avg_core = len_cluster / coreDim;
-    int rem_core = len_cluster % coreDim;
-    len_core = avg_core + (coreId < rem_core ? 1 : 0);
-    int core_offset =
-        avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
-    input_offset = cluster_offset + core_offset;
-  }
-  int32_t max_seg_pad = IOU3D_DOWN(limit, IOU3D_SIZE);
-  int repeat_iou_compute = len_core / max_seg_pad;
-  int remain_iou_compute = len_core % max_seg_pad;
-  // basic consistent memory layout
-  void *score = ((char *)nram_buffer);
-  void *valid_box = ((char *)score) + 1 * max_seg_pad * sizeof(float);
-  void *temp_buffer = ((char *)valid_box) + 1 * max_seg_pad * sizeof(float);
-  void *intersect_pts_x =
-      ((char *)temp_buffer) + 10 * max_seg_pad * sizeof(float);
-  void *intersect_pts_y =
-      ((char *)intersect_pts_x) + 24 * max_seg_pad * sizeof(float);
-  void *ordered_pts_x =
-      ((char *)intersect_pts_y) + 24 * max_seg_pad * sizeof(float);
-  void *ordered_pts_y =
-      ((char *)ordered_pts_x) + 24 * max_seg_pad * sizeof(float);
-  void *temp_long_1 =
-      ((char *)ordered_pts_y) + 24 * max_seg_pad * sizeof(float);
-  void *temp_long_2 = ((char *)temp_long_1) + 24 * max_seg_pad * sizeof(float);
-  void *temp_long_3 = ((char *)temp_long_2) + 24 * max_seg_pad * sizeof(float);
-  void *dist_ram = ((char *)temp_long_3) + 24 * max_seg_pad * sizeof(float);
-  void *valid_pts = ((char *)dist_ram) + 24 * max_seg_pad * sizeof(float);
-  void *nums_in_ram = ((char *)valid_pts) + 24 * max_seg_pad * sizeof(float);
-  T *box1 = (T *)(((char *)nums_in_ram) + 1 * max_seg_pad * sizeof(float));
-  T *box2 = (T *)(((char *)box1) + 5 * max_seg_pad * sizeof(float));
-  void *box1_buffer = ((char *)box2) + 5 * max_seg_pad * sizeof(float);
-  int32_t *nram_save =
-      (int32_t *)(((char *)box1_buffer) + 5 * box_read_limit_count * sizeof(T));
-  // nram_save ~ nram_save_limit_count * sizeof(int32_t)
-  int nram_save_count = 0;
-  // reuse memory
-  void *rotated_pts1_x = ((char *)dist_ram);
-  void *rotated_pts1_y =
-      ((char *)rotated_pts1_x) + 4 * max_seg_pad * sizeof(float);
-  void *rotated_pts2_x =
-      ((char *)rotated_pts1_y) + 4 * max_seg_pad * sizeof(float);
-  void *rotated_pts2_y =
-      ((char *)rotated_pts2_x) + 4 * max_seg_pad * sizeof(float);
-  void *vec_buffer = ((char *)temp_long_1) + 5 * max_seg_pad * sizeof(float);
-  // vec_buffer ~ 16 * max_seg_pad * sizeof(float)
-  // First, initialize ram with all 0, or could cause nan/inf unexcepted results
-  __bang_write_zero((unsigned char *)nram_buffer, copies_of_nram * max_seg_pad);
-  // number 8 and 0xff relay on box_read_limit_count initial as 256
-  const int max_box_seg_id = (input_box_num - 1) >> 8;
-  const int last_rem_box_number = ((input_box_num - 1) & 0xff) + 1;
-  for (int32_t cur_box = 0; cur_box < input_box_num; ++cur_box) {
-    __sync_all();
-    int box_seg_id = cur_box >> 8, box_id = cur_box & 0xff;
-    box_read_limit_count = box_seg_id == max_box_seg_id ? last_rem_box_number
-                                                        : box_read_limit_count;
-    if (box_id == 0) {
-      // x,y,z,dx,dy,dz,angle
-      int offset_num = box_seg_id << 8;
-      // x
-      __memcpy((char *)box1_buffer, input_x_ptr + offset_num,
-               box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
-               box_read_limit_count * 1 * sizeof(T),
-               box_read_limit_count * 1 * sizeof(T), 0);
-      // y
-      __memcpy((char *)box1_buffer + box_read_limit_count * 1 * sizeof(T),
-               input_y_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
-               boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
-               box_read_limit_count * 1 * sizeof(T), 0);
-      // dx
-      __memcpy((char *)box1_buffer + box_read_limit_count * 2 * sizeof(T),
-               input_dx_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
-               boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
-               box_read_limit_count * 1 * sizeof(T), 0);
-      // dy
-      __memcpy((char *)box1_buffer + box_read_limit_count * 3 * sizeof(T),
-               input_dy_ptr + offset_num, box_read_limit_count * 1 * sizeof(T),
-               boxes_load_dir, box_read_limit_count * 1 * sizeof(T),
-               box_read_limit_count * 1 * sizeof(T), 0);
-      // angle
-      __memcpy((char *)box1_buffer + box_read_limit_count * 4 * sizeof(T),
-               input_angle_ptr + offset_num,
-               box_read_limit_count * 1 * sizeof(T), boxes_load_dir,
-               box_read_limit_count * 1 * sizeof(T),
-               box_read_limit_count * 1 * sizeof(T), 0);
-    }
-    if (((float *)input_score_ptr)[cur_box] == 0) {
-      continue;
-    }
-    // save result
-    nram_save[nram_save_count] = cur_box;
-    result_box_num++;
-    nram_save_count++;
-    if (clusterId == 0 && coreId == 0 &&
-        nram_save_count == nram_save_limit_count) {
-      pvLock();
-      __memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
-               NRAM2GDRAM);
-      pvUnlock();
-      output_data += nram_save_count;
-      nram_save_count = 0;
-    }
-    // prepare box1
-    // x
-    __bang_write_value((float *)box1, max_seg_pad,
-                       float(((T *)box1_buffer)[box_id]));
-    // y
-    __bang_write_value(
-        (float *)box1 + max_seg_pad, max_seg_pad,
-        float(((T *)box1_buffer)[box_id + 1 * box_read_limit_count]));
-    // dx
-    __bang_write_value(
-        (float *)box1 + max_seg_pad * 2, max_seg_pad,
-        float(((T *)box1_buffer)[box_id + 2 * box_read_limit_count]));
-    // dy
-    __bang_write_value(
-        (float *)box1 + max_seg_pad * 3, max_seg_pad,
-        float(((T *)box1_buffer)[box_id + 3 * box_read_limit_count]));
-    // angle
-    __bang_write_value(
-        (float *)box1 + max_seg_pad * 4, max_seg_pad,
-        float(((T *)box1_buffer)[box_id + 4 * box_read_limit_count]));
-    float max_area = 1.0f *
-                     ((T *)box1_buffer)[box_id + 2 * box_read_limit_count] *
-                     ((T *)box1_buffer)[box_id + 3 * box_read_limit_count];
-    // update score
-    for (int i = 0; i <= repeat_iou_compute; i++) {
-      if (i == repeat_iou_compute && remain_iou_compute == 0) {
-        break;
-      }
-      int seg_len = max_seg_pad;
-      int cpy_len =
-          (i == repeat_iou_compute) ? remain_iou_compute : max_seg_pad;
-      // int half_offset = std::is_same<T, half>::value ? max_seg_pad * 5 : 0;
-      int half_offset = (sizeof(T) == sizeof(half)) ? max_seg_pad * 5 : 0;
-      // score
-      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
-               cpy_len * sizeof(float), scores_load_dir,
-               cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
-      // x
-      __memcpy(box2 + half_offset, input_x_ptr + input_offset + i * max_seg_pad,
-               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
-               cpy_len * 1 * sizeof(T), 0);
-      // y
-      __memcpy(box2 + half_offset + seg_len * 1,
-               input_y_ptr + input_offset + i * max_seg_pad,
-               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
-               cpy_len * 1 * sizeof(T), 0);
-      // dx
-      __memcpy(box2 + half_offset + seg_len * 2,
-               input_dx_ptr + input_offset + i * max_seg_pad,
-               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
-               cpy_len * 1 * sizeof(T), 0);
-      // dy
-      __memcpy(box2 + half_offset + seg_len * 3,
-               input_dy_ptr + input_offset + i * max_seg_pad,
-               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
-               cpy_len * 1 * sizeof(T), 0);
-      // angle
-      __memcpy(box2 + half_offset + seg_len * 4,
-               input_angle_ptr + input_offset + i * max_seg_pad,
-               cpy_len * 1 * sizeof(T), boxes_load_dir, cpy_len * 1 * sizeof(T),
-               cpy_len * 1 * sizeof(T), 0);
-      // if (std::is_same<T, half>::value) {
-      if (sizeof(T) == sizeof(half)) {
-        __bang_half2float((float *)box2, (half *)(box2 + half_offset),
-                          seg_len * 5);
-      }
-      // Calculate rotated vertices
-      void *temp1_ram = ((char *)temp_buffer);
-      void *temp2_ram = ((char *)temp_buffer) + seg_len * sizeof(float);
-      void *temp3_ram = ((char *)temp_buffer) + 2 * seg_len * sizeof(float);
-      void *temp4_ram = ((char *)temp_buffer) + 3 * seg_len * sizeof(float);
-      getRotatedVertices((float *)rotated_pts1_x, (float *)rotated_pts1_y,
-                         (float *)box1, (float *)temp1_ram, (float *)temp2_ram,
-                         (float *)temp3_ram, (float *)temp4_ram, seg_len);
-      getRotatedVertices((float *)rotated_pts2_x, (float *)rotated_pts2_y,
-                         (float *)box2, (float *)temp1_ram, (float *)temp2_ram,
-                         (float *)temp3_ram, (float *)temp4_ram, seg_len);
-      __bang_write_zero((float *)valid_pts, 24 * seg_len);
-      __bang_write_zero((float *)nums_in_ram, seg_len);
-      __bang_write_value(((float *)valid_box), seg_len, 1.0f);
-      void *vec1_x = ((char *)vec_buffer);
-      void *vec1_y = ((char *)vec1_x) + 4 * seg_len * sizeof(float);
-      void *vec2_x = ((char *)vec1_y) + 4 * seg_len * sizeof(float);
-      void *vec2_y = ((char *)vec2_x) + 4 * seg_len * sizeof(float);
-      void *temp5_ram = ((char *)temp_buffer) + 4 * seg_len * sizeof(float);
-      void *temp6_ram = ((char *)temp_buffer) + 5 * seg_len * sizeof(float);
-      void *temp7_ram = ((char *)temp_buffer) + 6 * seg_len * sizeof(float);
-      void *temp8_ram = ((char *)temp_buffer) + 7 * seg_len * sizeof(float);
-      void *temp9_ram = ((char *)temp_buffer) + 8 * seg_len * sizeof(float);
-      void *temp10_ram = ((char *)temp_buffer) + 9 * seg_len * sizeof(float);
-      // Get all intersection points
-      getIntersectPts(
-          (float *)rotated_pts1_x, (float *)rotated_pts1_y,
-          (float *)rotated_pts2_x, (float *)rotated_pts2_y, (float *)vec1_x,
-          (float *)vec1_y, (float *)vec2_x, (float *)vec2_y,
-          (float *)intersect_pts_x, (float *)intersect_pts_y,
-          (float *)valid_pts, (float *)nums_in_ram, (float *)temp1_ram,
-          (float *)temp2_ram, (float *)temp3_ram, (float *)temp4_ram,
-          (float *)temp5_ram, (float *)temp6_ram, (float *)temp7_ram,
-          (float *)temp8_ram, (float *)temp9_ram, (float *)temp10_ram, seg_len);
-      // Where nums_in <= 2, set valid_box to false
-      __bang_write_value((float *)temp9_ram, COMPUTE_COUNT_ALIGN, (float)2);
-      __bang_cycle_gt((float *)temp1_ram, (float *)nums_in_ram,
-                      (float *)temp9_ram, seg_len, COMPUTE_COUNT_ALIGN);
-      __bang_and((float *)valid_box, (float *)valid_box, (float *)temp1_ram,
-                 seg_len);
-      __bang_cycle_and((float *)valid_pts, (float *)valid_pts,
-                       (float *)valid_box, 24 * seg_len, seg_len);
-      // Convex-hull-graham to order the intersection points in clockwise order
-      // and find the contour area
-      convexHullGraham(
-          (float *)intersect_pts_x, (float *)intersect_pts_y,
-          (float *)ordered_pts_x, (float *)ordered_pts_y, (float *)dist_ram,
-          (float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
-          (float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
-          (float *)temp_long_1, (float *)temp_long_2, (float *)temp_long_3,
-          seg_len, seg_len);
-      // Calculate polygon area
-      // set temp1 = intersection part area
-      polygonArea((float *)ordered_pts_x, (float *)ordered_pts_y,
-                  (float *)valid_box, (float *)valid_pts, (float *)nums_in_ram,
-                  (float *)temp1_ram, (float *)temp2_ram, (float *)temp3_ram,
-                  (float *)temp4_ram, (float *)temp5_ram, (float *)temp6_ram,
-                  (float *)temp7_ram, (float *)temp8_ram, (float *)temp9_ram,
-                  seg_len);
-      // area
-      __bang_mul((float *)temp2_ram, (float *)box2 + seg_len * 2,
-                 (float *)box2 + seg_len * 3, seg_len);
-      // get the area_U: area + max_area - area_I
-      __bang_add_scalar((float *)temp2_ram, (float *)temp2_ram, float(max_area),
-                        seg_len);
-      __bang_sub((float *)temp2_ram, (float *)temp2_ram, (float *)temp1_ram,
-                 seg_len);  // area_U
-      if (iou_threshold > 0.0) {
-        __bang_mul_scalar((float *)temp1_ram, (float *)temp1_ram,
-                          div_thresh_iou, seg_len);
-      } else {
-        __bang_mul_scalar((float *)temp2_ram, (float *)temp2_ram, iou_threshold,
-                          seg_len);
-      }
-      __bang_ge((float *)temp1_ram, (float *)temp2_ram, (float *)temp1_ram,
-                seg_len);
-      __bang_mul((float *)score, (float *)score, (float *)temp1_ram, seg_len);
-      pvLock();
-      __memcpy(input_score_ptr + input_offset + i * max_seg_pad, score,
-               cpy_len * sizeof(float), scores_store_dir,
-               cpy_len * sizeof(float), cpy_len * sizeof(float), 0);
-      pvUnlock();
-    }
-  }
-  if (clusterId == 0 && coreId == 0 && nram_save_count) {
-    pvLock();
-    __memcpy(output_data, nram_save, nram_save_count * sizeof(int32_t),
-             NRAM2GDRAM);
-    pvUnlock();
-  }
-}
-__mlu_global__ void MLUBlockorUnionIKernelOU3D(
-    const void *input_boxes, const int input_box_num, const float iou_threshold,
-    const cnrtDataType_t data_type_input, void *workspace, void *result_num,
-    void *output) {
-  int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
-  mluMemcpyDirection_t scores_load_dir = GDRAM2NRAM;
-  mluMemcpyDirection_t scores_store_dir = NRAM2GDRAM;
-  mluMemcpyDirection_t boxes_load_dir = GDRAM2NRAM;
-  float *scores_data = (float *)workspace;
-  float *boxes_data = (float *)input_boxes;
-  const int cluster_score_size = input_box_num * sizeof(float);
-  const int cluster_boxes_size = input_box_num * 7 * input_dwidth;
-  char *sram_score = (char *)sram_buffer;
-  char *sram_boxes = (char *)sram_buffer + cluster_score_size;
-  if (clusterDim == 1 && SIZE_SRAM_BUF > cluster_score_size) {
-    scores_data = (float *)sram_score;
-    scores_load_dir = SRAM2NRAM;
-    scores_store_dir = NRAM2SRAM;
-    if (coreId == 0x80) {
-      __sramset((void *)sram_buffer, input_box_num, 1.0f);
-    }
-  } else {
-    if (coreId == 0) {
-      __gdramset(scores_data, input_box_num, 1.0f);
-    }
-  }
-  if (clusterDim == 1 &&
-      SIZE_SRAM_BUF - cluster_score_size >= cluster_boxes_size) {
-    boxes_load_dir = SRAM2NRAM;
-    boxes_data = (float *)sram_boxes;
-    if (coreId == 0x80) {
-      __memcpy((char *)boxes_data, (char *)input_boxes, cluster_boxes_size,
-               GDRAM2SRAM);
-    }
-  }
-  __sync_cluster();
-  int32_t result_box_num = 0;
-  int32_t *out_data = (int32_t *)output;
-  switch (data_type_input) {
-    default: { return; }
-    case CNRT_FLOAT16: {
-      iou3D_detection(result_box_num, out_data, (half *)boxes_data, scores_data,
-                      taskDim, input_box_num, iou_threshold, scores_load_dir,
-                      scores_store_dir, boxes_load_dir);
-    }; break;
-    case CNRT_FLOAT32: {
-      iou3D_detection(result_box_num, out_data, boxes_data, scores_data,
-                      taskDim, input_box_num, iou_threshold, scores_load_dir,
-                      scores_store_dir, boxes_load_dir);
-    }; break;
-  }
-  ((int32_t *)result_num)[0] = result_box_num;
-}
-void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-                 const cnrtDataType_t data_type_input, const void *boxes_dram,
-                 const int input_box_num, const float iou_threshold,
-                 void *workspace, void *output_size, void *output) {
-  switch (k_type) {
-    default: { return; }
-    case CNRT_FUNC_TYPE_BLOCK:
-    case CNRT_FUNC_TYPE_UNION1:
-    case CNRT_FUNC_TYPE_UNION2:
-    case CNRT_FUNC_TYPE_UNION4:
-    case CNRT_FUNC_TYPE_UNION8:
-    case CNRT_FUNC_TYPE_UNION16: {
-      MLUBlockorUnionIKernelOU3D<<<k_dim, k_type, queue>>>(
-          (void *)boxes_dram, input_box_num, iou_threshold, data_type_input,
-          workspace, output_size, output);
-    }; break;
-  }
-}
--- a/mmcv/ops/csrc/common/mlu/iou3d_utils.hpp
+++ b/mmcv/ops/csrc/common/mlu/iou3d_utils.hpp
--- a/mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/ms_deform_attn_mlu_kernel.mlu
--- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
--- a/mmcv/ops/csrc/common/mlu/nms_utils.hpp
+++ b/mmcv/ops/csrc/common/mlu/nms_utils.hpp
--- a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
--- a/mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/roiaware_pool3d_mlu_kernel.mlu
--- a/mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/three_nn_mlu_kernel.mlu
--- a/mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/voxelization_mlu_kernel.mlu
--- a/mmcv/ops/csrc/pytorch/mlu/box_iou_rotated.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/box_iou_rotated.cpp
+/*************************************************************************
+ * Copyright (C) 2022 by Cambricon.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#include "mlu_common_helper.h"
+void BoxIouRotatedMLUKernelLauncher(const Tensor boxes1, const Tensor boxes2,
+                                    Tensor ious, const int mode_flag,
+                                    const bool aligned) {  // wraps a single mluOpBoxIouRotated call; mode_flag/aligned are forwarded unchanged
+  // get the current mluOp compute handle (passed to the op call below)
+  auto handle = mluOpGetCurrentHandle();
+  auto boxes1_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
+      boxes1, boxes1.suggest_memory_format());  // made contiguous using the tensor's own suggested memory format
+  auto boxes2_contiguous = torch_mlu::cnnl::ops::cnnl_contiguous(
+      boxes2, boxes2.suggest_memory_format());
+  auto ious_contiguous =
+      torch_mlu::cnnl::ops::cnnl_contiguous(ious, ious.suggest_memory_format());  // NOTE(review): the op is given this tensor, not `ious` - confirm they share storage
+  MluOpTensorDescriptor boxes1_desc, boxes2_desc, ious_desc;
+  boxes1_desc.set(boxes1_contiguous);
+  boxes2_desc.set(boxes2_contiguous);
+  ious_desc.set(ious_contiguous);  // all three descriptors are built from the contiguous tensors
+  auto boxes1_impl = torch_mlu::getMluTensorImpl(boxes1_contiguous);
+  auto boxes2_impl = torch_mlu::getMluTensorImpl(boxes2_contiguous);
+  auto ious_impl = torch_mlu::getMluTensorImpl(ious_contiguous);
+  auto boxes1_ptr = boxes1_impl->cnnlMalloc();
+  auto boxes2_ptr = boxes2_impl->cnnlMalloc();
+  auto ious_ptr = ious_impl->cnnlMalloc();  // raw pointers handed to the op (presumably device memory - TODO confirm)
+  CNLOG(INFO) << "Call mluOpBoxIouRotated().";
+  mluOpBoxIouRotated(handle, mode_flag, aligned, boxes1_desc.desc(), boxes1_ptr,
+                     boxes2_desc.desc(), boxes2_ptr, ious_desc.desc(),
+                     ious_ptr);  // NOTE(review): whatever this call returns is discarded here
+}
+void box_iou_rotated_mlu(const Tensor boxes1, const Tensor boxes2, Tensor ious,
+                         const int mode_flag, const bool aligned) {
+  BoxIouRotatedMLUKernelLauncher(boxes1, boxes2, ious, mode_flag, aligned);  // thin adapter: forwards every argument unchanged
+}
+void box_iou_rotated_impl(const Tensor boxes1, const Tensor boxes2, Tensor ious,
+                          const int mode_flag, const bool aligned);  // declaration only; no definition appears in this file
+REGISTER_DEVICE_IMPL(box_iou_rotated_impl, MLU, box_iou_rotated_mlu);  // presumably binds box_iou_rotated_mlu as the MLU backend of box_iou_rotated_impl - confirm against the macro
--- a/mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/iou3d_mlu.cpp
@@ -10,114 +10,30 @@
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
-#include "pytorch_device_registry.hpp"
+#include "mlu_common_helper.h"
-#include "pytorch_mlu_helper.hpp"
-void KernelIou3d(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
-                 const cnrtDataType_t data_type_input, const void *boxes_dram,
-                 const int input_box_num, const float iou_threshold,
-                 void *workspace, void *output_size, void *output);
-int selectType(uint32_t use_job, int box_num_per_core) {
-  // the box_num_per_core should be at least 256, otherwise the real IO
-  // bandwidth would be very low
-  while (box_num_per_core < 256 && use_job >= 4) {
-    box_num_per_core *= 2;
-    use_job /= 2;
-  }
-  return use_job;
-}
-static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type,
-                               int &core_num_per_class,
-                               const int input_box_num) {
-  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
-  uint32_t job_limit = getJobLimitCapability();
-  uint32_t core_number = job_limit;
-  int box_num_per_core = (input_box_num + core_number - 1) / core_number;
-  int use_job = selectType(job_limit, box_num_per_core);
-  // initiate k_type as Union1
-  k_dim->x = core_dim;
-  k_dim->y = 1;
-  k_dim->z = 1;
-  *k_type = CNRT_FUNC_TYPE_UNION1;
-  switch (job_limit) {
-    case CN_KERNEL_CLASS_BLOCK:
-    case CN_KERNEL_CLASS_UNION:
-    case CN_KERNEL_CLASS_UNION2:
-    case CN_KERNEL_CLASS_UNION4:
-    case CN_KERNEL_CLASS_UNION8:
-    case CN_KERNEL_CLASS_UNION16: {
-      if (use_job < 4) {
-        k_dim->x = 1;
-        *k_type = CNRT_FUNC_TYPE_BLOCK;
-      } else if (use_job == 4) {
-        k_dim->x = core_dim;
-        *k_type = CNRT_FUNC_TYPE_UNION1;
-      } else {
-        k_dim->x = use_job;
-        *k_type = (cnrtFunctionType_t)use_job;
-      }
-    }; break;
-    default:
-      LOG(WARNING) << "[cnnlNms_v2]: got unsupported job limit number."
-                   << " Use default CN_KERNEL_CLASS_UNION1 with UNION1 task.";
-  }
-  return CNNL_STATUS_SUCCESS;
-}
 void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
                                 float iou_threshold) {
-  // dimension parameters check
-  TORCH_CHECK(boxes.dim() == 2, "boxes should be a 2d tensor, got ",
-              boxes.dim(), "D");
-  TORCH_CHECK(boxes.size(1) == 7,
-              "boxes should have 7 elements in dimension 1, got ",
-              boxes.size(1));
-  // data type check
-  TORCH_CHECK(
-      boxes.scalar_type() == at::kFloat || boxes.scalar_type() == at::kHalf,
-      "data type of boxes should be Float or Half, got ", boxes.scalar_type());
  if (boxes.numel() == 0) {
    return;
  }
-  const size_t max_input_num = 2147483648;  // 2^31, 2G num
-  TORCH_CHECK(boxes.numel() < max_input_num,
-              "boxes.numel() should be less than 2147483648, got ",
-              boxes.numel());
-  int input_box_num = boxes.size(0);
-  cnrtDataType_t data_type_input = torch_mlu::toCnrtDtype(boxes.dtype());
-  cnrtDim3_t k_dim;
-  cnrtJobType_t k_type;
-  int core_num_per_class;
-  policyFunc(&k_dim, &k_type, core_num_per_class, input_box_num);
-  // transpose boxes (n, 7) to (7, n) for better performance
+  int input_box_num = boxes.size(0);
-  auto boxes_t = boxes.transpose(0, 1);
+  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes);
-  auto boxes_ = torch_mlu::cnnl::ops::cnnl_contiguous(boxes_t);
+  auto output = keep.to(boxes.options().dtype(at::kInt));
-  auto output = at::empty({input_box_num}, boxes.options().dtype(at::kLong));
  auto output_size = at::empty({1}, boxes.options().dtype(at::kInt));
-  // workspace
+  MluOpTensorDescriptor boxes_desc, output_desc;
-  const int info_num = 7;  // x, y,z, dx, dy, dz,angle
+  boxes_desc.set(boxes_);
-  size_t space_size = 0;
+  output_desc.set(output);
-  if (boxes.scalar_type() == at::kHalf) {
-    space_size = input_box_num * sizeof(int16_t) * info_num +
-                 input_box_num * sizeof(float) + sizeof(float);
-  } else {
-    space_size = input_box_num * sizeof(float) * (info_num + 1) + sizeof(float);
-  }
-  auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte));
+  // workspace
+  size_t workspace_size = 0;
+  auto handle = mluOpGetCurrentHandle();
+  mluOpGetNmsWorkspaceSize(handle, boxes_desc.desc(), NULL, &workspace_size);
+  auto workspace = at::empty(workspace_size, boxes.options().dtype(at::kByte));
  // get compute queue
-  auto queue = torch_mlu::getCurQueue();
  auto boxes_impl = torch_mlu::getMluTensorImpl(boxes_);
  auto boxes_ptr = boxes_impl->cnnlMalloc();
  auto workspace_impl = torch_mlu::getMluTensorImpl(workspace);
@@ -127,11 +43,29 @@ void IoU3DNMS3DMLUKernelLauncher(Tensor boxes, Tensor &keep, Tensor &keep_num,
  auto output_size_impl = torch_mlu::getMluTensorImpl(keep_num);
  auto output_size_ptr = output_size_impl->cnnlMalloc();
-  uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster);
+  // nms desc
-  CNLOG(INFO) << "Launch Kernel KernelIou3d<<<Union" << k_type / core_dim
+  mluOpNmsDescriptor_t nms_desc;
-              << ", " << k_dim.x << ", " << k_dim.y << ", " << k_dim.z << ">>>";
+  const mluOpNmsBoxPointMode_t box_mode = (mluOpNmsBoxPointMode_t)0;
-  KernelIou3d(k_dim, k_type, queue, data_type_input, boxes_ptr, input_box_num,
+  const mluOpNmsOutputMode_t output_mode = (mluOpNmsOutputMode_t)0;
-              iou_threshold, workspace_ptr, output_size_ptr, output_ptr);
+  const mluOpNmsAlgo_t algo = (mluOpNmsAlgo_t)0;
+  const mluOpNmsMethodMode_t method_mode = (mluOpNmsMethodMode_t)0;
+  const float soft_nms_sigma = 0.0;
+  const float confidence_threshold = 0.0;
+  const int input_layout = 0;
+  const bool pad_to_max_output_size = false;
+  const int max_output_size = input_box_num;
+  const float offset = 0.0;
+  mluOpCreateNmsDescriptor(&nms_desc);
+  mluOpSetNmsDescriptor(nms_desc, box_mode, output_mode, algo, method_mode,
+                        iou_threshold, soft_nms_sigma, max_output_size,
+                        confidence_threshold, offset, input_layout,
+                        pad_to_max_output_size);
+  mluOpNms(handle, nms_desc, boxes_desc.desc(), boxes_ptr, NULL, NULL,
+           workspace_ptr, workspace_size, output_desc.desc(), output_ptr,
+           output_size_ptr);
+  mluOpDestroyNmsDescriptor(nms_desc);
 }
 void iou3d_nms3d_forward_mlu(const Tensor boxes, Tensor &keep, Tensor &keep_num,

--- a/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
+++ b/mmcv/ops/csrc/pytorch/mlu/mlu_common_helper.h
@@ -18,8 +18,8 @@
 #include "pytorch_device_registry.hpp"
 #define MLUOP_MAJOR 0
-#define MLUOP_MINOR 5
+#define MLUOP_MINOR 6
-#define MLUOP_PATCHLEVEL 302
+#define MLUOP_PATCHLEVEL 0
 mluOpDataType_t getMluOpDataType(const caffe2::TypeMeta& data_type);
 mluOpTensorLayout_t getMluOpSuggestLayout(const at::Tensor& input);

--- a/mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/ms_deform_attn_mlu.cpp
--- a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp
--- a/mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/roi_align_mlu.cpp
--- a/mmcv/ops/csrc/pytorch/mlu/roiaware_pool3d_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/roiaware_pool3d_mlu.cpp
--- a/mmcv/ops/csrc/pytorch/mlu/three_nn_mlu.cpp
+++ b/mmcv/ops/csrc/pytorch/mlu/three_nn_mlu.cpp