[Refactor] Adapt mlu code to cntoolkit3.0.1

e847cf8a · bdf · Zaida Zhou · 4c6e99c8 · e847cf8a · e847cf8a
Commit e847cf8a authored Oct 10, 2022 by bdf Committed by Zaida Zhou Nov 23, 2022
9 changed files
--- a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(

      // right - left + offset ---> left
      __bang_sub(vec_left, vec_right, vec_left, batches_stride);
-      __bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
+      __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);

      // bottom - top + offset ---> right
      __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
-      __bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
+      __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);

      // zero vector ---> bottom
-      __nramset(vec_bottom, batches_stride, 0.f);
+      __bang_write_value(vec_bottom, batches_stride, 0.f);

      // width --> vec_left
      __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
      // get the b1_area
      // (b1_x2 - b1_x1 + offset)  --->  vec_top
      __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
-      __bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
+      __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);

      // (b1_y2 - b1_y1 + offset)  --->  vec_bottom
      __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
-      __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
+      __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);

      // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
      // --->  vec_top;
@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
      // get the b2_area
      // (b2_x2 - b2_x1 + offset)  --->  b2_x1
      __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
-      __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
+      __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);

      // (b2_y2 - b2_y1 + offset)  --->  b2_y1
      __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
-      __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
+      __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);

      // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
      // --->  b2_x1;
@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
      T *inter_s = height;

      // offset vector ---> vec_b2_y1
-      __nramset(vec_b2_y1, batches_stride, T(offset));
+      __bang_write_value(vec_b2_y1, batches_stride, T(offset));
      T *vec_offset = vec_b2_y1;

      if (mode == 0) {
@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
      int32_t base1 = b1 * COORD_NUM;

      // set bbox1 and bbox2 to nram
-      __nramset(vec_b1_x1, batches_stride, bbox1[base1]);
-      __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
-      __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
-      __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
+      __bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
+      __bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
+      __bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
+      __bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);

      for (int32_t j = 0; j < num_loop_cpy; j++) {
        int32_t index2 = j * batches_stride;
@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(

        // right - left + offset ---> left
        __bang_sub(vec_left, vec_right, vec_left, batches_stride);
-        __bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
+        __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
        // bottom - top + offset ---> right
        __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
-        __bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
+        __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);

        // zero vector ---> bottom
-        __nramset(vec_bottom, batches_stride, (T)0);
+        __bang_write_value(vec_bottom, batches_stride, (T)0);

        // width --> vec_left
        __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
        // get the b1_area
        // (b1_x2 - b1_x1 + offset)  --->  vec_top
        __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
-        __bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
+        __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
        // (b1_y2 - b1_y1 + offset)  --->  vec_bottom
        __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
-        __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
+        __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
        // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
        // --->  vec_top;
        __bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
        // get the b2_area
        // (b2_x2 - b2_x1 + offset)  --->  b2_x1
        __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
-        __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
+        __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
        // (b2_y2 - b2_y1 + offset)  --->  b2_y1
        __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
-        __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
+        __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
        // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
        // --->  b2_x1;
        __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
        T *inter_s = height;

        // offset vector ---> vec_b2_y1
-        __nramset(vec_b2_y1, batches_stride, T(offset));
+        __bang_write_value(vec_b2_y1, batches_stride, T(offset));
        T *vec_offset = vec_b2_y1;

        if (mode == 0) {

--- a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
    blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;

    // set output_nram to zero
-    __nramset(output_nram, param.output_nram_size, T(0));
+    __bang_write_value(output_nram, param.output_nram_size, T(0));

    // loop blocks of kernel window: grid_dim.(Kh, Kw)
    for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
                T *sum = sum_array;

                for (int g = 0; g < blkSize.G; ++g) {
-                  __bang_mul_const(sum, src, mask_array[mask_index],
-                                   param.block_Cg_NFU);
+                  __bang_mul_scalar(sum, src, mask_array[mask_index],
+                                    param.block_Cg_NFU);
                  //
                  // NOTE: Since block_Cg_NFU >= block_Cg_stride,
                  // overlapped writing may occur on sum_array.
@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
          T *base_grad_input = (T *)grad_input + input_index;
          __memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
                   GDRAM2NRAM);
-          __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
-                           ((T *)mask_buff)[mask_index], num_align);
+          __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
+                            ((T *)mask_buff)[mask_index], num_align);
          __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
                            (T *)grad_input_buff, num_align);
          __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
          T *base_grad_input = (T *)grad_input + input_index;
          __memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
                   GDRAM2NRAM);
-          __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
-                           ((T *)mask_buff)[mask_index], rem_for_loop_align);
+          __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
+                            ((T *)mask_buff)[mask_index], rem_for_loop_align);
          __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
                            (T *)grad_input_buff, rem_for_loop);
          __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          const int wi, const int c, const int k_up,
                          const int group, const int scale) {
  if (dtype == CNRT_FLOAT16) {
-    backward::MLUUnion1KernelCarafeBackward<half>
-        <<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
-                                   grad_mask, n, hi, wi, c, k_up, group, scale);
+    backward::MLUUnion1KernelCarafeBackward<half><<<k_dim, k_type, queue>>>(
+        input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
+        group, scale);
  } else {
-    backward::MLUUnion1KernelCarafeBackward<float>
-        <<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
-                                   grad_mask, n, hi, wi, c, k_up, group, scale);
+    backward::MLUUnion1KernelCarafeBackward<float><<<k_dim, k_type, queue>>>(
+        input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
+        group, scale);
  }
 }
--- a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
+++ b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
  // get sign bit
  const float move_23bit = 8388608.0;
  // 0x80000000 = 1,000000000,0000000000000000000000000000
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  // get 1 or 0 from sign bit
  // judg is Odd
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x00000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x00000001);
  __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
                   (char *)src_addition, src_count * sizeof(float),
                   NFU_ALIGN_SIZE);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000001);
  __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  // minus xor, positive num invariant
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xffffffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xffffffff);
  __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
  // convert int32 to float32
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x7fffff);
  __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x4b000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x4b000000);
  __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
                   src_count * sizeof(float), NFU_ALIGN_SIZE);
-  __bang_sub_const(dst, dst, move_23bit, src_count);
+  __bang_sub_scalar(dst, dst, move_23bit, src_count);
  // add one
  __bang_add(dst, dst, dst_addition, src_count);
  // set sign for float32
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xffffffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xffffffff);
  __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));

-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x00000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x00000001);
  __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));

-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * 4, 128);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
  // dst_addition = abs(src)
  __bang_mul(dst_addition, src, (float *)dst, src_count);
  // if dst_addition < 1.0 , then src_addition + 1, to fix add error.
-  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f);
+  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     1.0f);
  __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xbf800000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xbf800000);
  // set negative flag -1.0 = 0xbf80000
  __bang_cycle_eq(
      (float *)dst, (float *)dst, (float *)src_addition, src_count,
      NFU_ALIGN_SIZE / sizeof(float));  //  to mark all src in [x<-1.0]
  __bang_active_abs(dst_addition, src, src_count);
-  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f);
+  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     8388608.0f);
  // mask shift move 23
  __bang_cycle_add_tz(
      dst_addition, dst_addition, src_addition, src_count,
@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
  // to fix max value
  // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0xcb7fffff <=> 16777215.0,
  // means max value.
-  __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count);
+  __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
  __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
              src_count * floatDchar);
  // get low 23bit
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            (unsigned)0x007fffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     (unsigned)0x007fffff);
  // mask low 23bit is 1
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * floatDchar,
@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
  // set 9 high bit ===> dst
  // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
  //  1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
-  __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
+  __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
  __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  // src or dst_addition
  __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
-  __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count);
+  __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
 #endif  // __BANG_ARCH__ >= 300
 }

+/*!
+ * @brief Converts float32 to half data type. The rounding mode is rd when
+ * __BANG_ARCH__ < 300 (MLU200) and rn when __BANG_ARCH__ >= 300 (MLU300).
+ *
+ * @param[out] dst
+ *   Pointer to NRAM that stores half type data.
+ * @param[in] src
+ *   Pointer to NRAM that stores float32 type data.
+ * @param[in] src_count
+ *   The count of elements in src.
+ */
+__mlu_func__ inline void convertFloat2half(half *dst, float *src,
+                                           int src_count) {
+#if __BANG_ARCH__ >= 300
+  __bang_float2half_rn(dst, src, src_count);
+#else
+  __bang_float2half_rd(dst, src, src_count);
+#endif  // __BANG_ARCH__ >= 300
+}
+
 #endif  // COMMON_MLU_HELPER_HPP_
--- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
@@ -9,14 +9,9 @@
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *************************************************************************/
-#include "common_mlu_helper.hpp"
+#include "nms_utils.hpp"

-#define NMS_SIZE (64)
 #define COORD_DIM (4)
-#define MEMORY_CORE (0x80)
-#define INFO_NUM (5)  // 5 means x1, x2, y1, y2 and score
-#define REDUCE_NUM \
-  (7)  // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)

 #define SIZE_NRAM_BUF (MAX_NRAM_SIZE + REM_FOR_STACK - 62 * 1024)
 #define SIZE_SRAM_BUF (MAX_SRAM_SIZE)
@@ -24,348 +19,129 @@
 __nram__ int8_t nram_buffer[SIZE_NRAM_BUF];
 __mlu_shared__ int8_t sram_buffer[SIZE_SRAM_BUF];

-__mlu_func__ void pvLock() {
-#if __BANG_ARCH__ == 270
-  if (coreId != MEMORY_CORE) {
-    __bang_lock(0, 0);
-  }
-#endif
-}
-
-__mlu_func__ void pvUnlock() {
-#if __BANG_ARCH__ == 270
-  if (coreId != MEMORY_CORE) {
-    __bang_unlock(0, 0);
-  }
-#endif
-}
-
 enum Addr { SRAM, GDRAM };

 template <typename IN_DT, typename OUT_DT>
 __mlu_func__ void nms_detection(
-    uint32_t *output_box_num, const int output_mode, const int input_layout,
-    OUT_DT *output_data, const Addr dst, IN_DT *input_data_score,
-    const IN_DT *input_data_box, const Addr src, IN_DT *buffer,
-    const int buffer_size, IN_DT *sram, const int core_limit,
-    const int input_box_num, const int input_stride, const int output_stride,
-    const int keepNum, const float thresh_iou, const float thresh_score,
+    uint32_t &output_box_num, const int output_mode, OUT_DT *output_dram,
+    IN_DT *input_data_score, const IN_DT *input_data_box, const Addr input_ram,
+    IN_DT *sram, const int core_limit, const int input_num_boxes,
+    const int max_output_size, const float thresh_iou, const float thresh_score,
    const float offset, const int algo) {
-  // global value, it is stored in sram with a offset from the begin.
-  const int flag_offset_size = 28;
-  int32_t *loop_end_flag = (int32_t *)(sram + flag_offset_size);
-  loop_end_flag[0] = 0;
+  // global value
+  int32_t *exit_flag = (int32_t *)(sram + 28);
+  exit_flag[0] = 0;
  // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
-  const int nms_buffer_count1 = 9;
+  int nms_buffer_count1 = 9;
  // temp nram buffer to store selected target.
-  const int nram_save_limit_count = 256;
+  int nram_save_limit_count = 256;
  float div_thresh_iou = 1.0 / thresh_iou;

  // input data ptr
-  IN_DT *input_score_ptr;
-  const IN_DT *input_x1_ptr;
-  const IN_DT *input_y1_ptr;
-  const IN_DT *input_x2_ptr;
-  const IN_DT *input_y2_ptr;
-  input_score_ptr = input_data_score;
-  input_x1_ptr = input_data_box;
-  if (input_layout == 0) {
-    // [boxes_num, 4]
-    input_y1_ptr = input_x1_ptr + 1;
-    input_x2_ptr = input_x1_ptr + 2;
-    input_y2_ptr = input_x1_ptr + 3;
-  } else if (input_layout == 1) {
-    // [4, boxes_num]
-    input_y1_ptr = input_x1_ptr + input_stride;
-    input_x2_ptr = input_y1_ptr + input_stride;
-    input_y2_ptr = input_x2_ptr + input_stride;
-  }
-
-  // nram data ptr
-  IN_DT *x1;
-  IN_DT *y1;
-  IN_DT *x2;
-  IN_DT *y2;
-  IN_DT *score;
-  IN_DT *inter_x1;
-  IN_DT *inter_y1;
-  IN_DT *inter_x2;
-  IN_DT *inter_y2;
-  IN_DT *max_box;  // the max score, x1, y1, x2, y2
-  IN_DT *x1_mask;
-  IN_DT *y1_mask;
-  IN_DT *x2_mask;
-  IN_DT *y2_mask;
-  OUT_DT *nram_save;
+  const IN_DT *input_x1_ptr = input_data_box;
+  const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
+  const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
+  const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;

  int limit = 0;        // find limit when GDRAM or SRAM
-  int len_core = 0;     // the length deal by every core
  int max_seg_pad = 0;  // the max length every repeat
  int repeat = 0;
  int remain = 0;
  int remain_pad = 0;
  int input_offset = 0;  // offset of input_data for current core
  int nram_save_count = 0;
-  // mask for collect x1, y1, x2, y2. each mask has 128 elements
-  const int mask_size = 128;
-  const int total_mask_size = 512;

  if (output_mode == 0) {
-    limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
-             nram_save_limit_count * sizeof(OUT_DT) -
-             total_mask_size * sizeof(IN_DT)) /
+    limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
+             nram_save_limit_count * sizeof(OUT_DT)) /
            (nms_buffer_count1 * sizeof(IN_DT));
  } else {
-    limit = (buffer_size - 128 /*for max_box*/ * sizeof(IN_DT) -
-             nram_save_limit_count * INFO_NUM * sizeof(OUT_DT) -
-             total_mask_size * sizeof(IN_DT)) /
+    // 5 means: score, x1, y1, x2, y2
+    limit = (SIZE_NRAM_BUF - NFU_ALIGN_SIZE /*for max_box*/ * sizeof(IN_DT) -
+             nram_save_limit_count * 5 * sizeof(OUT_DT)) /
            (nms_buffer_count1 * sizeof(IN_DT));
  }

-  if (core_limit == 1) {
-    len_core = input_box_num;
-    input_offset = 0;
-  } else {
-    int avg_core = input_box_num / core_limit;
-    int rem = input_box_num % core_limit;
-    len_core = avg_core + (taskId < rem ? 1 : 0);
-    input_offset = avg_core * taskId + (taskId <= rem ? taskId : rem);
-  }
-  max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
-  repeat = len_core / max_seg_pad;
-  remain = len_core % max_seg_pad;
-  remain_pad = PAD_UP(remain, NMS_SIZE);
+  int max_seg_iou_compute = 0;
+  int repeat_iou_compute = 0;
+  int remain_iou_compute = 0;
+  int remain_pad_iou_compute = 0;

-  // if datatype is half, we should convert it to float when compute the IoU
-  int max_seg_iou_compute =
-      PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
-  int repeat_iou_compute = len_core / max_seg_iou_compute;
-  int remain_iou_compute = len_core % max_seg_iou_compute;
-  int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
-  // initial the address point
-  score = buffer;
-  x1 = score + max_seg_pad;
-  y1 = x1 + max_seg_pad;
-  x2 = y1 + max_seg_pad;
-  y2 = x2 + max_seg_pad;
-  inter_x1 = y2 + max_seg_pad;
-  inter_y1 = inter_x1 + max_seg_pad;
-  inter_x2 = inter_y1 + max_seg_pad;
-  inter_y2 = inter_x2 + max_seg_pad;
-  x1_mask = inter_y2 + max_seg_pad;
-  y1_mask = x1_mask + mask_size;
-  x2_mask = y1_mask + mask_size;
-  y2_mask = x2_mask + mask_size;
-  max_box = y2_mask + mask_size;  // the max score, x1, y1, x2, y2
-  // offset two line from max_box
-  nram_save = (OUT_DT *)((char *)max_box + NFU_ALIGN_SIZE);
+  getComputeParamsBlockOrU1(sizeof(IN_DT), input_num_boxes, limit, core_limit,
+                            input_offset, max_seg_pad, repeat, remain,
+                            remain_pad, max_seg_iou_compute, repeat_iou_compute,
+                            remain_iou_compute, remain_pad_iou_compute);

-  // set mask for __bang_collect instruction
-  if (input_layout == 0) {
-    __nramset((IN_DT *)x1_mask, total_mask_size, (IN_DT)0);
-    for (int idx = 0; idx < mask_size; idx++) {
-      int index = (idx % COORD_DIM) * mask_size + idx;
-      x1_mask[index] = (IN_DT)1.0;
-    }
-  }
+  // init the data ptr
+  IN_DT *score = (IN_DT *)nram_buffer;
+  IN_DT *x1 = score + max_seg_pad;
+  IN_DT *y1 = x1 + max_seg_pad;
+  IN_DT *x2 = y1 + max_seg_pad;
+  IN_DT *y2 = x2 + max_seg_pad;
+  IN_DT *inter_x1 = y2 + max_seg_pad;
+  IN_DT *inter_y1 = inter_x1 + max_seg_pad;
+  IN_DT *inter_x2 = inter_y1 + max_seg_pad;
+  IN_DT *inter_y2 = inter_x2 + max_seg_pad;
+  IN_DT *max_box = inter_y2 + max_seg_pad;  // the max score, x1, y1, x2, y2
+  OUT_DT *nram_save =
+      (OUT_DT *)((char *)max_box +
+                 NFU_ALIGN_SIZE);  // offset two line from max_box

-  for (int keep = 0; keep < keepNum; keep++) {  // loop until the max_score <= 0
+#if __BANG_ARCH__ >= 300
+  float max_box_x1 = 0;
+  float max_box_y1 = 0;
+  float max_box_x2 = 0;
+  float max_box_y2 = 0;
+#endif
+  mluMemcpyDirection_t load_dir = SRAM2NRAM;
+  mluMemcpyDirection_t store_dir = NRAM2SRAM;
+  load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
+  store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
+
+  for (int keep = 0; keep < max_output_size;
+       keep++) {  // loop until the max_score <= 0
    if (core_limit != 1) {
      __sync_cluster();  // sync before current loop
    }

-    /******find max start******/
+    /******FIND MAX START******/
    int max_index = 0;         // the max score index
    int global_max_index = 0;  // for U1
-    float max_area = 0;        // the max score area
+    float max_area = 0;        // the max score area
    max_box[0] = 0;            // init 0
-
-    for (int i = 0; i <= repeat; i++) {
-      if (i == repeat && remain == 0) {
-        break;
-      }
-      int seg_len = 0;  // the length every nms compute
-      int cpy_len = 0;  // the length every nms memcpy
-      i == repeat ? seg_len = remain_pad : seg_len = max_seg_pad;
-      // check seg_len exceeds the limit of fp16 or not. 65536 is the largest
-      // num that half data type could express.
-      if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
-        // seg length exceeds the max num for fp16 datatype!
-        return;
-      }
-      i == repeat ? cpy_len = remain : cpy_len = max_seg_pad;
-      /******nms load start******/
-      mluMemcpyDirection_t load_dir = SRAM2NRAM;
-      if (src == SRAM) {
-        load_dir = SRAM2NRAM;
-      } else {
-        load_dir = GDRAM2NRAM;
-      }
-      __nramset(score, seg_len, (IN_DT)0);
-      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
-               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
-               cpy_len * sizeof(IN_DT), 0);
-
-      /******nms load end******/
-
-      __bang_max(inter_x1, score, seg_len);
-      if (inter_x1[0] > max_box[0]) {
-        max_box[0] = inter_x1[0];
-
-        if (sizeof(IN_DT) == sizeof(half)) {
-          max_index = ((uint16_t *)inter_x1)[1] + input_offset +
-                      i * max_seg_pad;  // offset start from head of input_data
-        } else if (sizeof(IN_DT) == sizeof(float)) {
-          max_index = ((uint32_t *)inter_x1)[1] + input_offset +
-                      i * max_seg_pad;  // offset start from head of input_data
-        }
-      }
-    }  // for repeat
-
-    int stride = 1;
-    if (input_layout == 0) {
-      stride = input_stride;
-    } else if (input_layout == 1) {
-      stride = 1;
-    }
+    findCoreMaxBox(input_data_score, score, inter_x1, max_box, input_x1_ptr,
+                   input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
+                   input_offset, repeat, remain, remain_pad, max_seg_pad,
+                   max_index);

    if (core_limit == 1) {
-      max_box[1] = input_x1_ptr[max_index * stride];
-      max_box[2] = input_y1_ptr[max_index * stride];
-      max_box[3] = input_x2_ptr[max_index * stride];
-      max_box[4] = input_y2_ptr[max_index * stride];
-      if (algo == 0 || offset == 0.0) {
-        max_area = ((float)max_box[3] - (float)max_box[1]) *
-                   ((float)max_box[4] - (float)max_box[2]);
-      } else {
-        max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
-                   ((float)max_box[4] - (float)max_box[2] + offset);
-      }
-      input_score_ptr[max_index] = 0;
+#if __BANG_ARCH__ >= 300
+      calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
+                 max_box_x2, max_box_y2);
+#else
+      calMaxArea(max_box, algo, offset, max_area);
+#endif
+      input_data_score[max_index] = 0;
      global_max_index = max_index;
-      ((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
    } else if (core_limit == 4) {
-      // find the max with sram
-      // the max box's x1, y1, x2, y2 on every core
-      if (coreId != MEMORY_CORE) {
-        max_box[1] = input_x1_ptr[max_index * stride];
-        max_box[2] = input_y1_ptr[max_index * stride];
-        max_box[3] = input_x2_ptr[max_index * stride];
-        max_box[4] = input_y2_ptr[max_index * stride];
-      }
-      ((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
-      // copy every core's box info to sram, form: score---x1---y1---x2---y2---
-      for (int i = 0; i < INFO_NUM; i++) {
-        __memcpy(sram + i * core_limit + taskId, max_box + i, 1 * sizeof(IN_DT),
-                 NRAM2SRAM);
-      }
-      // copy every core's max_index to sram, use 2 half to store max_index
-      __memcpy(sram + INFO_NUM * core_limit + taskId * 2, max_box + INFO_NUM,
-               sizeof(uint32_t),
-               NRAM2SRAM);  // int32_t datatype
      __sync_cluster();
+      findClusterMaxBox(sram, max_box, inter_x1, input_data_score, core_limit);

-      // copy score from sram to nram and find the max
-      __nramset(inter_x1, NMS_SIZE, (IN_DT)0);
-      __memcpy(inter_x1, sram, core_limit * sizeof(IN_DT), SRAM2NRAM);
-      __bang_max(max_box, inter_x1, NMS_SIZE);
-      int max_core = 0;
-      if (sizeof(IN_DT) == sizeof(half)) {
-        max_core = ((uint16_t *)max_box)[1];
-      } else if (sizeof(IN_DT) == sizeof(float)) {
-        max_core = ((uint32_t *)max_box)[1];
-      }
-
-      // copy the max box from SRAM to NRAM
-      __memcpy(max_box + 1, sram + 1 * core_limit + max_core, 1 * sizeof(IN_DT),
-               SRAM2NRAM);  // x1
-      __memcpy(max_box + 2, sram + 2 * core_limit + max_core, 1 * sizeof(IN_DT),
-               SRAM2NRAM);  // y1
-      __memcpy(max_box + 3, sram + 3 * core_limit + max_core, 1 * sizeof(IN_DT),
-               SRAM2NRAM);  // x2
-      __memcpy(max_box + 4, sram + 4 * core_limit + max_core, 1 * sizeof(IN_DT),
-               SRAM2NRAM);  // y2
-      __memcpy(max_box + 5, sram + 5 * core_limit + 2 * max_core,
-               sizeof(uint32_t), SRAM2NRAM);
-      if (algo == 0 || offset == 0.0) {
-        max_area = ((float)max_box[3] - (float)max_box[1]) *
-                   ((float)max_box[4] - (float)max_box[2]);
-      } else {
-        max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
-                   ((float)max_box[4] - (float)max_box[2] + offset);
-      }
-      global_max_index = ((uint32_t *)(max_box + INFO_NUM))[0];
-      input_score_ptr[global_max_index] = 0;
+#if __BANG_ARCH__ >= 300
+      calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
+                 max_box_x2, max_box_y2);
+#else
+      calMaxArea(max_box, algo, offset, max_area);
+#endif
+      global_max_index = ((uint32_t *)(max_box + 5))[0];
+      input_data_score[global_max_index] = 0;
    }
    // by now, we get: max_score|max_index|max_box|max_area
-    /******find max end******/
-
-    /******nms store start******/
-    // store to nram
-    if (float(max_box[0]) > thresh_score) {
-      OUT_DT *save_ptr;
-      int save_offset = 0;
-      int save_str_num = 0;
-      save_ptr = nram_save;
-      save_offset = nram_save_count;
-      save_str_num = nram_save_limit_count;
-      if (coreId == 0) {
-        if (output_mode == 0) {  // index1, index2, ...
-          __memcpy(save_ptr + save_offset, (uint32_t *)(max_box + INFO_NUM),
-                   1 * sizeof(uint32_t), NRAM2NRAM, 1 * sizeof(uint32_t),
-                   1 * sizeof(uint32_t), 0);
-        } else if (output_mode == 1) {  // score, x1, y1, x2, y2
-          __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
-                   INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
-                   INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
-        } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
-          __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
-                   NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
-                   4);
-        }
-      }
-      nram_save_count++;
-      (*output_box_num)++;
-    }
+    /******FIND MAX END******/

-    // store to sram/gdram
-    if (*output_box_num != 0) {
-      mluMemcpyDirection_t store_dir = NRAM2GDRAM;
-      if (dst == SRAM) {
-        store_dir = NRAM2SRAM;
-      } else {  // dst == GDRAM
-        store_dir = NRAM2GDRAM;
-      }
-      if ((nram_save_count == nram_save_limit_count) ||
-          (float(max_box[0]) <= thresh_score) || keep == keepNum - 1) {
-        if (nram_save_count != 0) {
-          if (coreId == 0) {
-            if (output_mode == 0) {  // index1, index2, ...
-              pvLock();
-              __memcpy(output_data, nram_save,
-                       nram_save_count * sizeof(uint32_t), store_dir);
-              pvUnlock();
-              output_data += nram_save_count;
-            } else if (output_mode == 1) {  // score, x1, y1, x2, y2
-              pvLock();
-              __memcpy(output_data, nram_save,
-                       nram_save_count * INFO_NUM * sizeof(IN_DT), store_dir);
-              pvUnlock();
-              output_data += nram_save_count * INFO_NUM;
-            } else if (output_mode ==
-                       2) {  // score---, x1---, y1---, x2---, y2---
-              pvLock();
-              __memcpy(output_data, nram_save, nram_save_count * sizeof(IN_DT),
-                       store_dir, output_stride * sizeof(IN_DT),
-                       nram_save_limit_count * sizeof(IN_DT), 4);
-              pvUnlock();
-              output_data += nram_save_count;
-            }
-            nram_save_count = 0;
-          }
-        }
-      }  // if move data nram->sram/gdram
-    }    // if dst
+    storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
+                max_output_size, thresh_score, output_mode, nram_save_count,
+                output_box_num);

    // if the max score <= 0, end
    if (core_limit == 1) {
@@ -375,190 +151,40 @@ __mlu_func__ void nms_detection(
    } else {
      if (float(max_box[0]) <= thresh_score) {
        if (coreId == 0) {
-          loop_end_flag[0] = 1;
+          exit_flag[0] = 1;
        }
      }
      __sync_cluster();
-      if (loop_end_flag[0] == 1) {
+      if (exit_flag[0] == 1) {
        break;
      }
    }
-    /******nms store end******/
-
-    // To solve half data accuracy, we convert half to float to calculate IoU.
-    for (int i = 0; i <= repeat_iou_compute; i++) {
-      if (i == repeat_iou_compute && remain_iou_compute == 0) {
-        break;
-      }
-      int seg_len = 0;  // the length every nms compute
-      int cpy_len = 0;  // the length every nms memcpy
-      i == repeat_iou_compute ? seg_len = remain_pad_iou_compute
-                              : seg_len = max_seg_iou_compute;
-      i == repeat_iou_compute ? cpy_len = remain_iou_compute
-                              : cpy_len = max_seg_iou_compute;
-
-      /******nms load start******/
-      mluMemcpyDirection_t load_dir = SRAM2NRAM;
-      if (src == SRAM) {
-        load_dir = SRAM2NRAM;
-      } else {
-        load_dir = GDRAM2NRAM;
-      }
-
-      __nramset((float *)score, seg_len, 0.0f);
-      int dt_offset = 0;
-      if (sizeof(IN_DT) == sizeof(float)) {
-        __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
-                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
-                 cpy_len * sizeof(IN_DT), 0);
-        dt_offset = 0;
-      } else if (sizeof(IN_DT) == sizeof(half)) {
-        __nramset(x1, seg_len, half(0));
-        __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
-                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
-                 cpy_len * sizeof(IN_DT), 0);
-        __bang_half2float((float *)score, (half *)x1, seg_len);
-        dt_offset = max_seg_iou_compute;
-      }
-
-      if (input_layout == 0) {
-        // the following number 4 means x1, y1, x2, y2
-        __memcpy(
-            inter_x1,
-            input_x1_ptr + (input_offset + i * max_seg_iou_compute) * COORD_DIM,
-            cpy_len * COORD_DIM * sizeof(IN_DT), load_dir,
-            cpy_len * COORD_DIM * sizeof(IN_DT),
-            cpy_len * COORD_DIM * sizeof(IN_DT), 0);
-        // here use collect instruction to transpose the [n, 4] shape into [4,
-        // n] shape to avoid
-        // discrete memory accessing.
-        for (int c_i = 0; c_i < COORD_DIM * seg_len / mask_size; c_i++) {
-          // the following number 32 means 32 elements will be selected out by
-          // once operation
-          __bang_collect(x1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
-                         x1_mask, mask_size);
-          __bang_collect(y1 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
-                         y1_mask, mask_size);
-          __bang_collect(x2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
-                         x2_mask, mask_size);
-          __bang_collect(y2 + dt_offset + c_i * 32, inter_x1 + c_i * mask_size,
-                         y2_mask, mask_size);
-        }
-      } else if (input_layout == 1) {
-        __memcpy(x1 + dt_offset,
-                 input_x1_ptr + input_offset + i * max_seg_iou_compute,
-                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
-                 cpy_len * sizeof(IN_DT), 0);
-        __memcpy(y1 + dt_offset,
-                 input_y1_ptr + input_offset + i * max_seg_iou_compute,
-                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
-                 cpy_len * sizeof(IN_DT), 0);
-        __memcpy(x2 + dt_offset,
-                 input_x2_ptr + input_offset + i * max_seg_iou_compute,
-                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
-                 cpy_len * sizeof(IN_DT), 0);
-        __memcpy(y2 + dt_offset,
-                 input_y2_ptr + input_offset + i * max_seg_iou_compute,
-                 cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
-                 cpy_len * sizeof(IN_DT), 0);
-      }
-      /******nms load end******/
-
-      /******nms compute start******/
-      if (sizeof(IN_DT) == sizeof(half)) {
-        __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
-                          seg_len);
-        __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
-                          seg_len);
-        __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
-                          seg_len);
-        __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
-                          seg_len);
-      }
-      // 1、 compute IOU
-      // get the area_I
-      __nramset((float *)inter_y1, seg_len, float(max_box[1]));  // max_x1
-      __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
-                      seg_len);                                  // inter_x1
-      __nramset((float *)inter_y2, seg_len, float(max_box[3]));  // max_x2
-      __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
-                      seg_len);  // inter_x2
-      __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
-                 seg_len);
-      if (algo == 1 && offset != 0.0) {
-        __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
-      }
-      __bang_active_relu((float *)inter_x1, (float *)inter_x1,
-                         seg_len);                               // inter_w
-      __nramset((float *)inter_x2, seg_len, float(max_box[2]));  // max_y1
-      __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
-                      seg_len);                                  // inter_y1
-      __nramset((float *)inter_x2, seg_len, float(max_box[4]));  // max_y2
-      __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
-                      seg_len);  // inter_y2
-      __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
-                 seg_len);
-      if (algo == 1 && offset != 0.0) {
-        __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
-      }
-      __bang_active_relu((float *)inter_y1, (float *)inter_y1,
-                         seg_len);  // inter_h
-      __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
-                 seg_len);  // area_I
-      // get the area of input_box: area = (x2 - x1) * (y2 - y1);
-      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
-      __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
-      if (algo == 1 && offset != 0.0) {
-        __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
-        __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
-      }
-      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
-                 seg_len);  // area
-      // get the area_U: area + max_area - area_I
-      __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
-                       seg_len);
-      __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
-                 seg_len);  // area_U
-      // 2、 select the box
-      // if IOU greater than thres, set the score to zero, abort it: area_U >
-      // area_I * (1 / thresh)?
-      if (thresh_iou > 0.0) {
-        __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
-                         seg_len);
-      } else {
-        __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
-                         seg_len);
-      }
-      __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
-                seg_len);
-      __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
-      /******nms compute end******/
-
-      // update the score
-      mluMemcpyDirection_t update_dir = NRAM2SRAM;
-      if (dst == SRAM) {
-        update_dir = NRAM2SRAM;
-      } else {
-        update_dir = NRAM2GDRAM;
-      }
-      if (sizeof(IN_DT) == sizeof(half)) {
-        __bang_float2half_rd((half *)score, (float *)score, seg_len);
-      }
-      pvLock();
-      __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
-               cpy_len * sizeof(IN_DT), update_dir, cpy_len * sizeof(IN_DT),
-               cpy_len * sizeof(IN_DT), 0);
-      pvUnlock();
-    }  // for repeat
-  }    // for keepNum
+/******NMS STORE END******/
+#if __BANG_ARCH__ >= 300
+    scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
+                input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
+                inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box_x1,
+                max_box_y1, max_box_x2, max_box_y2, nram_save,
+                repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
+                max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
+                input_offset, offset, max_area, input_num_boxes, algo);
+#else
+    scoreUpdate(input_data_score, load_dir, store_dir, input_x1_ptr,
+                input_y1_ptr, input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score,
+                inter_x1, inter_y1, inter_x2, inter_y2, max_box, max_box[1],
+                max_box[2], max_box[3], max_box[4], nram_save,
+                repeat_iou_compute, remain_iou_compute, remain_pad_iou_compute,
+                max_seg_iou_compute, max_seg_pad, thresh_iou, div_thresh_iou,
+                input_offset, offset, max_area, input_num_boxes, algo);
+#endif
+  }  // for max_output_size
 }

 __mlu_global__ void MLUUnion1KernelNMS(
    const void *input_boxes, const void *input_confidence,
-    const int input_num_boxes, const int input_stride,
-    const int max_output_size, const float iou_threshold,
-    const float confidence_threshold, const int mode, const int input_layout,
-    void *workspace, void *result_num, void *output,
+    const int input_num_boxes, const int max_output_size,
+    const float iou_threshold, const float confidence_threshold,
+    const int output_mode, void *workspace, void *result_num, void *output,
    const cnrtDataType_t data_type_input, const float offset, const int algo) {
  if (data_type_input == CNRT_FLOAT16) {
    __memcpy(workspace, input_confidence, input_num_boxes * sizeof(half),
@@ -569,82 +195,48 @@ __mlu_global__ void MLUUnion1KernelNMS(
  } else {
  }

-  int output_stride = max_output_size;
-  uint32_t result_box_num = 0;
-  if (mode == 0) {
-    uint32_t *out_data = (uint32_t *)output;
-    switch (data_type_input) {
-      default: { return; }
-      case CNRT_FLOAT16: {
-        half *boxes_data = (half *)input_boxes;
-        half *confi_data = (half *)workspace;
-        half *buffer = (half *)nram_buffer;
-        half *sram = (half *)sram_buffer;
-
-        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
-                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
-                      sram, taskDim, input_num_boxes, input_stride,
-                      output_stride, max_output_size, iou_threshold,
-                      confidence_threshold, offset, algo);
-        ((uint32_t *)result_num)[0] = result_box_num;
-      }; break;
-      case CNRT_FLOAT32: {
-        float *boxes_data = (float *)input_boxes;
-        float *confi_data = (float *)workspace;
-        float *buffer = (float *)nram_buffer;
-        float *sram = (float *)sram_buffer;
+  uint32_t output_box_num = 0;
+  float *score_data = (float *)workspace;
+  float *boxes_data = (float *)input_boxes;
+  float *sram = (float *)sram_buffer;

-        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
-                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
-                      sram, taskDim, input_num_boxes, input_stride,
-                      output_stride, max_output_size, iou_threshold,
-                      confidence_threshold, offset, algo);
-        ((uint32_t *)result_num)[0] = result_box_num;
-      }; break;
+  if (output_mode == 0) {
+    if (data_type_input == CNRT_FLOAT32) {
+      nms_detection(output_box_num, output_mode, (uint32_t *)output, score_data,
+                    boxes_data, GDRAM, sram, taskDim, input_num_boxes,
+                    max_output_size, iou_threshold, confidence_threshold,
+                    offset, algo);
+    } else {
+      nms_detection(output_box_num, output_mode, (uint32_t *)output,
+                    (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
+                    taskDim, input_num_boxes, max_output_size, iou_threshold,
+                    confidence_threshold, offset, algo);
    }
  } else {
-    switch (data_type_input) {
-      default: { return; }
-      case CNRT_FLOAT16: {
-        half *boxes_data = (half *)input_boxes;
-        half *confi_data = (half *)workspace;
-        half *out_data = (half *)output;
-        half *buffer = (half *)nram_buffer;
-        half *sram = (half *)sram_buffer;
-
-        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
-                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
-                      sram, taskDim, input_num_boxes, input_stride,
-                      output_stride, max_output_size, iou_threshold,
-                      confidence_threshold, offset, algo);
-        ((uint32_t *)result_num)[0] = result_box_num;
-      }; break;
-      case CNRT_FLOAT32: {
-        float *boxes_data = (float *)input_boxes;
-        float *confi_data = (float *)workspace;
-        float *out_data = (float *)output;
-        float *buffer = (float *)nram_buffer;
-        float *sram = (float *)sram_buffer;
-
-        nms_detection(&result_box_num, mode, input_layout, out_data, GDRAM,
-                      confi_data, boxes_data, GDRAM, buffer, SIZE_NRAM_BUF,
-                      sram, taskDim, input_num_boxes, input_stride,
-                      output_stride, max_output_size, iou_threshold,
-                      confidence_threshold, offset, algo);
-        ((uint32_t *)result_num)[0] = result_box_num;
-      }; break;
+    if (data_type_input == CNRT_FLOAT32) {
+      nms_detection(output_box_num, output_mode, (float *)output, score_data,
+                    boxes_data, GDRAM, sram, taskDim, input_num_boxes,
+                    max_output_size, iou_threshold, confidence_threshold,
+                    offset, algo);
+    } else {
+      nms_detection(output_box_num, output_mode, (half *)output,
+                    (half *)score_data, (half *)boxes_data, GDRAM, (half *)sram,
+                    taskDim, input_num_boxes, max_output_size, iou_threshold,
+                    confidence_threshold, offset, algo);
    }
  }
+  ((uint32_t *)result_num)[0] = output_box_num;
 }

 template <typename IN_DT, typename OUT_DT>
 __mlu_func__ void nms_detection_ux(
-    int32_t *loop_end_flag, uint32_t &output_box_num, OUT_DT *output_dram,
+    int32_t *exit_flag, uint32_t &output_box_num, OUT_DT *output_dram,
    IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram,
-    const int input_layout, const int input_num_boxes, const int input_stride,
-    const int max_output_size, const float thresh_iou, const float thresh_score,
-    const float offset, const int output_mode, const int algo) {
-  loop_end_flag[0] = 0;
+    const int input_num_boxes, const int max_output_size,
+    const float thresh_iou, const float thresh_score, const float offset,
+    const int output_mode, const int algo) {
+  exit_flag[0] = 0;
+
  IN_DT *sram = (IN_DT *)sram_buffer;

  // score, x1, y1, x2, y2, inter_x1, inter_y1, inter_x2, inter_y2
@@ -654,16 +246,10 @@ __mlu_func__ void nms_detection_ux(
  float div_thresh_iou = 1.0 / thresh_iou;

  // input data ptr
-  IN_DT *input_score_ptr;
-  const IN_DT *input_x1_ptr;
-  const IN_DT *input_y1_ptr;
-  const IN_DT *input_x2_ptr;
-  const IN_DT *input_y2_ptr;
-  input_score_ptr = score_data;
-  input_x1_ptr = boxes_data;
-  input_y1_ptr = input_x1_ptr + input_stride;
-  input_x2_ptr = input_y1_ptr + input_stride;
-  input_y2_ptr = input_x2_ptr + input_stride;
+  const IN_DT *input_x1_ptr = boxes_data;
+  const IN_DT *input_y1_ptr = input_x1_ptr + input_num_boxes;
+  const IN_DT *input_x2_ptr = input_y1_ptr + input_num_boxes;
+  const IN_DT *input_y2_ptr = input_x2_ptr + input_num_boxes;

  int limit = 0;        // find limit when GDRAM or SRAM
  int max_seg_pad = 0;  // the max length every repeat
@@ -682,41 +268,16 @@ __mlu_func__ void nms_detection_ux(
            (nms_buffer_count1 * sizeof(IN_DT));
  }

-  // data split
-  int avg_cluster = input_num_boxes / clusterDim;
-  int rem_cluster = input_num_boxes % clusterDim;
-  int len_cluster = avg_cluster + (clusterId < rem_cluster ? 1 : 0);
-  int cluster_offset = avg_cluster * clusterId +
-                       (clusterId <= rem_cluster ? clusterId : rem_cluster);
-
-  int avg_core = len_cluster / coreDim;
-  int rem_core = len_cluster % coreDim;
-  int len_core = avg_core + (coreId < rem_core ? 1 : 0);
-  int core_offset =
-      avg_core * coreId + (coreId <= rem_core ? coreId : rem_core);
-  int input_offset = cluster_offset + core_offset;
-
-  max_seg_pad = PAD_DOWN(limit, NMS_SIZE);
-
-  // core 0 of each cluster calculate the max score index
-  int max_index_avg_core = input_num_boxes / clusterDim;
-  int max_index_rem_core = input_num_boxes % clusterDim;
-  int max_index_len_core =
-      max_index_avg_core + (clusterId < max_index_rem_core ? 1 : 0);
-  int max_index_input_offset =
-      max_index_avg_core * clusterId +
-      (clusterId <= max_index_rem_core ? clusterId : max_index_rem_core);
-  repeat = max_index_len_core / max_seg_pad;
-  remain = max_index_len_core % max_seg_pad;
-  remain_pad = PAD_UP(remain, NMS_SIZE);
-
-  // if datatype is fp16, we should cvt to fp32 when compute iou
-  int max_seg_iou_compute =
-      PAD_DOWN(max_seg_pad / (sizeof(float) / sizeof(IN_DT)), NMS_SIZE);
-  int repeat_iou_compute = len_core / max_seg_iou_compute;
-  int remain_iou_compute = len_core % max_seg_iou_compute;
-  int remain_pad_iou_compute = PAD_UP(remain_iou_compute, NMS_SIZE);
+  int input_offset = 0;
+  int max_seg_iou_compute = 0;
+  int repeat_iou_compute = 0;
+  int remain_iou_compute = 0;
+  int remain_pad_iou_compute = 0;

+  getComputeParamsUx(sizeof(IN_DT), input_num_boxes, limit, input_offset,
+                     max_seg_pad, repeat, remain, remain_pad,
+                     max_seg_iou_compute, repeat_iou_compute,
+                     remain_iou_compute, remain_pad_iou_compute);
  // init the nram ptr
  IN_DT *score = (IN_DT *)nram_buffer;
  IN_DT *x1 = score + max_seg_pad;
@@ -731,320 +292,94 @@ __mlu_func__ void nms_detection_ux(
  OUT_DT *nram_save =
      (OUT_DT *)((char *)max_box +
                 NFU_ALIGN_SIZE);  // offset two line from max_box
-
-  mluMemcpyDirection_t input_load_dir = SRAM2NRAM;
-  mluMemcpyDirection_t input_store_dir = NRAM2SRAM;
-  input_load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
-  input_store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;
+#if __BANG_ARCH__ >= 300
+  float max_box_x1 = 0;
+  float max_box_y1 = 0;
+  float max_box_x2 = 0;
+  float max_box_y2 = 0;
+#endif
+  mluMemcpyDirection_t load_dir = SRAM2NRAM;
+  mluMemcpyDirection_t store_dir = NRAM2SRAM;
+  load_dir = (input_ram == SRAM) ? SRAM2NRAM : GDRAM2NRAM;
+  store_dir = (input_ram == SRAM) ? NRAM2SRAM : NRAM2GDRAM;

  for (int keep = 0; keep < max_output_size;
       keep++) {  // loop until the max_score <= 0
    __sync_all();

-    /******FIND MAX START******/
    int max_index = 0;
    int global_max_index = 0;  // for Ux
    float max_area = 0;        // the max socre area
    max_box[0] = 0;            // init 0

    if (coreId == 0) {
-      for (int i = 0; i <= repeat; i++) {
-        if (i == repeat && remain == 0) {
-          break;
-        }
-
-        int seg_len = (i == repeat)
-                          ? remain_pad
-                          : max_seg_pad;  // the length every nms compute
-        // check seg_len exceeds the limit of fp16 or not. 65536 is the largest
-        // num
-        // that fp16 could express.
-        if (sizeof(IN_DT) == sizeof(half) && seg_len > 65536) {
-          return;
-        }
-        int cpy_len = (i == repeat)
-                          ? remain
-                          : max_seg_pad;  // the length every nms memcpy
-
-        /******NMS LOAD START******/
-        __bang_write_zero(score, seg_len);
-        __memcpy(score,
-                 input_score_ptr + max_index_input_offset + i * max_seg_pad,
-                 cpy_len * sizeof(IN_DT), input_load_dir,
-                 cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
-
-        /******NMS LOAD END******/
-
-        __bang_max(inter_x1, score, seg_len);
-        if (inter_x1[0] > max_box[0]) {
-          max_box[0] = inter_x1[0];
-          if (sizeof(IN_DT) == sizeof(half)) {
-            max_index =
-                ((uint16_t *)inter_x1)[1] + max_index_input_offset +
-                i * max_seg_pad;  // offset start from head of input_data
-          } else if (sizeof(IN_DT) == sizeof(float)) {
-            max_index =
-                ((uint32_t *)inter_x1)[1] + max_index_input_offset +
-                i * max_seg_pad;  // offset start from head of input_data
-          }
-        }
-      }  // for repeat
-
-      // the max box's x1, y1, x2, y2 on every cluster
-      max_box[1] = input_x1_ptr[max_index];
-      max_box[2] = input_y1_ptr[max_index];
-      max_box[3] = input_x2_ptr[max_index];
-      max_box[4] = input_y2_ptr[max_index];
-      ((uint32_t *)(max_box + 5))[0] = max_index;
+      findCoreMaxBox(score_data, score, inter_x1, max_box, input_x1_ptr,
+                     input_y1_ptr, input_x2_ptr, input_y2_ptr, load_dir,
+                     input_offset, repeat, remain, remain_pad, max_seg_pad,
+                     max_index);
      // copy max box info to sram
      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
    }
    __sync_all();
-    // copy all partial max to the sram of cluster 0
-    if (clusterId != 0) {
-      __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
-               SRAM2SRAM, 0);
-    }
-    __sync_all();
-
-    // reduce between clusters to get the global max box
-    if (clusterId == 0) {
-      if (coreId == 0) {
-        __bang_write_zero(inter_x1, NMS_SIZE);
-        __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
-                 REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
-        __bang_max(max_box, inter_x1, NMS_SIZE);
-        int max_cluster = (sizeof(IN_DT) == sizeof(half))
-                              ? ((uint16_t *)max_box)[1]
-                              : ((uint32_t *)max_box)[1];
-        __memcpy(max_box, sram + max_cluster * REDUCE_NUM,
-                 REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
-        __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
-      }
-      __sync_cluster();
-      if (coreId == 0x80 && clusterDim > 1) {
-        // broadcast global max box to each cluster's sram
-        for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
-          __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
-                   cluster_idx);
-        }
-      }
-      __sync_cluster();
-    }
-    __sync_all();
+#if __BANG_ARCH__ <= 372
+    findGlobalMaxBox(max_box, sram, inter_x1);
+#endif

-    // copy the global max box to max_box
-    __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
-    if (algo == 0 || offset == 0.0) {
-      max_area = ((float)max_box[3] - (float)max_box[1]) *
-                 ((float)max_box[4] - (float)max_box[2]);
-    } else {
-      max_area = ((float)max_box[3] - (float)max_box[1] + offset) *
-                 ((float)max_box[4] - (float)max_box[2] + offset);
-    }
+#if __BANG_ARCH__ >= 300
+    calMaxArea(max_box, algo, offset, max_area, max_box_x1, max_box_y1,
+               max_box_x2, max_box_y2);
+#else
+    calMaxArea(max_box, algo, offset, max_area);
+#endif
    global_max_index = ((uint32_t *)(max_box + 5))[0];
-    if (coreId != 0x80) {
-      input_score_ptr[global_max_index] = 0;
+    if (coreId != MEMORY_CORE) {
+      score_data[global_max_index] = 0;
    }
-    // by now, we get: max_score|max_index|max_box|max_area
-    /******FIND MAX END******/

-    /******NMS STORE START******/
-    // store to nram
-    if (float(max_box[0]) > thresh_score) {
-      OUT_DT *save_ptr;
-      int save_offset = 0;
-      int save_str_num = 0;
-      save_ptr = nram_save;
-      save_offset = nram_save_count;
-      save_str_num = nram_save_limit_count;
-      if (clusterId == 0 && coreId == 0) {
-        if (output_mode == 0) {  // index1, index2, ...
-          save_ptr[save_offset] = ((uint32_t *)(max_box + INFO_NUM))[0];
-        } else if (output_mode == 1) {  // score, x1, y1, x2, y2
-          __memcpy(save_ptr + save_offset * INFO_NUM, max_box,
-                   INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
-                   INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
-        } else if (output_mode == 2) {  // score---, x1---, y1---, x2---, y2---
-          __memcpy(save_ptr + save_offset, max_box, 1 * sizeof(IN_DT),
-                   NRAM2NRAM, save_str_num * sizeof(IN_DT), 1 * sizeof(IN_DT),
-                   4);
-        }
-      }
-      nram_save_count++;
-      output_box_num++;
-    }
-
-    // store to sram/gdram
-    if (output_box_num != 0) {
-      if ((nram_save_count == nram_save_limit_count) ||
-          (float(max_box[0]) <= thresh_score) || keep == max_output_size - 1) {
-        if (nram_save_count != 0) {
-          if (clusterId == 0 && coreId == 0) {
-            if (output_mode == 0) {  // index1, index2, ...
-              pvLock();
-              __memcpy(output_dram, nram_save,
-                       nram_save_count * sizeof(uint32_t), NRAM2GDRAM);
-              pvUnlock();
-              output_dram += nram_save_count;
-            } else if (output_mode == 1) {  // score, x1, y1, x2, y2
-              pvLock();
-              __memcpy(output_dram, nram_save,
-                       nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
-              pvUnlock();
-              output_dram += nram_save_count * INFO_NUM;
-            } else if (output_mode ==
-                       2) {  // score---, x1---, y1---, x2---, y2---
-              pvLock();
-              __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
-                       NRAM2GDRAM, max_output_size * sizeof(IN_DT),
-                       nram_save_limit_count * sizeof(IN_DT), 4);
-              pvUnlock();
-              output_dram += nram_save_count;
-            }
-            nram_save_count = 0;
-          }
-        }
-      }  // if move data nram->sram/gdram
-    }    // if dst
+    storeResult(max_box, nram_save, output_dram, keep, nram_save_limit_count,
+                max_output_size, thresh_score, output_mode, nram_save_count,
+                output_box_num);

    if (float(max_box[0]) <= thresh_score) {
      if (clusterId == 0 && coreId == 0) {
-        loop_end_flag[0] = 1;  // dram
+        exit_flag[0] = 1;  // dram
      }
    }
    __sync_all();
-    if (loop_end_flag[0] == 1) {
+    if (exit_flag[0] == 1) {
      break;
    }
-    /******NMS STORE END******/
-
-    // To solve fp16 accuracy, we convert fp16 to fp32 to calculate IoU.
-    for (int i = 0; i <= repeat_iou_compute; i++) {
-      if (i == repeat_iou_compute && remain_iou_compute == 0) {
-        break;
-      }
-      int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
-                                              : max_seg_iou_compute;
-      int cpy_len =
-          (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
-
-      /******NMS LOAD START******/
-      __nramset((float *)score, seg_len, 0.0f);
-      int dt_offset = 0;
-      if (sizeof(IN_DT) == sizeof(float)) {
-        __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
-                 cpy_len * sizeof(IN_DT), input_load_dir,
-                 cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
-        dt_offset = 0;
-      } else if (sizeof(IN_DT) == sizeof(half)) {
-        __nramset(x1, seg_len, half(0));
-        __memcpy(x1, input_score_ptr + input_offset + i * max_seg_iou_compute,
-                 cpy_len * sizeof(IN_DT), input_load_dir,
-                 cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
-        __bang_half2float((float *)score, (half *)x1, seg_len);
-        dt_offset = max_seg_iou_compute;
-      }
-
-      __memcpy(x1 + dt_offset,
-               input_x1_ptr + input_offset + i * max_seg_iou_compute,
-               cpy_len * sizeof(IN_DT), input_load_dir,
-               max_seg_pad * sizeof(IN_DT), input_num_boxes * sizeof(IN_DT), 3);
-      /******NMS LOAD END******/
-
-      /******NMS COMPUTE START******/
-      if (sizeof(IN_DT) == sizeof(half)) {
-        __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute,
-                          seg_len);
-        __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute,
-                          seg_len);
-        __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute,
-                          seg_len);
-        __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute,
-                          seg_len);
-      }
-      // 1、 compute IOU
-      // get the area_I
-      __nramset((float *)inter_y1, seg_len, float(max_box[1]));  // max_x1
-      __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
-                      seg_len);                                  // inter_x1
-      __nramset((float *)inter_y2, seg_len, float(max_box[3]));  // max_x2
-      __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
-                      seg_len);  // inter_x2
-      __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
-                 seg_len);
-      if (algo == 1 && offset != 0.0) {
-        __bang_add_const((float *)inter_x1, (float *)inter_x1, offset, seg_len);
-      }
-      __bang_active_relu((float *)inter_x1, (float *)inter_x1,
-                         seg_len);                               // inter_w
-      __nramset((float *)inter_x2, seg_len, float(max_box[2]));  // max_y1
-      __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
-                      seg_len);                                  // inter_y1
-      __nramset((float *)inter_x2, seg_len, float(max_box[4]));  // max_y2
-      __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
-                      seg_len);  // inter_y2
-      __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
-                 seg_len);
-      if (algo == 1 && offset != 0.0) {
-        __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
-      }
-      __bang_active_relu((float *)inter_y1, (float *)inter_y1,
-                         seg_len);  // inter_h
-      __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
-                 seg_len);  // area_I
-      // get the area of input_box: area = (x2 - x1) * (y2 - y1);
-      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
-      __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
-      if (algo == 1 && offset != 0.0) {
-        __bang_add_const((float *)inter_y1, (float *)inter_y1, offset, seg_len);
-        __bang_add_const((float *)inter_y2, (float *)inter_y2, offset, seg_len);
-      }
-      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
-                 seg_len);  // area
-      // get the area_U: area + max_area - area_I
-      __bang_add_const((float *)inter_x2, (float *)inter_x2, float(max_area),
-                       seg_len);
-      __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
-                 seg_len);  // area_U
-      // 2、 select the box
-      // if IOU greater than thres, set the score to zero, abort it: area_U >
-      // area_I * (1 / thresh)?
-      if (thresh_iou > 0.0) {
-        __bang_mul_const((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
-                         seg_len);
-      } else {
-        __bang_mul_const((float *)inter_x2, (float *)inter_x2, thresh_iou,
-                         seg_len);
-      }
-      __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
-                seg_len);
-      __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
-      /******NMS COMPUTE END******/
-
-      if (sizeof(IN_DT) == 2) {
-        __bang_float2half_rd((half *)score, (float *)score, seg_len);
-      }
-      pvLock();
-      __memcpy(input_score_ptr + input_offset + i * max_seg_iou_compute, score,
-               cpy_len * sizeof(IN_DT), input_store_dir,
-               cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
-      pvUnlock();
-    }  // for repeat
-  }    // for max_output_size
+/******NMS STORE END******/
+#if __BANG_ARCH__ >= 300
+    scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
+                input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
+                inter_y1, inter_x2, inter_y2, max_box, max_box_x1, max_box_y1,
+                max_box_x2, max_box_y2, nram_save, repeat_iou_compute,
+                remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
+                max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
+                max_area, input_num_boxes, algo);
+#else
+    scoreUpdate(score_data, load_dir, store_dir, input_x1_ptr, input_y1_ptr,
+                input_x2_ptr, input_y2_ptr, x1, y1, x2, y2, score, inter_x1,
+                inter_y1, inter_x2, inter_y2, max_box, max_box[1], max_box[2],
+                max_box[3], max_box[4], nram_save, repeat_iou_compute,
+                remain_iou_compute, remain_pad_iou_compute, max_seg_iou_compute,
+                max_seg_pad, thresh_iou, div_thresh_iou, input_offset, offset,
+                max_area, input_num_boxes, algo);
+#endif
+  }  // for max_output_size
 }

 __mlu_global__ void MLUUionXKernelNMS(
    const void *input_boxes, const void *input_confidence,
-    const int input_num_boxes, const int input_layout, const int input_stride,
-    const int max_output_size, const float iou_threshold,
-    const float confidence_threshold, const float offset,
-    const cnrtDataType_t data_type_input, const int output_mode, const int algo,
-    void *workspace, void *result_num, void *output) {
+    const int input_num_boxes, const int max_output_size,
+    const float iou_threshold, const float confidence_threshold,
+    const float offset, const cnrtDataType_t data_type_input,
+    const int output_mode, const int algo, void *workspace, void *result_num,
+    void *output) {
  int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2;
-  int32_t *loop_end_flag =
-      (int32_t *)((char *)workspace +
-                  INFO_NUM * input_num_boxes * input_dwidth);
+  int32_t *exit_flag = (int32_t *)((char *)workspace +
+                                   INFO_NUM * input_num_boxes * input_dwidth);
  int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth;
  int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size;

@@ -1062,88 +397,55 @@ __mlu_global__ void MLUUionXKernelNMS(
    __memcpy(workspace, input_confidence, cluster_score_size, GDRAM2GDRAM);
  }
  __sync_cluster();
+
  uint32_t output_box_num = 0;
+  float *score_data;
+  float *boxes_data;
+  score_data = (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
+  boxes_data = (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
+
  if (output_mode == 0) {
-    uint32_t *output_dram = (uint32_t *)output;
-    switch (data_type_input) {
-      default: { return; }
-      case CNRT_FLOAT16: {
-        half *score_data;
-        half *boxes_data;
-        score_data =
-            (input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
-        boxes_data =
-            (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
-        nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
-                         boxes_data, input_ram, input_layout, input_num_boxes,
-                         input_stride, max_output_size, iou_threshold,
-                         confidence_threshold, offset, output_mode, algo);
-        ((uint32_t *)result_num)[0] = output_box_num;
-      }; break;
-      case CNRT_FLOAT32: {
-        float *score_data;
-        float *boxes_data;
-        score_data =
-            (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
-        boxes_data =
-            (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
-        nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
-                         boxes_data, input_ram, input_layout, input_num_boxes,
-                         input_stride, max_output_size, iou_threshold,
-                         confidence_threshold, offset, output_mode, algo);
-        ((uint32_t *)result_num)[0] = output_box_num;
-      }; break;
+    if (data_type_input == CNRT_FLOAT32) {
+      nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
+                       score_data, boxes_data, input_ram, input_num_boxes,
+                       max_output_size, iou_threshold, confidence_threshold,
+                       offset, output_mode, algo);
+    } else {
+      nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output,
+                       (half *)score_data, (half *)boxes_data, input_ram,
+                       input_num_boxes, max_output_size, iou_threshold,
+                       confidence_threshold, offset, output_mode, algo);
    }
  } else {
-    switch (data_type_input) {
-      default: { return; }
-      case CNRT_FLOAT16: {
-        half *output_dram = (half *)output;
-        half *score_data;
-        half *boxes_data;
-        score_data =
-            (input_ram == SRAM) ? (half *)sram_score : (half *)workspace;
-        boxes_data =
-            (input_ram == SRAM) ? (half *)sram_boxes : (half *)input_boxes;
-        nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
-                         boxes_data, input_ram, input_layout, input_num_boxes,
-                         input_stride, max_output_size, iou_threshold,
-                         confidence_threshold, offset, output_mode, algo);
-        ((uint32_t *)result_num)[0] = output_box_num;
-      }; break;
-      case CNRT_FLOAT32: {
-        float *output_dram = (float *)output;
-        float *score_data;
-        float *boxes_data;
-        score_data =
-            (input_ram == SRAM) ? (float *)sram_score : (float *)workspace;
-        boxes_data =
-            (input_ram == SRAM) ? (float *)sram_boxes : (float *)input_boxes;
-        nms_detection_ux(loop_end_flag, output_box_num, output_dram, score_data,
-                         boxes_data, input_ram, input_layout, input_num_boxes,
-                         input_stride, max_output_size, iou_threshold,
-                         confidence_threshold, offset, output_mode, algo);
-        ((uint32_t *)result_num)[0] = output_box_num;
-      }; break;
+    if (data_type_input == CNRT_FLOAT32) {
+      nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data,
+                       boxes_data, input_ram, input_num_boxes, max_output_size,
+                       iou_threshold, confidence_threshold, offset, output_mode,
+                       algo);
+    } else {
+      nms_detection_ux(exit_flag, output_box_num, (half *)output,
+                       (half *)score_data, (half *)boxes_data, input_ram,
+                       input_num_boxes, max_output_size, iou_threshold,
+                       confidence_threshold, offset, output_mode, algo);
    }
  }
+  ((uint32_t *)result_num)[0] = output_box_num;
 }

 void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
               const cnrtDataType_t data_type_input, const void *boxes_ptr,
               const void *scores_ptr, const int input_num_boxes,
-               const int input_stride, const int max_output_boxes,
-               const float iou_threshold, const float offset,
-               void *workspace_ptr, void *output_size_ptr, void *output_ptr) {
+               const int max_output_boxes, const float iou_threshold,
+               const float offset, void *workspace_ptr, void *output_size_ptr,
+               void *output_ptr) {
  switch (k_type) {
    default: { return; }
    case CNRT_FUNC_TYPE_BLOCK:
    case CNRT_FUNC_TYPE_UNION1: {
      MLUUnion1KernelNMS<<<k_dim, k_type, queue>>>(
-          boxes_ptr, scores_ptr, input_num_boxes, input_stride,
+          (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
          max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0,
-          /*output_mode=*/0,
-          /*input_layout=*/1, workspace_ptr, output_size_ptr, output_ptr,
+          /*output_mode=*/0, workspace_ptr, output_size_ptr, output_ptr,
          data_type_input, offset, /*algo=*/1);
    }; break;
    case CNRT_FUNC_TYPE_UNION2:
@@ -1151,11 +453,10 @@ void KernelNms(cnrtDim3_t k_dim, cnrtFunctionType_t k_type, cnrtQueue_t queue,
    case CNRT_FUNC_TYPE_UNION8:
    case CNRT_FUNC_TYPE_UNION16: {
      MLUUionXKernelNMS<<<k_dim, k_type, queue>>>(
-          boxes_ptr, scores_ptr, input_num_boxes, /*input_layout=*/1,
-          input_stride, max_output_boxes, iou_threshold,
-          /*confidence_threshold=*/0.0, offset, data_type_input,
-          /*output_mode=*/0, /*algo=*/1, workspace_ptr, output_size_ptr,
-          output_ptr);
+          (void *)boxes_ptr, (void *)scores_ptr, input_num_boxes,
+          max_output_boxes, iou_threshold, /*confidence_threshold=*/0.0, offset,
+          data_type_input, /*output_mode=*/0, /*algo=*/1, workspace_ptr,
+          output_size_ptr, output_ptr);
    }; break;
  }
 }
--- a/mmcv/ops/csrc/common/mlu/nms_utils.hpp
+++ b/mmcv/ops/csrc/common/mlu/nms_utils.hpp
+/*************************************************************************
+ * Copyright (C) [2019-2022] by Cambricon, Inc.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *************************************************************************/
+#ifndef NMS_UTILS_HPP_
+#define NMS_UTILS_HPP_
+#include "common_mlu_helper.hpp"
+
+// Granularity (in elements) that segment lengths are rounded to.
+#define NMS_SIZE (64)
+// Round x up / down to a multiple of y. Every argument and the whole
+// expansion are parenthesized so the macros compose safely inside larger
+// expressions. Arguments are still evaluated more than once, so do not pass
+// expressions with side effects.
+#define NMS_UP(x, y) ((((x) / (y)) + (int)(((x) % (y)) > 0)) * (y))
+#define NMS_DOWN(x, y) (((x) / (y)) * (y))
+#define INFO_NUM (5)  // 5 means score, x1, y1, x2 and y2
+#define MEMORY_CORE (0x80)
+#define REDUCE_NUM \
+  (7)  // score, x1, y1, x2, y2, max_index (reserve 2 num for half-type input)
+
+// Calls __bang_lock(0, 0) on every core except MEMORY_CORE.
+// Only active when __BANG_ARCH__ is 270; otherwise the body is empty.
+__mlu_func__ void pvLock() {
+#if __BANG_ARCH__ == 270
+  if (coreId == MEMORY_CORE) {
+    return;
+  }
+  __bang_lock(0, 0);
+#endif
+}
+
+// Counterpart of pvLock(): calls __bang_unlock(0, 0) on every core except
+// MEMORY_CORE. Only active when __BANG_ARCH__ is 270.
+__mlu_func__ void pvUnlock() {
+#if __BANG_ARCH__ == 270
+  if (coreId == MEMORY_CORE) {
+    return;
+  }
+  __bang_unlock(0, 0);
+#endif
+}
+
+// Applies ReLU / ReLU-N to deal_num elements of nram_src, writing nram_dst.
+//   threshold < 0  : returns immediately, nram_dst is left untouched.
+//   threshold == 0 : plain ReLU.
+//   threshold > 0  : ReLU bounded above by threshold.
+// nram_tmp is only dereferenced when threshold is non-zero and
+// __BANG_ARCH__ < 300; in that case it has to provide room for
+// deal_num + 2 * (NFU_ALIGN_SIZE / sizeof(T)) elements of T.
+// NOTE(review): the pre-300 fallback assumes the compare intrinsics produce
+// 1/0 masks of type T -- confirm against the BANG C reference.
+template <typename T>
+static __mlu_func__ void computeReluN(T *nram_dst, T *nram_src, void *nram_tmp,
+                                      const int deal_num,
+                                      const T threshold = 0) {
+  if (threshold < 0) {
+    return;
+  }
+  if (threshold) {
+#if __BANG_ARCH__ >= 300
+    __bang_relun(nram_dst, nram_src, deal_num, threshold);
+#else
+    const int lane_num = NFU_ALIGN_SIZE / sizeof(T);
+    // scratch layout: [mask: deal_num][threshold: lane_num][zero: lane_num]
+    T *mask = (T *)nram_tmp;
+    T *thresh_vec = mask + deal_num;
+    T *zero_vec = thresh_vec + lane_num;
+    __bang_write_value(thresh_vec, lane_num, threshold);
+    __bang_write_zero(zero_vec, lane_num);
+    // keep the elements that are below the threshold
+    __bang_cycle_lt(mask, nram_src, thresh_vec, deal_num, lane_num);
+    __bang_mul(nram_dst, nram_src, mask, deal_num);
+    // substitute the threshold for every element that was dropped
+    __bang_cycle_eq(mask, mask, zero_vec, deal_num, lane_num);
+    __bang_cycle_mul(mask, mask, thresh_vec, deal_num, lane_num);
+    __bang_add(nram_dst, nram_dst, mask, deal_num);
+    // finally zero out everything that is not positive
+    __bang_cycle_gt(mask, nram_dst, zero_vec, deal_num, lane_num);
+    __bang_mul(nram_dst, nram_dst, mask, deal_num);
+#endif
+    return;
+  }
+#if __BANG_ARCH__ >= 300
+  __bang_relu(nram_dst, nram_src, deal_num);
+#else
+  __bang_active_relu(nram_dst, nram_src, deal_num);
+#endif
+}
+
+// Derives the per-core work split and segment sizes for the Block / Union1
+// variant: boxes are divided over core_limit cores, and each core's share is
+// walked in segments of at most `limit` elements (rounded to NMS_SIZE).
+// All results are returned through the reference parameters.
+__mlu_func__ void getComputeParamsBlockOrU1(
+    const int input_dwidth, const int input_box_num, const int limit,
+    const int core_limit, int &input_offset, int &max_seg_pad, int &repeat,
+    int &remain, int &remain_pad, int &max_seg_iou_compute,
+    int &repeat_iou_compute, int &remain_iou_compute,
+    int &remain_pad_iou_compute) {
+  // even split; the first `leftover` cores take one extra box
+  const int share = input_box_num / core_limit;
+  const int leftover = input_box_num % core_limit;
+  int boxes_on_core = share;
+  if (coreId < leftover) {
+    ++boxes_on_core;
+  }
+  input_offset = share * coreId + (coreId <= leftover ? coreId : leftover);
+
+  // segmentation used while searching for the max score
+  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
+  repeat = boxes_on_core / max_seg_pad;
+  remain = boxes_on_core % max_seg_pad;
+  remain_pad = NMS_UP(remain, NMS_SIZE);
+
+  // segmentation used while computing iou: fp16 input is converted to fp32
+  // first, so fewer elements fit into one segment
+  const int widen = 4 / input_dwidth;
+  max_seg_iou_compute = NMS_DOWN(max_seg_pad / widen, NMS_SIZE);
+  repeat_iou_compute = boxes_on_core / max_seg_iou_compute;
+  remain_iou_compute = boxes_on_core % max_seg_iou_compute;
+  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
+}
+
+// Derives the work split and segment sizes for the UnionX variant: boxes are
+// divided over clusterDim clusters first and over coreDim cores inside each
+// cluster. The max-score search parameters (repeat / remain / remain_pad)
+// cover the whole cluster share, the iou parameters cover the core share.
+// All results are returned through the reference parameters.
+__mlu_func__ void getComputeParamsUx(
+    const int input_dwidth, const int input_num_boxes, const int limit,
+    int &input_offset, int &max_seg_pad, int &repeat, int &remain,
+    int &remain_pad, int &max_seg_iou_compute, int &repeat_iou_compute,
+    int &remain_iou_compute, int &remain_pad_iou_compute) {
+  // split between clusters; the first clusters take the leftover boxes
+  const int cluster_share = input_num_boxes / clusterDim;
+  const int cluster_left = input_num_boxes % clusterDim;
+  const int boxes_in_cluster = cluster_share + (clusterId < cluster_left);
+  const int cluster_begin =
+      cluster_share * clusterId +
+      (clusterId <= cluster_left ? clusterId : cluster_left);
+
+  // split between the cores of this cluster
+  const int core_share = boxes_in_cluster / coreDim;
+  const int core_left = boxes_in_cluster % coreDim;
+  const int boxes_on_core = core_share + (coreId < core_left);
+  const int core_begin =
+      core_share * coreId + (coreId <= core_left ? coreId : core_left);
+  input_offset = cluster_begin + core_begin;
+
+  max_seg_pad = NMS_DOWN(limit, NMS_SIZE);
+
+  // core 0 of each cluster calculate the max score index, so this
+  // segmentation spans the whole cluster share
+  repeat = boxes_in_cluster / max_seg_pad;
+  remain = boxes_in_cluster % max_seg_pad;
+  remain_pad = NMS_UP(remain, NMS_SIZE);
+
+  // fp16 input is converted to fp32 before the iou computation
+  max_seg_iou_compute =
+      NMS_DOWN(max_seg_pad / (sizeof(float) / input_dwidth), NMS_SIZE);
+  repeat_iou_compute = boxes_on_core / max_seg_iou_compute;
+  remain_iou_compute = boxes_on_core % max_seg_iou_compute;
+  remain_pad_iou_compute = NMS_UP(remain_iou_compute, NMS_SIZE);
+}
+
+// Reduces the per-cluster max-box records to one global record and copies it
+// into max_box on every calling core.
+//
+// max_box  : NRAM buffer receiving the REDUCE_NUM-element winning record.
+// sram     : start of the calling cluster's record area; on entry its first
+//            REDUCE_NUM elements hold this cluster's record. Cluster 0 needs
+//            room for clusterDim records.
+// inter_x1 : NRAM scratch of at least NMS_SIZE elements.
+// NOTE(review): the record layout (score in slot 0) is assumed to be the one
+// produced by findCoreMaxBox / findClusterMaxBox -- confirm against the
+// caller.
+template <typename IN_DT>
+__mlu_func__ void findGlobalMaxBox(IN_DT *max_box, IN_DT *sram,
+                                   IN_DT *inter_x1) {
+  // copy all partial max to the sram of cluster 0
+  if (clusterId != 0) {
+    __memcpy(sram + REDUCE_NUM * clusterId, sram, REDUCE_NUM * sizeof(IN_DT),
+             SRAM2SRAM, 0);
+  }
+  __sync_all();
+
+  // reduce between clusters to get the global max box
+  if (clusterId == 0) {
+    if (coreId == 0) {
+      __bang_write_zero(inter_x1, NMS_SIZE);
+      // gather slot 0 (the score) of every cluster's record
+      __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
+               REDUCE_NUM * sizeof(IN_DT), clusterDim - 1);
+      __bang_max(max_box, inter_x1, NMS_SIZE);
+      // slot 1 of the __bang_max output is read back as the winning position
+      int max_cluster = (sizeof(IN_DT) == sizeof(half))
+                            ? ((uint16_t *)max_box)[1]
+                            : ((uint32_t *)max_box)[1];
+      __memcpy(max_box, sram + max_cluster * REDUCE_NUM,
+               REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
+      __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM);
+    }
+    __sync_cluster();
+    if (coreId == MEMORY_CORE && clusterDim > 1) {
+      // broadcast global max box to each cluster's sram
+      for (int cluster_idx = 1; cluster_idx < clusterDim; ++cluster_idx) {
+        __memcpy(sram, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2SRAM,
+                 cluster_idx);
+      }
+    }
+    __sync_cluster();
+  }
+  __sync_all();
+
+  // copy the global max box to max_box
+  __memcpy(max_box, sram, REDUCE_NUM * sizeof(IN_DT), SRAM2NRAM);
+}
+
+// Scans this core's score range segment by segment and records the best box
+// found in max_box: [0] score, [1..4] x1, y1, x2, y2, followed by the box
+// index stored as a uint32_t at slot INFO_NUM.
+//
+// max_box[0] and max_index are in/out: they are only overwritten when a
+// segment maximum is strictly greater than the incoming max_box[0], so the
+// caller has to initialise both. The coordinates and the index slot are
+// always rewritten from the final max_index.
+// `score` and `inter_x1` are NRAM scratch buffers; `score` must hold
+// max_seg_pad elements. Does nothing on the MEMORY_CORE.
+template <typename IN_DT>
+__mlu_func__ void findCoreMaxBox(
+    IN_DT *input_score_ptr, IN_DT *score, IN_DT *inter_x1, IN_DT *max_box,
+    const IN_DT *input_x1_ptr, const IN_DT *input_y1_ptr,
+    const IN_DT *input_x2_ptr, const IN_DT *input_y2_ptr,
+    const mluMemcpyDirection_t load_dir, const int input_offset,
+    const int repeat, const int remain, const int remain_pad,
+    const int max_seg_pad, int &max_index) {
+  if (coreId != MEMORY_CORE) {
+    for (int i = 0; i <= repeat; i++) {
+      if (i == repeat && remain == 0) {
+        break;
+      }
+      // the length every nms compute (padded for the last segment)
+      const int seg_len = (i == repeat) ? remain_pad : max_seg_pad;
+      // the length every nms memcpy
+      const int cpy_len = (i == repeat) ? remain : max_seg_pad;
+      /******NMS LOAD START******/
+      // zero first so the padding past cpy_len does not hold stale values
+      __bang_write_zero(score, seg_len);
+      __memcpy(score, input_score_ptr + input_offset + i * max_seg_pad,
+               cpy_len * sizeof(IN_DT), load_dir, cpy_len * sizeof(IN_DT),
+               cpy_len * sizeof(IN_DT), 0);
+
+      /******NMS LOAD END******/
+
+      __bang_max(inter_x1, score, seg_len);
+      if (inter_x1[0] > max_box[0]) {
+        max_box[0] = inter_x1[0];
+        // slot 1 of the __bang_max output is read as the position inside the
+        // segment; its width follows the element width
+        if (sizeof(IN_DT) == sizeof(half)) {
+          max_index = ((uint16_t *)inter_x1)[1] + input_offset +
+                      i * max_seg_pad;  // offset start from head of input_data
+        } else if (sizeof(IN_DT) == sizeof(float)) {
+          max_index = ((uint32_t *)inter_x1)[1] + input_offset +
+                      i * max_seg_pad;  // offset start from head of input_data
+        }
+      }
+    }  // for repeat
+    // the max box's x1, y1, x2, y2 on every core
+    max_box[1] = input_x1_ptr[max_index];
+    max_box[2] = input_y1_ptr[max_index];
+    max_box[3] = input_x2_ptr[max_index];
+    max_box[4] = input_y2_ptr[max_index];
+    // the index lives right after the INFO_NUM score/coordinate slots, which
+    // is where storeResult reads it back
+    ((uint32_t *)(max_box + INFO_NUM))[0] = max_index;
+  }
+}
+
+// Reduces the per-core max-box records of one cluster: every core publishes
+// its REDUCE_NUM-element record to `sram`, then picks the record with the
+// highest score and copies it into its own max_box.
+//
+// sram     : cluster-shared buffer with room for one record per core.
+// inter_x1 : NRAM scratch of at least NMS_SIZE elements.
+// input_data_score and core_limit are not used by this function.
+template <typename IN_DT>
+__mlu_func__ void findClusterMaxBox(IN_DT *sram, IN_DT *max_box,
+                                    IN_DT *inter_x1, IN_DT *input_data_score,
+                                    const int core_limit) {
+  // find the max with sram
+  // copy every core's box info to sram, form: score---x1---y1---x2---y2---
+  __memcpy(sram + REDUCE_NUM * coreId, max_box, REDUCE_NUM * sizeof(IN_DT),
+           NRAM2SRAM);  // int32_t datatype
+  __sync_cluster();
+
+  // copy score from sram to nram and find the max
+  __bang_write_zero(inter_x1, NMS_SIZE);
+  // gather slot 0 (the score) of every core's record
+  __memcpy(inter_x1, sram, sizeof(IN_DT), SRAM2NRAM, sizeof(IN_DT),
+           REDUCE_NUM * sizeof(IN_DT), coreDim - 1);
+  __bang_max(max_box, inter_x1, NMS_SIZE);
+  // slot 1 of the __bang_max output is read back as the winning position
+  int max_core = sizeof(IN_DT) == sizeof(half) ? ((uint16_t *)max_box)[1]
+                                               : ((uint32_t *)max_box)[1];
+  // copy the max box to max_box
+  __memcpy(max_box, sram + max_core * REDUCE_NUM, REDUCE_NUM * sizeof(IN_DT),
+           SRAM2NRAM);
+}
+
+/*****************************************************************************/
+/*******************************CALCULATE MAX AREA****************************/
+/*****************************************************************************/
+
+// Computes the area of the box held in max_box[1..4] (x1, y1, x2, y2) and
+// returns it through max_area. `offset` is added to both side lengths unless
+// algo == 0 or offset == 0.
+template <typename IN_DT>
+__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
+                             float &max_area) {
+  const float width = (float)max_box[3] - (float)max_box[1];
+  const float height = (float)max_box[4] - (float)max_box[2];
+  if (algo != 0 && offset != 0.0) {
+    max_area = (width + offset) * (height + offset);
+  } else {
+    max_area = width * height;
+  }
+}
+
+// Same as the four-argument overload, but additionally returns the corner
+// coordinates as floats, ordered so that x1 <= x2 and y1 <= y2.
+template <typename IN_DT>
+__mlu_func__ void calMaxArea(IN_DT *max_box, const int algo, float offset,
+                             float &max_area, float &max_box_x1,
+                             float &max_box_y1, float &max_box_x2,
+                             float &max_box_y2) {
+  // the case of random inf will break the requirement of x1<=x2, y1<=y2
+  // so exchange it if it happens.
+  if (max_box[1] > max_box[3]) {
+    max_box_x1 = float(max_box[3]);
+    max_box_x2 = float(max_box[1]);
+  } else {
+    max_box_x1 = float(max_box[1]);
+    max_box_x2 = float(max_box[3]);
+  }
+  if (max_box[2] > max_box[4]) {
+    max_box_y1 = float(max_box[4]);
+    max_box_y2 = float(max_box[2]);
+  } else {
+    max_box_y1 = float(max_box[2]);
+    max_box_y2 = float(max_box[4]);
+  }
+
+  if (algo != 0 && offset != 0.0) {
+    max_area =
+        (max_box_x2 - max_box_x1 + offset) * (max_box_y2 - max_box_y1 + offset);
+  } else {
+    max_area = (max_box_x2 - max_box_x1) * (max_box_y2 - max_box_y1);
+  }
+}
+
+/***********************************************************************/
+/*******************************STORE RESULT****************************/
+/***********************************************************************/
+// Buffers the current max box in nram_save and flushes the buffer to
+// output_dram when required.
+//
+// A box is buffered only if its score (max_box[0]) is above thresh_score.
+// The counters nram_save_count / output_box_num advance on every core, but
+// only core 0 of cluster 0 touches nram_save and output_dram. The buffer is
+// flushed when it is full, when the score is at or below thresh_score, or on
+// the last iteration (keep == max_output_size - 1).
+//
+// output_mode selects the layout:
+//   0: box indices            index1, index2, ...
+//   1: interleaved records    score, x1, y1, x2, y2
+//   2: planar rows            score---, x1---, y1---, x2---, y2---
+template <typename IN_DT, typename OUT_DT>
+__mlu_func__ void storeResult(IN_DT *max_box, OUT_DT *nram_save,
+                              OUT_DT *&output_dram, const int keep,
+                              const int nram_save_limit_count,
+                              const int max_output_size,
+                              const float thresh_score, const int output_mode,
+                              int &nram_save_count, uint32_t &output_box_num) {
+  const bool is_writer = (clusterId == 0 && coreId == 0);
+
+  /******NMS STORE START******/
+  // store to nram
+  if (float(max_box[0]) > thresh_score) {
+    if (is_writer) {
+      switch (output_mode) {
+        case 0:  // index1, index2, ...
+          nram_save[nram_save_count] = ((uint32_t *)(max_box + INFO_NUM))[0];
+          break;
+        case 1:  // score, x1, y1, x2, y2
+          __memcpy(nram_save + nram_save_count * INFO_NUM, max_box,
+                   INFO_NUM * sizeof(IN_DT), NRAM2NRAM,
+                   INFO_NUM * sizeof(IN_DT), INFO_NUM * sizeof(IN_DT), 0);
+          break;
+        case 2:  // score---, x1---, y1---, x2---, y2---
+          __memcpy(nram_save + nram_save_count, max_box, sizeof(IN_DT),
+                   NRAM2NRAM, nram_save_limit_count * sizeof(IN_DT),
+                   sizeof(IN_DT), 4);
+          break;
+        default:
+          break;
+      }
+    }
+    nram_save_count++;
+    output_box_num++;
+  }
+
+  // store to sram/gdram
+  if (output_box_num == 0) {
+    return;
+  }
+  const bool buffer_full = (nram_save_count == nram_save_limit_count);
+  const bool below_thresh = (float(max_box[0]) <= thresh_score);
+  const bool last_keep = (keep == max_output_size - 1);
+  if (!(buffer_full || below_thresh || last_keep)) {
+    return;
+  }
+  if (nram_save_count == 0 || !is_writer) {
+    return;
+  }
+
+  switch (output_mode) {
+    case 0:  // index1, index2, ...
+      pvLock();
+      __memcpy(output_dram, nram_save, nram_save_count * sizeof(uint32_t),
+               NRAM2GDRAM);
+      pvUnlock();
+      output_dram += nram_save_count;
+      break;
+    case 1:  // score, x1, y1, x2, y2
+      pvLock();
+      __memcpy(output_dram, nram_save,
+               nram_save_count * INFO_NUM * sizeof(IN_DT), NRAM2GDRAM);
+      pvUnlock();
+      output_dram += nram_save_count * INFO_NUM;
+      break;
+    case 2:  // score---, x1---, y1---, x2---, y2---
+      pvLock();
+      __memcpy(output_dram, nram_save, nram_save_count * sizeof(IN_DT),
+               NRAM2GDRAM, max_output_size * sizeof(IN_DT),
+               nram_save_limit_count * sizeof(IN_DT), 4);
+      pvUnlock();
+      output_dram += nram_save_count;
+      break;
+    default:
+      break;
+  }
+  nram_save_count = 0;
+}
+
+// Suppresses the boxes of this core's range that overlap the current max box
+// too much: for every segment it loads the scores and the four coordinate
+// rows, computes the intersection/union areas against the max box in fp32,
+// multiplies each score by the resulting keep-mask and writes the scores
+// back to input_score_ptr.
+//
+// The range is walked in repeat_iou_compute segments of max_seg_iou_compute
+// elements plus a tail of remain_iou_compute elements (processed with the
+// padded length remain_pad_iou_compute). Every load and the write-back use
+// the same per-segment offset, input_offset + i * max_seg_iou_compute.
+//
+// x1..y2, score and inter_x1..inter_y2 are NRAM work buffers.
+// NOTE(review): the coordinate load is a single 4-row strided __memcpy that
+// starts at input_x1_ptr (rows input_num_boxes apart) and writes rows
+// max_seg_pad apart, so it presumably relies on the x1/y1/x2/y2 (resp.
+// inter_x1..inter_y2) buffers being laid out back to back with that stride --
+// confirm against the caller.
+// On __BANG_ARCH__ >= 300 the max box comes from max_box_x1..max_box_y2; on
+// older architectures it is read from max_box[1..4] instead.
+// offset is only applied when algo == 1 and offset != 0.
+template <typename IN_DT, typename OUT_DT>
+__mlu_func__ void scoreUpdate(
+    IN_DT *input_score_ptr, const mluMemcpyDirection_t load_dir,
+    const mluMemcpyDirection_t store_dir, const IN_DT *input_x1_ptr,
+    const IN_DT *input_y1_ptr, const IN_DT *input_x2_ptr,
+    const IN_DT *input_y2_ptr, IN_DT *x1, IN_DT *y1, IN_DT *x2, IN_DT *y2,
+    IN_DT *score, IN_DT *inter_x1, IN_DT *inter_y1, IN_DT *inter_x2,
+    IN_DT *inter_y2, IN_DT *max_box, const float max_box_x1,
+    const float max_box_y1, const float max_box_x2, const float max_box_y2,
+    OUT_DT *nram_save, int repeat_iou_compute, int remain_iou_compute,
+    int remain_pad_iou_compute, int max_seg_iou_compute, int max_seg_pad,
+    const float thresh_iou, const float div_thresh_iou, const int input_offset,
+    const float offset, const float max_area, const int input_num_boxes,
+    const int algo) {
+  for (int i = 0; i <= repeat_iou_compute; i++) {
+    if (i == repeat_iou_compute && remain_iou_compute == 0) {
+      break;
+    }
+    int seg_len = (i == repeat_iou_compute) ? remain_pad_iou_compute
+                                            : max_seg_iou_compute;
+    int cpy_len =
+        (i == repeat_iou_compute) ? remain_iou_compute : max_seg_iou_compute;
+    // Start of this segment inside the input arrays. The loop is segmented by
+    // max_seg_iou_compute, so the score load, the box load and the score
+    // write-back must all advance by that stride (not by max_seg_pad).
+    const int seg_offset = input_offset + i * max_seg_iou_compute;
+    /******NMS LOAD START******/
+    int dt_offset = 0;
+    if (sizeof(IN_DT) == sizeof(float)) {
+      __memcpy(score, input_score_ptr + seg_offset, cpy_len * sizeof(IN_DT),
+               load_dir, cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
+      dt_offset = 0;
+    } else if (sizeof(IN_DT) == sizeof(half)) {
+      // stage the half scores in x1, then widen them into score
+      __memcpy(x1, input_score_ptr + seg_offset, cpy_len * sizeof(IN_DT),
+               load_dir, cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
+      __bang_half2float((float *)score, (half *)x1, seg_len);
+      // half data is loaded into the upper part of each row so the in-place
+      // widening below does not overwrite unread input
+      dt_offset = max_seg_iou_compute;
+    }
+#if __BANG_ARCH__ >= 300
+    __memcpy(inter_x1 + dt_offset, input_x1_ptr + seg_offset,
+             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
+             input_num_boxes * sizeof(IN_DT), 3);
+
+    if (sizeof(IN_DT) == sizeof(half)) {
+      __bang_half2float((float *)inter_x1,
+                        (half *)inter_x1 + max_seg_iou_compute, seg_len);
+      __bang_half2float((float *)inter_y1,
+                        (half *)inter_y1 + max_seg_iou_compute, seg_len);
+      __bang_half2float((float *)inter_x2,
+                        (half *)inter_x2 + max_seg_iou_compute, seg_len);
+      __bang_half2float((float *)inter_y2,
+                        (half *)inter_y2 + max_seg_iou_compute, seg_len);
+    }
+    // box transfer: order the corners so that x1 <= x2 and y1 <= y2
+    __bang_minequal((float *)x1, (float *)inter_x1, (float *)inter_x2, seg_len);
+    __bang_maxequal((float *)x2, (float *)inter_x1, (float *)inter_x2, seg_len);
+    __bang_minequal((float *)y1, (float *)inter_y1, (float *)inter_y2, seg_len);
+    __bang_maxequal((float *)y2, (float *)inter_y1, (float *)inter_y2, seg_len);
+    // 1. compute IOU
+    // get the area_I
+    __bang_maxeq_scalar((float *)inter_x1, (float *)x1, max_box_x1,
+                        seg_len);  // inter_x1
+    __bang_mineq_scalar((float *)inter_x2, (float *)x2, max_box_x2,
+                        seg_len);  // inter_x2
+    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
+               seg_len);
+    if (algo == 1 && offset != 0.0) {
+      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
+    }
+    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
+                 seg_len);  // inter_w
+    __bang_maxeq_scalar((float *)inter_y1, (float *)y1, float(max_box_y1),
+                        seg_len);  // inter_y1
+    __bang_mineq_scalar((float *)inter_y2, (float *)y2, float(max_box_y2),
+                        seg_len);  // inter_y2
+    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
+               seg_len);
+    if (algo == 1 && offset != 0.0) {
+      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
+    }
+    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
+                 seg_len);  // inter_h
+    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
+               seg_len);  // area_I
+    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
+    if (algo == 1 && offset != 0.0) {
+      __bang_fusion(FUSION_FSA, (float *)inter_y1, (float *)x2, (float *)x1,
+                    offset, seg_len, seg_len);
+      __bang_fusion(FUSION_FSA, (float *)inter_y2, (float *)y2, (float *)y1,
+                    offset, seg_len, seg_len);
+      __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
+                 seg_len);  // area
+    } else {
+      __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
+      __bang_fusion(FUSION_FSM, (float *)inter_x2, (float *)y2, (float *)y1,
+                    (float *)inter_y1, seg_len, seg_len);
+    }
+    // get the area_U: area + max_area - area_I
+    __bang_fusion(FUSION_FAS, (float *)inter_x2, (float *)inter_x2, max_area,
+                  (float *)inter_x1, seg_len, seg_len);
+    // 2. select the box
+    // if IOU greater than thresh, set the score to zero, abort it: area_U >
+    // area_I * (1 / thresh)?
+    if (thresh_iou > 0.0) {
+      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
+                        seg_len);
+    } else {
+      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
+                        seg_len);
+    }
+    // process for nan: the mask is built as !(area_U < scaled area_I) instead
+    // of a direct >= comparison
+    __bang_lt((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
+    __bang_not((float *)inter_x1, (float *)inter_x1, seg_len);
+    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
+/******NMS COMPUTE END******/
+#else
+    __memcpy(x1 + dt_offset, input_x1_ptr + seg_offset,
+             cpy_len * sizeof(IN_DT), load_dir, max_seg_pad * sizeof(IN_DT),
+             input_num_boxes * sizeof(IN_DT), 3);
+    if (sizeof(IN_DT) == sizeof(half)) {
+      __bang_half2float((float *)x1, (half *)x1 + max_seg_iou_compute, seg_len);
+      __bang_half2float((float *)y1, (half *)y1 + max_seg_iou_compute, seg_len);
+      __bang_half2float((float *)x2, (half *)x2 + max_seg_iou_compute, seg_len);
+      __bang_half2float((float *)y2, (half *)y2 + max_seg_iou_compute, seg_len);
+    }
+    // 1. compute IOU
+    // get the area_I
+    __bang_write_value((float *)inter_y1, seg_len,
+                       float(max_box[1]));  // max_x1
+    __bang_maxequal((float *)inter_x1, (float *)x1, (float *)inter_y1,
+                    seg_len);  // inter_x1
+    __bang_write_value((float *)inter_y2, seg_len,
+                       float(max_box[3]));  // max_x2
+    __bang_minequal((float *)inter_x2, (float *)x2, (float *)inter_y2,
+                    seg_len);  // inter_x2
+    __bang_sub((float *)inter_x1, (float *)inter_x2, (float *)inter_x1,
+               seg_len);
+    if (algo == 1 && offset != 0.0) {
+      __bang_add_scalar((float *)inter_x1, (float *)inter_x1, offset, seg_len);
+    }
+    computeReluN((float *)inter_x1, (float *)inter_x1, NULL,
+                 seg_len);  // inter_w
+    __bang_write_value((float *)inter_x2, seg_len,
+                       float(max_box[2]));  // max_y1
+    __bang_maxequal((float *)inter_y1, (float *)y1, (float *)inter_x2,
+                    seg_len);  // inter_y1
+    __bang_write_value((float *)inter_x2, seg_len,
+                       float(max_box[4]));  // max_y2
+    __bang_minequal((float *)inter_y2, (float *)y2, (float *)inter_x2,
+                    seg_len);  // inter_y2
+    __bang_sub((float *)inter_y1, (float *)inter_y2, (float *)inter_y1,
+               seg_len);
+    if (algo == 1 && offset != 0.0) {
+      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
+    }
+    computeReluN((float *)inter_y1, (float *)inter_y1, NULL,
+                 seg_len);  // inter_h
+    __bang_mul((float *)inter_x1, (float *)inter_x1, (float *)inter_y1,
+               seg_len);  // area_I
+    // get the area of input_box: area = (x2 - x1) * (y2 - y1);
+    __bang_sub((float *)inter_y1, (float *)x2, (float *)x1, seg_len);
+    __bang_sub((float *)inter_y2, (float *)y2, (float *)y1, seg_len);
+    if (algo == 1 && offset != 0.0) {
+      __bang_add_scalar((float *)inter_y1, (float *)inter_y1, offset, seg_len);
+      __bang_add_scalar((float *)inter_y2, (float *)inter_y2, offset, seg_len);
+    }
+    __bang_mul((float *)inter_x2, (float *)inter_y1, (float *)inter_y2,
+               seg_len);  // area
+    // get the area_U: area + max_area - area_I
+    __bang_add_scalar((float *)inter_x2, (float *)inter_x2, float(max_area),
+                      seg_len);
+    __bang_sub((float *)inter_x2, (float *)inter_x2, (float *)inter_x1,
+               seg_len);  // area_U
+    // 2. select the box
+    // if IOU greater than thresh, set the score to zero, abort it: area_U >
+    // area_I * (1 / thresh)?
+    if (thresh_iou > 0.0) {
+      __bang_mul_scalar((float *)inter_x1, (float *)inter_x1, div_thresh_iou,
+                        seg_len);
+    } else {
+      __bang_mul_scalar((float *)inter_x2, (float *)inter_x2, thresh_iou,
+                        seg_len);
+    }
+    __bang_ge((float *)inter_x1, (float *)inter_x2, (float *)inter_x1, seg_len);
+    __bang_mul((float *)score, (float *)score, (float *)inter_x1, seg_len);
+/******NMS COMPUTE END******/
+#endif
+    // update the score
+    if (sizeof(IN_DT) == sizeof(half)) {
+      convertFloat2half((half *)score, (float *)score, seg_len);
+    }
+    pvLock();
+    __memcpy(input_score_ptr + seg_offset, score, cpy_len * sizeof(IN_DT),
+             store_dir, cpy_len * sizeof(IN_DT), cpy_len * sizeof(IN_DT), 0);
+    pvUnlock();
+  }
+}
+
+#endif  // NMS_UTILS_HPP_
--- a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
  int w_seg = position.w_end - position.w_start;
  int size = h_seg * w_seg * shape_full.c;

-  __memcpy(dst,
-           src + position.n_start * n_offset + position.h_start * h_offset +
-               position.w_start * w_offset,
+  __memcpy(dst, src + position.n_start * n_offset +
+                    position.h_start * h_offset + position.w_start * w_offset,
           size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
           n_seg - 1);
 }
@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
  int elem_count =
      CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
                 NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(y_nram, elem_count, (T)0);
+  __bang_write_value(y_nram, elem_count, (T)0);

  int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
  int y_h_offset = shape_seg.w * shape_seg.c;
@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
      CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
  int elem_count =
      CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(y_nram_temp, elem_count, (T)0);
+  __bang_write_value(y_nram_temp, elem_count, (T)0);

  int y_n_offset = align_hw * align_c;
  int y_h_offset = shape_seg.w * align_c;
@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
  int elem_count =
      CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
                 NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(dx_nram, elem_count, (T)0);
+  __bang_write_value(dx_nram, elem_count, (T)0);

  int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
  int dy_h_offset = shape_seg.w * dy_full.c;
@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
  // fill zeros to dx
  T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
  int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
-  __nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0);
+  __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
+                     (T)0);

  int dy_n_offset_seg = align_hw * align_c;
  int dy_h_offset_seg = shape_seg.w * align_c;

--- a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
          __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);

          // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
-          __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel);
-          __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel);
-          __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel);
-          __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel);
+          __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
+          __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
+          __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
+          __bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);

          __bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
      }  // loop_roi_grid_w
    }    // loop_roi_grid_h
    T count_value = (T)(1.0 / count);
-    __bang_mul_const(nram_out, nram_out, count_value, align_channel);
+    __bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
    __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
  }  // loop_cyc_num
 }
@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
    case CNRT_FLOAT16: {
      roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
                         channels, pooled_height, pooled_width, input_height,
-                         input_width, sampling_ratio,
-                         (half)spatial_scale, num_rois);
+                         input_width, sampling_ratio, (half)spatial_scale,
+                         num_rois);
    }; break;
    case CNRT_FLOAT32: {
      roialignForwardAvg((float *)input, (float *)rois, (float *)output,
@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp(
                                      &x_high, &y_low, &y_high);
          if (x_low >= 0 && y_low >= 0) {
            __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp(
              }
              __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
                       GDRAM2NRAM);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_high * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_high * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_high * wo * c + x_high * c + i * deal_once,

--- a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
      }

      if (is_empty) {
-        __nramset((T *)nram_out, c_slice_align, (T)0);
+        __bang_write_value((T *)nram_out, c_slice_align, (T)0);
        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
                 c_slice * t_size, NRAM2GDRAM);
        if (NULL != argmax) {
-          __nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
+          __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
                   (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
        }
@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
          for (int i = 0; i < c_slice; i++) {
            nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
          }
-          __bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1,
-                           c_slice_align);
-          __bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width,
-                           c_slice_align);
+          __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
+                            c_slice_align);
+          __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
+                            c_slice_align);

          /*compute input_w*/
-          __bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim,
-                           c_slice_align);
+          __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
+                            c_slice_align);
          __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
                     c_slice_align);
-          __bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1,
-                           c_slice_align);
+          __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
+                            c_slice_align);
          __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
                     c_slice_align);
          convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
                       rois_num, (float)spatial_scale, (float *)output_data,
                       argmax);
    }; break;
-    default: {
-      break;
-    }
+    default: { break; }
  }
 }
 }  // namespace forward
@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex(
                   align_c);

  // Perform 'temp_result - hstart' operation
-  __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
-                   align_c);
+  __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
+                    align_c);

  // Perform 'temp_result1 - temp_result2 * width' operation
-  __bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
-                   align_c);
+  __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
+                    align_c);
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
  __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
             (float *)nram_argmax_fp_w, align_c);

  // Perform 'temp_result - wstart' operation
-  __bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart,
-                   align_c);
+  __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
+                    wstart, align_c);

  // Perform 'temp_result = h * w_compute + w' operation
-  __bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
-                   w_compute, align_c);
+  __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
+                    w_compute, align_c);
  __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
             (float *)nram_argmax_fp_w, align_c);

  if (loop_flag == 1) {
-    __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
-                     (loop_id * true_limit), align_c);
+    __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
+                      (loop_id * true_limit), align_c);
  }
  convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
                   (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
         */

        // Load the data from GDRAM to NRAM.
-        __memcpy((T *)nram_grads + align_c * high_precision,
-                 (const T *)grads + (n * pooled_height * pooled_width +
-                                     ph * pooled_width + pw) *
-                                        channels,
-                 channels * sizeof(T), GDRAM2NRAM);
+        __memcpy(
+            (T *)nram_grads + align_c * high_precision,
+            (const T *)grads +
+                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
+                    channels,
+            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }

-        __memcpy((int32_t *)nram_argmax,
-                 (const int32_t *)argmax + (n * pooled_height * pooled_width +
-                                            ph * pooled_width + pw) *
-                                               channels,
+        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
+                                             (n * pooled_height * pooled_width +
+                                              ph * pooled_width + pw) *
+                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);

        // Perform pooling operation on NRAM.
@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
         */

        // Load the data from GDRAM to NRAM.
-        __memcpy((T *)nram_grads + align_c * high_precision,
-                 (const T *)grads + (n * pooled_height * pooled_width +
-                                     ph * pooled_width + pw) *
-                                        channels,
-                 channels * sizeof(T), GDRAM2NRAM);
+        __memcpy(
+            (T *)nram_grads + align_c * high_precision,
+            (const T *)grads +
+                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
+                    channels,
+            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }
-        __memcpy((int32_t *)nram_argmax,
-                 (const int32_t *)argmax + (n * pooled_height * pooled_width +
-                                            ph * pooled_width + pw) *
-                                               channels,
+        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
+                                             (n * pooled_height * pooled_width +
+                                              ph * pooled_width + pw) *
+                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);

        int ping_pong = 0;
@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
                       height, width, pooled_height, pooled_width, rois_num,
                       (const float)spatial_scale, high_precision);
    }; break;
-    default: {
-      break;
-    }
+    default: { break; }
  }
 }
 }  // namespace backward

--- a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
    int t_shift = shifts[n_index * group_size + group_id];
    int index = cur_channel_index % channel_size * hw_size +
                n_index * time_size * channel_size * hw_size;
-    __nramset(data_nram, MAX_NRAM_SIZE, (char)0);
+    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
    __asm__ volatile("sync;");
    if (abs(t_shift) >= time_size) {
      __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
    int next_sequence_index =
        index / hw_size / channel_size % time_size + segmentime_size;
    int cur_sequence_index = index / hw_size / channel_size % time_size;
-    __nramset(data_nram, MAX_NRAM_SIZE, (char)0);
+    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
    __asm__ volatile("sync;");
    if (max_number_hw_per_core == 0) {
      mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,