[Refactor] Adapt mlu code to cntoolkit3.0.1

e847cf8a · bdf · Zaida Zhou · 4c6e99c8 · e847cf8a · e847cf8a
Commit e847cf8a, authored Oct 10, 2022 by bdf; committed by Zaida Zhou on Nov 23, 2022
9 changed files
--- a/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/bbox_overlaps_mlu_kernel.mlu
@@ -88,14 +88,14 @@ __mlu_func__ void bboxOverlapsWorkflow(

      // right - left + offset ---> left
      __bang_sub(vec_left, vec_right, vec_left, batches_stride);
-      __bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
+      __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);

      // bottom - top + offset ---> right
      __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
-      __bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
+      __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);

      // zero vector ---> bottom
-      __nramset(vec_bottom, batches_stride, 0.f);
+      __bang_write_value(vec_bottom, batches_stride, 0.f);

      // width --> vec_left
      __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
@@ -107,11 +107,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
      // get the b1_area
      // (b1_x2 - b1_x1 + offset)  --->  vec_top
      __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
-      __bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
+      __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);

      // (b1_y2 - b1_y1 + offset)  --->  vec_bottom
      __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
-      __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
+      __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);

      // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
      // --->  vec_top;
@@ -121,11 +121,11 @@ __mlu_func__ void bboxOverlapsWorkflow(
      // get the b2_area
      // (b2_x2 - b2_x1 + offset)  --->  b2_x1
      __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
-      __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
+      __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);

      // (b2_y2 - b2_y1 + offset)  --->  b2_y1
      __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
-      __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
+      __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);

      // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
      // --->  b2_x1;
@@ -137,7 +137,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
      T *inter_s = height;

      // offset vector ---> vec_b2_y1
-      __nramset(vec_b2_y1, batches_stride, T(offset));
+      __bang_write_value(vec_b2_y1, batches_stride, T(offset));
      T *vec_offset = vec_b2_y1;

      if (mode == 0) {
@@ -164,10 +164,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
      int32_t base1 = b1 * COORD_NUM;

      // set bbox1 and bbox2 to nram
-      __nramset(vec_b1_x1, batches_stride, bbox1[base1]);
-      __nramset(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
-      __nramset(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
-      __nramset(vec_b1_y2, batches_stride, bbox1[base1 + 3]);
+      __bang_write_value(vec_b1_x1, batches_stride, bbox1[base1]);
+      __bang_write_value(vec_b1_y1, batches_stride, bbox1[base1 + 1]);
+      __bang_write_value(vec_b1_x2, batches_stride, bbox1[base1 + 2]);
+      __bang_write_value(vec_b1_y2, batches_stride, bbox1[base1 + 3]);

      for (int32_t j = 0; j < num_loop_cpy; j++) {
        int32_t index2 = j * batches_stride;
@@ -195,13 +195,13 @@ __mlu_func__ void bboxOverlapsWorkflow(

        // right - left + offset ---> left
        __bang_sub(vec_left, vec_right, vec_left, batches_stride);
-        __bang_add_const(vec_left, vec_left, (T)offset, batches_stride);
+        __bang_add_scalar(vec_left, vec_left, (T)offset, batches_stride);
        // bottom - top + offset ---> right
        __bang_sub(vec_right, vec_bottom, vec_top, batches_stride);
-        __bang_add_const(vec_right, vec_right, (T)offset, batches_stride);
+        __bang_add_scalar(vec_right, vec_right, (T)offset, batches_stride);

        // zero vector ---> bottom
-        __nramset(vec_bottom, batches_stride, (T)0);
+        __bang_write_value(vec_bottom, batches_stride, (T)0);

        // width --> vec_left
        __bang_maxequal(vec_left, vec_bottom, vec_left, batches_stride);
@@ -213,10 +213,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
        // get the b1_area
        // (b1_x2 - b1_x1 + offset)  --->  vec_top
        __bang_sub(vec_top, vec_b1_x2, vec_b1_x1, batches_stride);
-        __bang_add_const(vec_top, vec_top, (T)offset, batches_stride);
+        __bang_add_scalar(vec_top, vec_top, (T)offset, batches_stride);
        // (b1_y2 - b1_y1 + offset)  --->  vec_bottom
        __bang_sub(vec_bottom, vec_b1_y2, vec_b1_y1, batches_stride);
-        __bang_add_const(vec_bottom, vec_bottom, (T)offset, batches_stride);
+        __bang_add_scalar(vec_bottom, vec_bottom, (T)offset, batches_stride);
        // b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
        // --->  vec_top;
        __bang_mul(vec_top, vec_top, vec_bottom, batches_stride);
@@ -225,10 +225,10 @@ __mlu_func__ void bboxOverlapsWorkflow(
        // get the b2_area
        // (b2_x2 - b2_x1 + offset)  --->  b2_x1
        __bang_sub(vec_b2_x1, vec_b2_x2, vec_b2_x1, batches_stride);
-        __bang_add_const(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
+        __bang_add_scalar(vec_b2_x1, vec_b2_x1, (T)offset, batches_stride);
        // (b2_y2 - b2_y1 + offset)  --->  b2_y1
        __bang_sub(vec_b2_y1, vec_b2_y2, vec_b2_y1, batches_stride);
-        __bang_add_const(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
+        __bang_add_scalar(vec_b2_y1, vec_b2_y1, (T)offset, batches_stride);
        // b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
        // --->  b2_x1;
        __bang_mul(vec_b2_x1, vec_b2_x1, vec_b2_y1, batches_stride);
@@ -239,7 +239,7 @@ __mlu_func__ void bboxOverlapsWorkflow(
        T *inter_s = height;

        // offset vector ---> vec_b2_y1
-        __nramset(vec_b2_y1, batches_stride, T(offset));
+        __bang_write_value(vec_b2_y1, batches_stride, T(offset));
        T *vec_offset = vec_b2_y1;

        if (mode == 0) {

--- a/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/carafe_mlu_kernel.mlu
@@ -139,7 +139,7 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
    blkEnd.Wo = blkStart.Wo + blkSize.Wo - 1;

    // set output_nram to zero
-    __nramset(output_nram, param.output_nram_size, T(0));
+    __bang_write_value(output_nram, param.output_nram_size, T(0));

    // loop blocks of kernel window: grid_dim.(Kh, Kw)
    for (blkId.Kh = 0; blkId.Kh < grid_dim.Kh; ++blkId.Kh) {
@@ -313,8 +313,8 @@ __mlu_func__ void carafeForwardBLOCK(T *input, T *mask,
                T *sum = sum_array;

                for (int g = 0; g < blkSize.G; ++g) {
-                  __bang_mul_const(sum, src, mask_array[mask_index],
-                                   param.block_Cg_NFU);
+                  __bang_mul_scalar(sum, src, mask_array[mask_index],
+                                    param.block_Cg_NFU);
                  //
                  // NOTE: Since block_Cg_NFU >= block_Cg_stride,
                  // overlapped writing may occur on sum_array.
@@ -446,8 +446,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
          T *base_grad_input = (T *)grad_input + input_index;
          __memcpy((T *)input_buff, (T *)base_input, num_align * sizeof(T),
                   GDRAM2NRAM);
-          __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
-                           ((T *)mask_buff)[mask_index], num_align);
+          __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
+                            ((T *)mask_buff)[mask_index], num_align);
          __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
                            (T *)grad_input_buff, num_align);
          __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
@@ -485,8 +485,8 @@ __mlu_func__ void CarafeCompute(T *input, T *mask, T *grad_output,
          T *base_grad_input = (T *)grad_input + input_index;
          __memcpy((T *)input_buff, (T *)base_input, rem_for_loop * sizeof(T),
                   GDRAM2NRAM);
-          __bang_mul_const((T *)grad_input_buff, (T *)grad_output_buff,
-                           ((T *)mask_buff)[mask_index], rem_for_loop_align);
+          __bang_mul_scalar((T *)grad_input_buff, (T *)grad_output_buff,
+                            ((T *)mask_buff)[mask_index], rem_for_loop_align);
          __bang_atomic_add((T *)grad_input_buff, (T *)base_grad_input,
                            (T *)grad_input_buff, rem_for_loop);
          __bang_mul((T *)input_buff, (T *)grad_output_buff, (T *)input_buff,
@@ -541,12 +541,12 @@ void KernelCarafeBackward(cnrtDim3_t k_dim, cnrtFunctionType_t k_type,
                          const int wi, const int c, const int k_up,
                          const int group, const int scale) {
  if (dtype == CNRT_FLOAT16) {
-    backward::MLUUnion1KernelCarafeBackward<half>
-        <<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
-                                   grad_mask, n, hi, wi, c, k_up, group, scale);
+    backward::MLUUnion1KernelCarafeBackward<half><<<k_dim, k_type, queue>>>(
+        input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
+        group, scale);
  } else {
-    backward::MLUUnion1KernelCarafeBackward<float>
-        <<<k_dim, k_type, queue>>>(input, mask, grad_output, grad_input,
-                                   grad_mask, n, hi, wi, c, k_up, group, scale);
+    backward::MLUUnion1KernelCarafeBackward<float><<<k_dim, k_type, queue>>>(
+        input, mask, grad_output, grad_input, grad_mask, n, hi, wi, c, k_up,
+        group, scale);
  }
 }
--- a/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
+++ b/mmcv/ops/csrc/common/mlu/common_mlu_helper.hpp
@@ -211,51 +211,52 @@ __mlu_func__ void convertInt2Float(float *dst, float *dst_addition, int *src,
  // get sign bit
  const float move_23bit = 8388608.0;
  // 0x80000000 = 1,00000000,00000000000000000000000 (sign,exponent,mantissa)
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)src, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
  // get 1 or 0 from sign bit
  // judg is Odd
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x00000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x00000001);
  __bang_cycle_bor((char *)dst_addition, (char *)dst_addition,
                   (char *)src_addition, src_count * sizeof(float),
                   NFU_ALIGN_SIZE);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000001);
  __bang_cycle_eq(dst_addition, dst_addition, src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  // minus xor, positive num invariant
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xffffffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xffffffff);
  __bang_cycle_mul(dst, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  __bang_bxor((char *)dst, (char *)src, (char *)dst, src_count * sizeof(float));
  // convert int32 to float32
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x7fffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x7fffff);
  __bang_cycle_band((char *)dst, (char *)dst, (char *)src_addition,
                    src_count * sizeof(float), NFU_ALIGN_SIZE);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x4b000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x4b000000);
  __bang_cycle_bor((char *)dst, (char *)dst, (char *)src_addition,
                   src_count * sizeof(float), NFU_ALIGN_SIZE);
-  __bang_sub_const(dst, dst, move_23bit, src_count);
+  __bang_sub_scalar(dst, dst, move_23bit, src_count);
  // add one
  __bang_add(dst, dst, dst_addition, src_count);
  // set sign for float32
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xffffffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xffffffff);
  __bang_cycle_mul(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));

-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x00000001);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x00000001);
  __bang_cycle_add(dst_addition, dst_addition, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));

-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0x80000000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0x80000000);
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * 4, 128);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition, src_count * 4);
@@ -291,18 +292,20 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
  // dst_addition = abs(src)
  __bang_mul(dst_addition, src, (float *)dst, src_count);
  // if dst_addition < 1.0 , then src_addition + 1, to fix add error.
-  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 1.0f);
+  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     1.0f);
  __bang_cycle_lt(dst_addition, dst_addition, (float *)src_addition, src_count,
                  NFU_ALIGN_SIZE / sizeof(float));
  __bang_add_tz((float *)dst, (float *)dst, (float *)dst_addition, src_count);
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            0xbf800000);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     0xbf800000);
  // set negative flag -1.0 = 0xbf800000
  __bang_cycle_eq(
      (float *)dst, (float *)dst, (float *)src_addition, src_count,
      NFU_ALIGN_SIZE / sizeof(float));  //  to mark all src in [x<-1.0]
  __bang_active_abs(dst_addition, src, src_count);
-  __nramset((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float), 8388608.0f);
+  __bang_write_value((float *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     8388608.0f);
  // mask shift move 23
  __bang_cycle_add_tz(
      dst_addition, dst_addition, src_addition, src_count,
@@ -314,12 +317,12 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
  // to fix max value
  // 0 1001 0110 111 1111 1111 1111 1111 1111 <=> 0x4b7fffff <=> 16777215.0,
  // means max value.
-  __bang_mul_const((float *)dst, (float *)dst, 16777215.0, src_count);
+  __bang_mul_scalar((float *)dst, (float *)dst, 16777215.0, src_count);
  __bang_bxor((char *)dst_addition, (char *)dst_addition, (char *)dst,
              src_count * floatDchar);
  // get low 23bit
-  __nramset((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
-            (unsigned)0x007fffff);
+  __bang_write_value((unsigned *)src_addition, NFU_ALIGN_SIZE / sizeof(float),
+                     (unsigned)0x007fffff);
  // mask low 23bit is 1
  __bang_cycle_band((char *)dst_addition, (char *)dst_addition,
                    (char *)src_addition, src_count * floatDchar,
@@ -327,16 +330,36 @@ __mlu_func__ void convertFloat2Int(int *dst, float *dst_addition, float *src,
  // set 9 high bit ===> dst
  // -2.0 <=> 0xc0000000 <=> 1100 0000 0000 0000 0000 0000 0000 0000
  //  1.0 <=> 0x3f800000 <=> 0011 1111 1000 0000 0000 0000 0000 0000
-  __nramset(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
+  __bang_write_value(src_addition, NFU_ALIGN_SIZE / sizeof(float), 0x3f800000);
  __bang_cycle_and((float *)dst, (float *)dst, src_addition, src_count,
                   NFU_ALIGN_SIZE / sizeof(float));
  // src or dst_addition
  __bang_bor((char *)dst_addition, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
-  __bang_mul_const((float *)dst, (float *)dst, -2.0, src_count);
+  __bang_mul_scalar((float *)dst, (float *)dst, -2.0, src_count);
  __bang_bor((char *)dst, (char *)dst, (char *)dst_addition,
             src_count * floatDchar);
 #endif  // __BANG_ARCH__ >= 300
 }

+/*!
+ * @brief Converts float32 data to half. Uses __bang_float2half_rn when
+ * __BANG_ARCH__ >= 300 (MLU300) and __bang_float2half_rd otherwise (MLU200).
+ *
+ * @param[out] dst
+ *   Pointer to NRAM that stores the converted half type data.
+ * @param[in] src
+ *   Pointer to NRAM that stores the float32 type data to convert.
+ * @param[in] src_count
+ *   The count of elements in src.
+ */
+__mlu_func__ inline void convertFloat2half(half *dst, float *src,
+                                           int src_count) {
+#if __BANG_ARCH__ >= 300
+  __bang_float2half_rn(dst, src, src_count);
+#else
+  __bang_float2half_rd(dst, src, src_count);
+#endif  // __BANG_ARCH__ >= 300
+}
+
 #endif  // COMMON_MLU_HELPER_HPP_
--- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu
--- a/mmcv/ops/csrc/common/mlu/nms_utils.hpp
+++ b/mmcv/ops/csrc/common/mlu/nms_utils.hpp
--- a/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/psamask_mlu_kernel.mlu
@@ -53,9 +53,8 @@ __mlu_func__ void loadDataFromDramToNram(T *dst, const T *src,
  int w_seg = position.w_end - position.w_start;
  int size = h_seg * w_seg * shape_full.c;

-  __memcpy(dst,
-           src + position.n_start * n_offset + position.h_start * h_offset +
-               position.w_start * w_offset,
+  __memcpy(dst, src + position.n_start * n_offset +
+                    position.h_start * h_offset + position.w_start * w_offset,
           size * sizeof(T), GDRAM2NRAM, size * sizeof(T), n_offset * sizeof(T),
           n_seg - 1);
 }
@@ -89,7 +88,7 @@ __mlu_func__ void psamaskCollectForward(
  int elem_count =
      CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * y_full.c,
                 NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(y_nram, elem_count, (T)0);
+  __bang_write_value(y_nram, elem_count, (T)0);

  int y_n_offset = shape_seg.h * shape_seg.w * shape_seg.c;
  int y_h_offset = shape_seg.w * shape_seg.c;
@@ -155,7 +154,7 @@ __mlu_func__ void psamaskDistributeForward(
      CEIL_ALIGN(shape_seg.h * shape_seg.w, COMPUTE_COUNT_ALIGN / sizeof(T));
  int elem_count =
      CEIL_ALIGN(shape_seg.n * align_c * align_hw, NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(y_nram_temp, elem_count, (T)0);
+  __bang_write_value(y_nram_temp, elem_count, (T)0);

  int y_n_offset = align_hw * align_c;
  int y_h_offset = shape_seg.w * align_c;
@@ -242,7 +241,7 @@ __mlu_func__ void psamaskCollectBackward(
  int elem_count =
      CEIL_ALIGN(shape_seg.n * shape_seg.h * shape_seg.w * shape_seg.c,
                 NFU_ALIGN_SIZE / sizeof(T));
-  __nramset(dx_nram, elem_count, (T)0);
+  __bang_write_value(dx_nram, elem_count, (T)0);

  int dy_n_offset = shape_seg.h * shape_seg.w * dy_full.c;
  int dy_h_offset = shape_seg.w * dy_full.c;
@@ -331,7 +330,8 @@ __mlu_func__ void psamaskDistributeBackward(
  // fill zeros to dx
  T *dx_nram = dy_nram + shape_seg.n * align_hw * align_c;
  int dx_size = shape_seg.n * shape_seg.h * shape_seg.w * dx_full.c;
-  __nramset(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)), (T)0);
+  __bang_write_value(dx_nram, CEIL_ALIGN(dx_size, NFU_ALIGN_SIZE / sizeof(T)),
+                     (T)0);

  int dy_n_offset_seg = align_hw * align_c;
  int dy_h_offset_seg = shape_seg.w * align_c;

--- a/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/roi_align_mlu_kernel.mlu
@@ -130,10 +130,10 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
          __memcpy(tmp_cyc4, input4, real_size, GDRAM2NRAM);

          // interpolation value = w1 * p1 + w2 * p2 + w3 * p3 + w4 * p4
-          __bang_mul_const(tmp_cyc1, tmp_cyc1, w1, align_channel);
-          __bang_mul_const(tmp_cyc2, tmp_cyc2, w2, align_channel);
-          __bang_mul_const(tmp_cyc3, tmp_cyc3, w3, align_channel);
-          __bang_mul_const(tmp_cyc4, tmp_cyc4, w4, align_channel);
+          __bang_mul_scalar(tmp_cyc1, tmp_cyc1, w1, align_channel);
+          __bang_mul_scalar(tmp_cyc2, tmp_cyc2, w2, align_channel);
+          __bang_mul_scalar(tmp_cyc3, tmp_cyc3, w3, align_channel);
+          __bang_mul_scalar(tmp_cyc4, tmp_cyc4, w4, align_channel);

          __bang_add(nram_in, tmp_cyc1, nram_in, align_channel);
          __bang_add(nram_in, tmp_cyc2, nram_in, align_channel);
@@ -146,7 +146,7 @@ __mlu_func__ void computeChannel(T *input_core, T *nram_in, T *output_core,
      }  // loop_roi_grid_w
    }    // loop_roi_grid_h
    T count_value = (T)(1.0 / count);
-    __bang_mul_const(nram_out, nram_out, count_value, align_channel);
+    __bang_mul_scalar(nram_out, nram_out, count_value, align_channel);
    __memcpy(output_core + i * cyc_channel, nram_out, real_size, NRAM2GDRAM);
  }  // loop_cyc_num
 }
@@ -242,8 +242,8 @@ __mlu_global__ void MLUUnion1KernelRoiAlignAvg(
    case CNRT_FLOAT16: {
      roialignForwardAvg((half *)input, (half *)rois, (half *)output, aligned,
                         channels, pooled_height, pooled_width, input_height,
-                         input_width, sampling_ratio,
-                         (half)spatial_scale, num_rois);
+                         input_width, sampling_ratio, (half)spatial_scale,
+                         num_rois);
    }; break;
    case CNRT_FLOAT32: {
      roialignForwardAvg((float *)input, (float *)rois, (float *)output,
@@ -346,31 +346,31 @@ __mlu_func__ void unionRoiAlignBp(
                                      &x_high, &y_low, &y_high);
          if (x_low >= 0 && y_low >= 0) {
            __memcpy(buffer, grads_, c * sizeof(T), GDRAM2NRAM);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w1,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w1,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w2,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w2,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_low * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w3,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w3,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_low * c,
                              (T *)buffer + c_align, c);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer, (T)w4,
-                             c_align);
-            __bang_mul_const((T *)buffer + c_align, (T *)buffer + c_align,
-                             1 / count, c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer, (T)w4,
+                              c_align);
+            __bang_mul_scalar((T *)buffer + c_align, (T *)buffer + c_align,
+                              1 / count, c_align);
            __bang_atomic_add((T *)buffer + c_align,
                              image_offset + y_high * wo * c + x_high * c,
                              (T *)buffer + c_align, c);
@@ -401,34 +401,34 @@ __mlu_func__ void unionRoiAlignBp(
              }
              __memcpy(buffer, grads_ + i * deal_once, deal_c * sizeof(T),
                       GDRAM2NRAM);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w1,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w1,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w2,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w2,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_low * wo * c + x_high * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w3,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w3,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_high * wo * c + x_low * c + i * deal_once,
                  (T *)buffer + align_c, deal_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer, (T)w4,
-                               align_c);
-              __bang_mul_const((T *)buffer + align_c, (T *)buffer + align_c,
-                               1 / count, align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer, (T)w4,
+                                align_c);
+              __bang_mul_scalar((T *)buffer + align_c, (T *)buffer + align_c,
+                                1 / count, align_c);
              __bang_atomic_add(
                  (T *)buffer + align_c,
                  image_offset + y_high * wo * c + x_high * c + i * deal_once,

--- a/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/roi_pool_mlu_kernel.mlu
@@ -204,11 +204,11 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
      }

      if (is_empty) {
-        __nramset((T *)nram_out, c_slice_align, (T)0);
+        __bang_write_value((T *)nram_out, c_slice_align, (T)0);
        __memcpy((T *)output_base + dst_offset + c_offset, (T *)nram_out,
                 c_slice * t_size, NRAM2GDRAM);
        if (NULL != argmax) {
-          __nramset((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
+          __bang_write_value((int32_t *)nram_out, c_slice_align, (int32_t)(-1));
          __memcpy((int32_t *)argmax_base + dst_offset + c_offset,
                   (int32_t *)nram_out, c_slice * sizeof(int32_t), NRAM2GDRAM);
        }
@@ -238,18 +238,18 @@ __mlu_func__ void MLUUnion1Roipool(T *input_v, T *rois_v, int batch,
          for (int i = 0; i < c_slice; i++) {
            nram_out[i] = (float)(((uint32_t *)nram_out)[i] / bin_wdim);
          }
-          __bang_add_const((float *)nram_a, (float *)nram_out, (float)bin_y1,
-                           c_slice_align);
-          __bang_mul_const((float *)nram_ping, (float *)nram_a, (float)width,
-                           c_slice_align);
+          __bang_add_scalar((float *)nram_a, (float *)nram_out, (float)bin_y1,
+                            c_slice_align);
+          __bang_mul_scalar((float *)nram_ping, (float *)nram_a, (float)width,
+                            c_slice_align);

          /*compute input_w*/
-          __bang_mul_const((float *)nram_a, (float *)nram_out, (float)bin_wdim,
-                           c_slice_align);
+          __bang_mul_scalar((float *)nram_a, (float *)nram_out, (float)bin_wdim,
+                            c_slice_align);
          __bang_sub((float *)nram_a, (float *)nram_argmax, (float *)nram_a,
                     c_slice_align);
-          __bang_add_const((float *)nram_a, (float *)nram_a, (float)bin_x1,
-                           c_slice_align);
+          __bang_add_scalar((float *)nram_a, (float *)nram_a, (float)bin_x1,
+                            c_slice_align);
          __bang_add((float *)nram_out, (float *)nram_ping, (float *)nram_a,
                     c_slice_align);
          convertFloat2Int((int32_t *)nram_argmax, (float *)nram_a,
@@ -290,9 +290,7 @@ __mlu_global__ void MLUKernelRoiPool(cnrtDataType_t data_type,
                       rois_num, (float)spatial_scale, (float *)output_data,
                       argmax);
    }; break;
-    default: {
-      break;
-    }
+    default: { break; }
  }
 }
 }  // namespace forward
@@ -328,30 +326,30 @@ __mlu_func__ void convertIndex(
                   align_c);

  // Perform 'temp_result - hstart' operation
-  __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
-                   align_c);
+  __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp, hstart,
+                    align_c);

  // Perform 'temp_result1 - temp_result2 * width' operation
-  __bang_mul_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
-                   align_c);
+  __bang_mul_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp, width,
+                    align_c);
  convertInt2Float((float *)nram_argmax_fp, (float *)nram_argmax_fp_bk1,
                   (int *)nram_argmax, (float *)nram_argmax_fp_bk2, align_c);
  __bang_sub((float *)nram_argmax_fp_w, (float *)nram_argmax_fp,
             (float *)nram_argmax_fp_w, align_c);

  // Perform 'temp_result - wstart' operation
-  __bang_sub_const((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w, wstart,
-                   align_c);
+  __bang_sub_scalar((float *)nram_argmax_fp_w, (float *)nram_argmax_fp_w,
+                    wstart, align_c);

  // Perform 'temp_result = h * w_compute + w' operation
-  __bang_mul_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
-                   w_compute, align_c);
+  __bang_mul_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
+                    w_compute, align_c);
  __bang_add((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
             (float *)nram_argmax_fp_w, align_c);

  if (loop_flag == 1) {
-    __bang_sub_const((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
-                     (loop_id * true_limit), align_c);
+    __bang_sub_scalar((float *)nram_argmax_fp_h, (float *)nram_argmax_fp_h,
+                      (loop_id * true_limit), align_c);
  }
  convertFloat2Int((int *)nram_argmax_int, (float *)nram_argmax_fp_bk1,
                   (float *)nram_argmax_fp_h, (float *)nram_argmax_fp_bk2,
@@ -460,21 +458,22 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
         */

        // Load the data from GDRAM to NRAM.
-        __memcpy((T *)nram_grads + align_c * high_precision,
-                 (const T *)grads + (n * pooled_height * pooled_width +
-                                     ph * pooled_width + pw) *
-                                        channels,
-                 channels * sizeof(T), GDRAM2NRAM);
+        __memcpy(
+            (T *)nram_grads + align_c * high_precision,
+            (const T *)grads +
+                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
+                    channels,
+            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }

-        __memcpy((int32_t *)nram_argmax,
-                 (const int32_t *)argmax + (n * pooled_height * pooled_width +
-                                            ph * pooled_width + pw) *
-                                               channels,
+        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
+                                             (n * pooled_height * pooled_width +
+                                              ph * pooled_width + pw) *
+                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);

        // Perform pooling operation on NRAM.
@@ -523,20 +522,21 @@ __mlu_func__ void MLUUnion1Roipool(const T *rois, const T *grads,
         */

        // Load the data from GDRAM to NRAM.
-        __memcpy((T *)nram_grads + align_c * high_precision,
-                 (const T *)grads + (n * pooled_height * pooled_width +
-                                     ph * pooled_width + pw) *
-                                        channels,
-                 channels * sizeof(T), GDRAM2NRAM);
+        __memcpy(
+            (T *)nram_grads + align_c * high_precision,
+            (const T *)grads +
+                (n * pooled_height * pooled_width + ph * pooled_width + pw) *
+                    channels,
+            channels * sizeof(T), GDRAM2NRAM);
        if (high_precision) {
          __bang_half2float((float *)nram_grads,
                            (half *)nram_grads + align_c * high_precision,
                            align_c);
        }
-        __memcpy((int32_t *)nram_argmax,
-                 (const int32_t *)argmax + (n * pooled_height * pooled_width +
-                                            ph * pooled_width + pw) *
-                                               channels,
+        __memcpy((int32_t *)nram_argmax, (const int32_t *)argmax +
+                                             (n * pooled_height * pooled_width +
+                                              ph * pooled_width + pw) *
+                                                 channels,
                 channels * sizeof(int32_t), GDRAM2NRAM);

        int ping_pong = 0;
@@ -713,9 +713,7 @@ __mlu_global__ void MLUKernelRoiPoolBackward(
                       height, width, pooled_height, pooled_width, rois_num,
                       (const float)spatial_scale, high_precision);
    }; break;
-    default: {
-      break;
-    }
+    default: { break; }
  }
 }
 }  // namespace backward

--- a/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
+++ b/mmcv/ops/csrc/common/mlu/tin_shift_mlu_kernel.mlu
@@ -26,7 +26,7 @@ __mlu_func__ void mluMultiKernelTinShift(
    int t_shift = shifts[n_index * group_size + group_id];
    int index = cur_channel_index % channel_size * hw_size +
                n_index * time_size * channel_size * hw_size;
-    __nramset(data_nram, MAX_NRAM_SIZE, (char)0);
+    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
    __asm__ volatile("sync;");
    if (abs(t_shift) >= time_size) {
      __memcpy(output + index, data_nram, hw_size * sizeof(T), NRAM2GDRAM,
@@ -109,7 +109,7 @@ __mlu_func__ void mluMultiKernelTinShiftSplitSequence(
    int next_sequence_index =
        index / hw_size / channel_size % time_size + segmentime_size;
    int cur_sequence_index = index / hw_size / channel_size % time_size;
-    __nramset(data_nram, MAX_NRAM_SIZE, (char)0);
+    __bang_write_value(data_nram, MAX_NRAM_SIZE, (char)0);
    __asm__ volatile("sync;");
    if (max_number_hw_per_core == 0) {
      mluHwSplit(input, t_shift, time_size, hw_size, channel_size, index,